# Prepare dataset (JSON files)

- Prepare AbstRCT datasets for llama factory.

- Argument Relation Identification and Classification (ARIC) task.

- We create the data files: `abstRCT_aric_train_neo.json`, `abstRCT_aric_test_neo.json`, `abstRCT_aric_test_gla.json`, `abstRCT_aric_test_mix.json`

## Libraries

In [1]:
import os
import re
import ast
import json
import random
import pickle
import pandas as pd

### Helper Functions

In [2]:
def get_triplet(x):
    """Return the relation triplet of one row as ``[adu_pos, parent_pos, afu]``.

    ``x`` is any object exposing the attributes ``adu_pos``, ``parent_pos``
    and ``afu`` (in this file: a DataFrame row handed over by
    ``apply(axis=1)``).
    """
    adu, parent, label = x.adu_pos, x.parent_pos, x.afu
    return [adu, parent, label]

In [3]:
def process_aty(x):
    """Split the ``aty`` string of ``x`` into a list of labels.

    The split is on single space characters only, so two consecutive spaces
    produce an empty-string entry and an empty ``aty`` yields ``['']``.
    """
    return x.aty.split(" ")

In [4]:
def filter_nones(x):
    """Return the triplets of ``x.relation_type`` whose label is not ``'none'``.

    Each triplet is indexed positionally; element 2 is the relation label.
    The input list is not modified and the original order is kept.
    """
    kept = []
    for triplet in x.relation_type:
        if triplet[2] != 'none':
            kept.append(triplet)
    return kept

In [5]:
def insert_ac_tags(x):
    """Wrap every argument component of an abstract in numbered ``<ACn>`` tags.

    ``x`` must expose three attributes:

    * ``abstract_text`` -- the full abstract as one string,
    * ``acs_list``      -- the text segments of the abstract, in reading order,
    * ``ac_types``      -- one type label per segment (zipped with ``acs_list``).

    A segment is tagged as ``<ACn> segment </ACn>`` when it is non-empty and its
    type is not ``'none'``. ``n`` starts at 1 and only counts tagged segments.

    Segments are located left to right with a moving cursor, so each one is
    matched exactly once at its own position. A blanket ``str.replace`` would
    instead tag every occurrence of the segment text: a component whose text
    also appears inside another segment would be tagged twice, and the other
    segment would then no longer match at all.

    A segment that cannot be found at or after the cursor is left untagged,
    but it still consumes its number so later components keep their numbering.

    Returns the abstract text with the tags inserted.
    """
    abstract_text = x.abstract_text

    pieces = []
    cursor = 0   # everything before this offset has already been emitted
    counter = 1  # number assigned to the next tagged component

    for ac, ac_type in zip(x.acs_list, x.ac_types):
        if ac == '':
            continue

        is_component = ac_type != 'none'
        start = abstract_text.find(ac, cursor)

        if start != -1:
            # Copy any text between the previous segment and this one verbatim.
            pieces.append(abstract_text[cursor:start])
            if is_component:
                pieces.append(f"<AC{counter}> {ac} </AC{counter}>")
            else:
                # 'none' segments are emitted untouched; consuming them keeps
                # later searches from matching inside them.
                pieces.append(ac)
            cursor = start + len(ac)

        if is_component:
            counter += 1

    pieces.append(abstract_text[cursor:])
    return "".join(pieces)

In [6]:
def get_dataframe(dataset):
    """Load one split CSV and collapse it to one row per ``doc_id``.

    ``dataset`` is a path relative to the module-level ``data_dir``.
    The CSV is expected to provide the columns ``doc_id``, ``text``, ``aty``,
    ``adu_pos``, ``parent_pos`` and ``afu``.

    The returned frame has the columns ``doc_id``, ``abstract_text``
    (concatenated segment texts), ``ac_types`` (list of type labels),
    ``acs_list`` (list of segment texts), ``relation_type`` (all triplets),
    ``tagged_abstract_text`` (abstract with ``<ACn>`` tags) and
    ``relations_list`` (triplets whose label is not ``'none'``).
    """
    source_csv = os.path.join(data_dir, dataset)
    rows = pd.read_csv(source_csv)
    rows['relation_type'] = rows.apply(get_triplet, axis=1)

    # Duplicates are detected on `text` across the whole file, not per document.
    # NOTE(review): confirm that dropping a repeated sentence from a later
    # document is intended.
    rows = rows.drop_duplicates(subset=["text"], keep="first")

    # Missing values become '' so the string joins below do not hit NaN.
    filled = rows.fillna('')

    texts = (
        filled.groupby(["doc_id"])
        .agg({"text": "".join, "aty": " ".join})
        .reset_index()
    )
    texts["ac_types"] = texts.apply(process_aty, axis=1)
    texts = texts.rename(columns={'text': 'abstract_text'}).drop(columns=['aty'])

    components = (
        filled.groupby('doc_id')['text']
        .agg(list)
        .reset_index()
        .rename(columns={'text': 'acs_list'})
    )
    relations = filled.groupby('doc_id')['relation_type'].agg(list).reset_index()

    merged = texts.merge(components, on='doc_id').merge(relations, on='doc_id')
    merged["tagged_abstract_text"] = merged.apply(insert_ac_tags, axis=1)
    merged['relations_list'] = merged.apply(filter_nones, axis=1)

    return merged

## Load Data

In [7]:
# Root folder of the input CSVs, resolved relative to the current working
# directory; read by get_dataframe().
data_dir = os.path.join(os.getcwd(), "../data")

In [37]:
# Training split: one row per abstract from neo/train.csv.
neo_train_df = get_dataframe("neo/train.csv")

In [38]:
# Test split: one row per abstract from neo/test.csv.
neo_test_df = get_dataframe("neo/test.csv")

In [39]:
# Test split: one row per abstract from gla/test.csv.
gla_test_df = get_dataframe("gla/test.csv")

In [40]:
# Test split: one row per abstract from mix/test.csv.
mix_test_df = get_dataframe("mix/test.csv")

In [48]:
neo_test_df.iloc[7]

doc_id                                                           10403690
abstract_text            In a prospective randomized study, 287 patien...
ac_types                [none, Premise, Premise, Premise, Premise, Pre...
acs_list                [ In a prospective randomized study, 287 patie...
relation_type           [[0, 0, none], [1, 0, none], [2, 6, support], ...
tagged_abstract_text     In a prospective randomized study, 287 patien...
relations_list          [[2, 6, support], [3, 6, support], [4, 6, supp...
Name: 7, dtype: object

In [49]:
neo_test_df.iloc[7]['relations_list']

[[2, 6, 'support'], [3, 6, 'support'], [4, 6, 'support'], [5, 6, 'support']]

In [42]:
# Persist the four per-abstract frames as pickles under ../datasets
# (paths are relative to the current working directory).
for frame, pickle_path in (
    (neo_train_df, "../datasets/neo_train_df.pkl"),
    (neo_test_df, "../datasets/neo_test_df.pkl"),
    (gla_test_df, "../datasets/gla_test_df.pkl"),
    (mix_test_df, "../datasets/mix_test_df.pkl"),
):
    frame.to_pickle(pickle_path)

## Prepare prompt

In [13]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle the three prompt fields into one record dict.

    Every value is rendered to a string, so non-string arguments come back as
    their formatted text. Returns a dict with the keys ``"instruction"``,
    ``"input"`` and ``"output"``, in that order.
    """
    fields = (
        ("instruction", instruction),
        ("input", input),
        ("output", output),
    )
    return {key: "{}".format(value) for key, value in fields}

In [14]:
def write_instruction():
    """Return the fixed task instruction used for every record.

    The text describes the relation identification/classification task and the
    JSON answer format; it ends with a single newline.
    """
    return (
        '### You are an expert in Argument Mining. '
        'You are given a biomedical abstract text which contains numbered argument components enclosed by <AC></AC> tags. '
        'Your task is to identify argument relations between argument components in the abstract text and classify their relation type as either "support" or "attack". '
        'You must return a list of triplets in the following JSON format: '
        '{"list_argument_relation_types": [[source AC (int), target AC (int), relation_type (str)], ..., [source AC (int), target AC (int), relation_type (str)]]} '
        'where each element "relation_type (str)" is replaced by either "support" or "attack".\n'
    )

In [15]:
write_instruction()

'### You are an expert in Argument Mining. You are given a biomedical abstract text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to identify argument relations between argument components in the abstract text and classify their relation type as either "support" or "attack". You must return a list of triplets in the following JSON format: {"list_argument_relation_types": [[source AC (int), target AC (int), relation_type (str)], ..., [source AC (int), target AC (int), relation_type (str)]]} where each element "relation_type (str)" is replaced by either "support" or "attack".\n'

In [16]:
def build_input(abstract_text):
    """Return the prompt input: a fixed lead-in followed by ``abstract_text``."""
    return "### Here is the abstract text: " + "{}".format(abstract_text)

In [17]:
def build_answer(relation_types):
    """Serialize the relation triplets as the expected JSON answer string.

    The result is a JSON object with the single key
    ``"list_argument_relation_types"`` mapping to ``relation_types``.
    """
    payload = {"list_argument_relation_types": relation_types}
    return json.dumps(payload)

## Prepare data files

### train file (neo)

In [18]:
# Build one {"instruction", "input", "output"} record per training abstract.
# The two columns are iterated directly instead of feeding iterrows() index
# labels into .iloc, which is only correct while the index is exactly 0..n-1.
instruction = write_instruction()  # identical for every record, so build it once

data_file_train = [
    formatting_fct(instruction, build_input(tagged_text), build_answer(relations))
    for tagged_text, relations in zip(
        neo_train_df["tagged_abstract_text"], neo_train_df["relations_list"]
    )
]

In [19]:
len(data_file_train)

350

In [20]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a biomedical abstract text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to identify argument relations between argument components in the abstract text and classify their relation type as either "support" or "attack". You must return a list of triplets in the following JSON format: {"list_argument_relation_types": [[source AC (int), target AC (int), relation_type (str)], ..., [source AC (int), target AC (int), relation_type (str)]]} where each element "relation_type (str)" is replaced by either "support" or "attack".\n', 'input': '### Here is the abstract text:  Single-agent therapy with bicalutamide, a nonsteroidal antiandrogen, was compared with castration, either surgical or medical, in patients with untreated Stage D2 prostate cancer. In an open, randomized, multicenter trial, patients were randomized to treatment with 50 mg bicalutamide (n = 243) once daily or to castra

### test file (neo)

In [21]:
# Build one {"instruction", "input", "output"} record per neo test abstract.
# The two columns are iterated directly instead of feeding iterrows() index
# labels into .iloc, which is only correct while the index is exactly 0..n-1.
instruction = write_instruction()  # identical for every record, so build it once

data_file_test_neo = [
    formatting_fct(instruction, build_input(tagged_text), build_answer(relations))
    for tagged_text, relations in zip(
        neo_test_df["tagged_abstract_text"], neo_test_df["relations_list"]
    )
]

In [22]:
len(data_file_test_neo)

100

In [23]:
for i in range(3):
    
    print(data_file_test_neo[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a biomedical abstract text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to identify argument relations between argument components in the abstract text and classify their relation type as either "support" or "attack". You must return a list of triplets in the following JSON format: {"list_argument_relation_types": [[source AC (int), target AC (int), relation_type (str)], ..., [source AC (int), target AC (int), relation_type (str)]]} where each element "relation_type (str)" is replaced by either "support" or "attack".\n', 'input': '### Here is the abstract text:  To investigate the effects of medroxyprogesterone acetate (MPA) on appetite, weight, and quality of life (QL) in patients with advanced-stage, incurable, non-hormone-sensitive cancer. Two hundred six eligible patients were randomized between double-blind MPA 500 mg twice daily or placebo. Appetite (0 to 10 numerical 

In [24]:
# Error!
# Error corrected !

### test file (gla)

In [25]:
# Build one {"instruction", "input", "output"} record per gla test abstract.
# The two columns are iterated directly instead of feeding iterrows() index
# labels into .iloc, which is only correct while the index is exactly 0..n-1.
instruction = write_instruction()  # identical for every record, so build it once

data_file_test_gla = [
    formatting_fct(instruction, build_input(tagged_text), build_answer(relations))
    for tagged_text, relations in zip(
        gla_test_df["tagged_abstract_text"], gla_test_df["relations_list"]
    )
]

In [26]:
len(data_file_test_gla)

100

In [27]:
for i in range(3):
    
    print(data_file_test_gla[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a biomedical abstract text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to identify argument relations between argument components in the abstract text and classify their relation type as either "support" or "attack". You must return a list of triplets in the following JSON format: {"list_argument_relation_types": [[source AC (int), target AC (int), relation_type (str)], ..., [source AC (int), target AC (int), relation_type (str)]]} where each element "relation_type (str)" is replaced by either "support" or "attack".\n', 'input': '### Here is the abstract text: \n\nIn a randomized clinical trial, the authors compared the use of postoperative subconjunctival injections of 5-fluorouracil (5-FU) in 19 eyes with a single intraoperative application of subconjunctival mitomycin (MMC) at the filtering site in 20 eyes at high risk for failure of glaucoma filtering surgery.<AC1> Six 

In [28]:
# Error!
# Error corrected!

### test file (mix)

In [29]:
# Build one {"instruction", "input", "output"} record per mix test abstract.
# The two columns are iterated directly instead of feeding iterrows() index
# labels into .iloc, which is only correct while the index is exactly 0..n-1.
instruction = write_instruction()  # identical for every record, so build it once

data_file_test_mix = [
    formatting_fct(instruction, build_input(tagged_text), build_answer(relations))
    for tagged_text, relations in zip(
        mix_test_df["tagged_abstract_text"], mix_test_df["relations_list"]
    )
]

In [30]:
len(data_file_test_mix)

100

In [31]:
for i in range(3):
    
    print(data_file_test_mix[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a biomedical abstract text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to identify argument relations between argument components in the abstract text and classify their relation type as either "support" or "attack". You must return a list of triplets in the following JSON format: {"list_argument_relation_types": [[source AC (int), target AC (int), relation_type (str)], ..., [source AC (int), target AC (int), relation_type (str)]]} where each element "relation_type (str)" is replaced by either "support" or "attack".\n', 'input': '### Here is the abstract text:  To evaluate the efficacy and safety of a slow-release formulation of cytarabine (DepoCyt; Chiron Corp, Emeryville, CA, and Skye Pharma, Inc, San Diego, CA) that maintains cytotoxic concentrations of cytarabine (ara-C) in the CSF of most patients for more than 14 days. Twenty-eight patients with lymphoma and a positiv

In [32]:
# Error!
# Error corrected!

## Save `json` files

In [33]:
# Write all training records to ../datasets (relative to the current working
# directory) as a single JSON array.
file_path = os.path.join(os.getcwd(), "../datasets/abstRCT_aric_train_neo.json")

with open(file_path, 'w') as file:
    file.write(json.dumps(data_file_train))

In [34]:
# Write all neo test records to ../datasets (relative to the current working
# directory) as a single JSON array.
file_path = os.path.join(os.getcwd(), "../datasets/abstRCT_aric_test_neo.json")

with open(file_path, 'w') as file:
    file.write(json.dumps(data_file_test_neo))

In [35]:
# Write all gla test records to ../datasets (relative to the current working
# directory) as a single JSON array.
file_path = os.path.join(os.getcwd(), "../datasets/abstRCT_aric_test_gla.json")

with open(file_path, 'w') as file:
    file.write(json.dumps(data_file_test_gla))

In [36]:
# Write all mix test records to ../datasets (relative to the current working
# directory) as a single JSON array.
file_path = os.path.join(os.getcwd(), "../datasets/abstRCT_aric_test_mix.json")

with open(file_path, 'w') as file:
    file.write(json.dumps(data_file_test_mix))