# MEGA DATASET

In [1]:
import os
import ast
import json
import random
import pickle
import datasets
import pandas as pd

from datasets import load_dataset

## AbstRCT Dataset

In [2]:
def process_aty(x):
    """Split a row's space-joined ``aty`` string into a list of type labels.

    ``x`` is any object exposing an ``aty`` string attribute (a DataFrame row
    when used through ``DataFrame.apply(..., axis=1)``). The split is on single
    spaces, so consecutive spaces produce empty-string entries.
    """
    labels = x.aty.split(" ")
    return labels

In [3]:
def insert_ac_tags(x):
    """Wrap every labelled argument component of an abstract in numbered tags.

    Parameters
    ----------
    x : row-like
        Object exposing ``abstract_text`` (str), ``acs_list`` (the text
        segments of the abstract) and ``ac_types`` (one label per segment,
        parallel to ``acs_list``).

    Returns
    -------
    str
        ``abstract_text`` where each non-empty segment whose label is not
        ``'none'`` is replaced by ``<ACk> segment </ACk>``; ``k`` counts the
        labelled segments starting at 1.

    Notes
    -----
    Segments are located one after another, left to right, and only the
    matched occurrence is tagged. The previous ``str.replace`` implementation
    rewrote *every* occurrence, so a repeated segment (or a segment that is a
    substring of another one) ended up with nested or duplicated tags.
    This assumes ``acs_list`` is in document order, which holds when
    ``abstract_text`` is the concatenation of ``acs_list``.
    """
    abstract_text = x.abstract_text

    pieces = []
    emitted_upto = 0  # end of the text already copied into ``pieces``
    search_from = 0   # where the next segment is expected to start
    counter = 1

    for ac, ac_type in zip(x.acs_list, x.ac_types):
        if ac == '':
            continue

        taggable = ac_type != 'none'
        start = abstract_text.find(ac, search_from)

        if start != -1:
            end = start + len(ac)
            if taggable:
                pieces.append(abstract_text[emitted_upto:start])
                pieces.append(f"<AC{counter}> {ac} </AC{counter}>")
                emitted_upto = end
            # Unlabelled segments are skipped over too, so that their text
            # cannot be mistaken for a later labelled segment.
            search_from = end

        if taggable:
            # The number is consumed even if the segment was not found, which
            # keeps tag numbers aligned with positions in the label list.
            counter += 1

    pieces.append(abstract_text[emitted_upto:])
    return "".join(pieces)

In [4]:
def get_dataframe(dataset):
    """Load one abstRCT CSV split and collapse it to one row per document.

    ``dataset`` is a path relative to the module-level ``data_dir``. The CSV is
    read with pandas, missing values are replaced by empty strings, and rows
    are grouped by ``doc_id``.

    The returned frame has the columns ``doc_id``, ``abstract_text`` (all
    ``text`` cells concatenated), ``ac_types`` (list built by ``process_aty``),
    ``acs_list`` (list of the ``text`` cells) and ``tagged_abstract_text``
    (result of ``insert_ac_tags``).
    """
    segments_df = pd.read_csv(os.path.join(data_dir, dataset))
    filled_df = segments_df.fillna('')

    # Per-document text and space-joined component types.
    per_doc_text = (
        filled_df
        .groupby(["doc_id"])
        .agg({"text": "".join, "aty": " ".join})
        .reset_index()
    )
    per_doc_text["ac_types"] = per_doc_text.apply(process_aty, axis=1)
    per_doc_text = (
        per_doc_text
        .rename(columns={'text': 'abstract_text'})
        .drop(columns=['aty'])
    )

    # Per-document list of the individual text segments.
    per_doc_segments = (
        filled_df
        .groupby('doc_id')['text']
        .agg(list)
        .reset_index()
        .rename(columns={'text': 'acs_list'})
    )

    merged = pd.merge(per_doc_text, per_doc_segments, on='doc_id')
    merged["tagged_abstract_text"] = merged.apply(insert_ac_tags, axis=1)

    return merged

In [5]:
# Folder holding the abstRCT CSV splits, resolved relative to the current
# working directory; read as a global by get_dataframe().
data_dir = os.path.join(os.getcwd(), "../abstRCT/data")

In [6]:
# Per-document frame for the "neo" training split (the only abstRCT split used for training below).
neo_train_df = get_dataframe("neo/train.csv")

In [7]:
# Per-document frame for the "neo" test split.
neo_test_df = get_dataframe("neo/test.csv")

In [8]:
# Per-document frame for the "gla" test split.
gla_test_df = get_dataframe("gla/test.csv")

In [9]:
# Per-document frame for the "mix" test split.
mix_test_df = get_dataframe("mix/test.csv")

In [10]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle the three prompt fields into a single record dict.

    Every argument is rendered through an f-string, so non-string values are
    stored as their formatted text. The parameter name ``input`` shadows the
    builtin; it is kept so keyword callers keep working.

    Returns a dict with the keys ``"instruction"``, ``"input"``, ``"output"``.
    """
    return {
        "instruction": f"{instruction}",
        "input": f"{input}",
        "output": f"{output}",
    }

In [11]:
def write_instruction(nr_acs):
    """Build the classification instruction for a text with ``nr_acs`` components.

    The prompt lists the allowed component types and shows the expected JSON
    answer: a ``"component_types"`` list holding ``nr_acs`` placeholder
    entries. The returned string ends with a space and a newline.
    """
    label_choices = (
        '"Major Claim", "Claim", "Premise", "fact", "policy", '
        '"reference", "testimony" or "value"'
    )
    placeholders = json.dumps(["component_type (str)"] * nr_acs)
    answer_schema = '{"component_types": ' + placeholders + '}'

    return (
        "### You are an expert in Argument Mining. "
        "You are given a text which contains numbered argument components "
        "enclosed by <AC></AC> tags. "
        f"Your task is to classify each argument component in the text as either {label_choices}. "
        f"You must return a list of argument component types, strictly of length {nr_acs}, "
        f"in following JSON format: {answer_schema} "
        f'where each element "component_type (str)" is replaced by either {label_choices}. \n'
    )

In [12]:
def build_input(abstract_text):
    """Return the prompt's input field: a fixed header followed by the abstract."""
    return f"### Here is the abstract text: {abstract_text}"

In [13]:
def build_answer(ac_types):
    """Serialise the gold labels of one abstract as the expected JSON answer.

    ``'none'`` labels are dropped and ``'MajorClaim'`` is folded into
    ``'Claim'``; the remaining labels keep their order.
    """
    kept_labels = []
    for label in ac_types:
        if label == 'none':
            continue
        kept_labels.append('Claim' if label == 'MajorClaim' else label)

    return json.dumps({"component_types": kept_labels})

In [14]:
# Training records from the abstRCT "neo" split. This list is extended further
# down by the CDCP and PE sections.
data_file_train = []

# Use the row yielded by iterrows() directly: the previous version fed the
# index *label* into positional .iloc, which is only correct for a default
# 0..N-1 index, and repeated the lookup three times per row.
for _, row in neo_train_df.iterrows():
    # Only components with a real label are tagged and have to be classified.
    nr_acs = sum(1 for ac_type in row.ac_types if ac_type != 'none')

    instruction = write_instruction(nr_acs)
    question = build_input(row.tagged_abstract_text)
    answer = build_answer(row.ac_types)

    data_file_train.append( formatting_fct(instruction, question, answer) )

In [15]:
# Test records for the abstRCT "neo" split.
data_file_test_neo = []

# Use the row yielded by iterrows() directly: the previous version fed the
# index *label* into positional .iloc, which is only correct for a default
# 0..N-1 index, and repeated the lookup three times per row.
for _, row in neo_test_df.iterrows():
    # Only components with a real label are tagged and have to be classified.
    nr_acs = sum(1 for ac_type in row.ac_types if ac_type != 'none')

    instruction = write_instruction(nr_acs)
    question = build_input(row.tagged_abstract_text)
    answer = build_answer(row.ac_types)

    data_file_test_neo.append( formatting_fct(instruction, question, answer) )

In [16]:
# Test records for the abstRCT "gla" split.
data_file_test_gla = []

# Use the row yielded by iterrows() directly: the previous version fed the
# index *label* into positional .iloc, which is only correct for a default
# 0..N-1 index, and repeated the lookup three times per row.
for _, row in gla_test_df.iterrows():
    # Only components with a real label are tagged and have to be classified.
    nr_acs = sum(1 for ac_type in row.ac_types if ac_type != 'none')

    instruction = write_instruction(nr_acs)
    question = build_input(row.tagged_abstract_text)
    answer = build_answer(row.ac_types)

    data_file_test_gla.append( formatting_fct(instruction, question, answer) )

In [17]:
# Test records for the abstRCT "mix" split.
data_file_test_mix = []

# Use the row yielded by iterrows() directly: the previous version fed the
# index *label* into positional .iloc, which is only correct for a default
# 0..N-1 index, and repeated the lookup three times per row.
for _, row in mix_test_df.iterrows():
    # Only components with a real label are tagged and have to be classified.
    nr_acs = sum(1 for ac_type in row.ac_types if ac_type != 'none')

    instruction = write_instruction(nr_acs)
    question = build_input(row.tagged_abstract_text)
    answer = build_answer(row.ac_types)

    data_file_test_mix.append( formatting_fct(instruction, question, answer) )

## CDCP Dataset

In [18]:
# Load the CDCP corpus; its "train" and "test" splits are consumed below.
# NOTE(review): trust_remote_code=True presumably allows the dataset
# repository's own loading script to run locally — confirm the source is
# trusted (or pin a revision) before re-running.
cdcp_dataset = load_dataset("DFKI-SLT/cdcp", trust_remote_code=True)

In [19]:
def write_instruction(nr_acs):
    """Build the classification instruction for a text with ``nr_acs`` components.

    Same prompt text as the definition in the AbstRCT section of this
    notebook, re-declared for the CDCP section. The prompt lists the allowed
    component types and shows the expected JSON answer with ``nr_acs``
    placeholder entries; it ends with a space and a newline.
    """
    label_choices = (
        '"Major Claim", "Claim", "Premise", "fact", "policy", '
        '"reference", "testimony" or "value"'
    )
    placeholders = json.dumps(["component_type (str)"] * nr_acs)
    answer_schema = '{"component_types": ' + placeholders + '}'

    return (
        "### You are an expert in Argument Mining. "
        "You are given a text which contains numbered argument components "
        "enclosed by <AC></AC> tags. "
        f"Your task is to classify each argument component in the text as either {label_choices}. "
        f"You must return a list of argument component types, strictly of length {nr_acs}, "
        f"in following JSON format: {answer_schema} "
        f'where each element "component_type (str)" is replaced by either {label_choices}. \n'
    )

In [20]:
def formatting_fct(instruction="", input="", output=""):
    """Bundle the three prompt fields into a single record dict.

    Re-declaration of the three-argument helper for the CDCP section. Every
    argument is rendered through an f-string, so non-string values are stored
    as their formatted text.
    """
    return {
        "instruction": f"{instruction}",
        "input": f"{input}",
        "output": f"{output}",
    }

In [21]:
def insert_tags(text, start_indices, end_indices):
    """Surround character spans of ``text`` with numbered ``<ACk>``/``</ACk>`` tags.

    Parameters
    ----------
    text : str
        The untagged text.
    start_indices, end_indices : sequences of int
        Parallel lists of span boundaries (``text[start:end]``), expressed in
        coordinates of the *original* text. Spans are expected in ascending,
        non-overlapping order; the running offset below relies on that.

    Returns
    -------
    str
        ``"### Here is the text: "`` followed by the tagged text. With no spans
        the text is returned untagged (previously this raised
        ``UnboundLocalError``).

    Notes
    -----
    Tags are spliced in by position. The previous implementation used
    ``text.replace(span_text, ...)``, which tagged every occurrence of the
    span's text (and, for an empty span, inserted tags between all
    characters), invalidating the offsets of the following spans.
    """
    offset = 0  # total length of the tags inserted so far

    for i, (start_i, end_i) in enumerate(zip(start_indices, end_indices), start=1):
        start_tag = f"<AC{i}>"
        end_tag = f"</AC{i}>"

        # Shift the original coordinates by what has already been inserted.
        start_idx = start_i + offset
        end_idx = end_i + offset

        text = (
            text[:start_idx]
            + start_tag
            + text[start_idx:end_idx]
            + end_tag
            + text[end_idx:]
        )
        offset += len(start_tag) + len(end_tag)

    return f"""### Here is the text: {text}"""

In [22]:
def get_ac_types(raw_labels):
    """Translate integer CDCP label ids into names and serialise them as JSON.

    Ids index into the fixed list fact / policy / reference / testimony /
    value; the result is the JSON text of ``{"component_types": [...]}``.
    """
    id_to_name = ["fact", "policy", "reference", "testimony", "value"]

    names = []
    for label_id in raw_labels:
        names.append(id_to_name[label_id])

    payload = {"component_types": names}
    return json.dumps(payload)

In [23]:
# data_file_train is deliberately NOT reset here: the CDCP training records
# are appended to the list already filled by the abstRCT section.
for sample in cdcp_dataset["train"]:

    propositions = sample["propositions"]
    raw_labels = propositions["label"]

    data_file_train.append(
        formatting_fct(
            write_instruction(len(raw_labels)),
            insert_tags(sample["text"], propositions["start"], propositions["end"]),
            get_ac_types(raw_labels),
        )
    )

In [24]:
# Test records for CDCP, kept in their own list.
data_file_test_cdcp = []

for sample in cdcp_dataset["test"]:

    propositions = sample["propositions"]
    raw_labels = propositions["label"]

    data_file_test_cdcp.append(
        formatting_fct(
            write_instruction(len(raw_labels)),
            insert_tags(sample["text"], propositions["start"], propositions["end"]),
            get_ac_types(raw_labels),
        )
    )

## PE Dataset

In [25]:
# Persuasive Essays data; the columns used below are essay_id, para_text,
# AC_types and AR_pairs (the last two hold stringified Python lists).
pe_df = pd.read_csv("PE_data.csv")

In [26]:
# Semicolon-separated split file; only its SET column is used below.
df_split = pd.read_csv("train-test-split.csv", sep=";")

In [27]:
# Series.map with a Series argument looks every essay_id up in the *index* of
# df_split['SET'], i.e. by row label of the split file, not by an essay-name column.
# NOTE(review): this presumably relies on essay_id being the 0-based row
# position of the essay in train-test-split.csv — confirm; ids without a
# matching row label become NaN.
pe_df['split'] = pe_df['essay_id'].map(df_split['SET'])

In [28]:
def get_ac_count(x):
    """Return how many entries the stringified list in ``x.AC_types`` holds."""
    parsed_types = ast.literal_eval(x.AC_types)
    return len(parsed_types)

In [29]:
# Number of argument components per row.
pe_df["AC_count"] = pe_df.apply(get_ac_count, axis=1)

In [30]:
def get_ar_pair_count(x):
    """Return how many entries the stringified list in ``x.AR_pairs`` holds."""
    parsed_pairs = ast.literal_eval(x.AR_pairs)
    return len(parsed_pairs)

In [31]:
# Number of relation pairs per row.
pe_df["AR_count"] = pe_df.apply(get_ar_pair_count, axis=1)

In [32]:
# One row per (essay_id, split): paragraph texts and the stringified
# per-paragraph label lists are each concatenated without a separator.
essays_df = pe_df.groupby(["essay_id", "split"]).agg(
    {"para_text": "".join, "AC_types": "".join}
)

In [33]:
# Copy the second index level (the split) into a regular column.
essays_df['split'] = [split_name for _essay_id, split_name in essays_df.index]

In [34]:
def process_ac_types(x):
    """Flatten an essay's concatenated per-paragraph label lists into one list.

    ``x.AC_types`` is the back-to-back concatenation of stringified lists,
    e.g. ``"[]['MajorClaim']['Claim', 'Premise']"``. Empty lists are removed,
    the remaining ``][`` seams are turned into separators, and the resulting
    single list literal is parsed with ``ast.literal_eval``.

    Returns
    -------
    list
        All labels of the essay in order; ``[]`` when every paragraph list is
        empty (previously this case raised ``SyntaxError`` because an empty
        string was handed to ``literal_eval``).
    """
    merged = x.AC_types.replace("[]", "").replace("][", ", ")

    if not merged.strip():
        # Nothing left once the empty lists are stripped: no components at all.
        return []

    return ast.literal_eval(merged)

In [35]:
# Parsed, flattened label list for each essay.
essays_df["Essay_AC_types"] = essays_df.apply(process_ac_types, axis=1)

In [36]:
def formatting_fct(instruction="", input="", output="", mode="train"):
    """Bundle the prompt fields into a record dict, optionally hiding the answer.

    The ``"output"`` field carries ``output`` only when ``mode`` equals
    ``'train'``; for any other mode it is the empty string. All fields are
    rendered through f-strings.
    """
    visible_output = output if mode == 'train' else ''

    return {
        "instruction": f"{instruction}",
        "input": f"{input}",
        "output": f"{visible_output}",
    }

In [37]:
def write_instruction(nr_acs):
    """Build the classification instruction for a text with ``nr_acs`` components.

    Same prompt text as the earlier definitions in this notebook, re-declared
    for the PE section. The prompt lists the allowed component types and shows
    the expected JSON answer with ``nr_acs`` placeholder entries; it ends with
    a space and a newline.

    NOTE(review): the prompt spells the label "Major Claim" (with a space),
    while the PE label lists shown in this notebook contain 'MajorClaim' —
    confirm which spelling downstream evaluation expects.
    """
    label_choices = (
        '"Major Claim", "Claim", "Premise", "fact", "policy", '
        '"reference", "testimony" or "value"'
    )
    placeholders = json.dumps(["component_type (str)"] * nr_acs)
    answer_schema = '{"component_types": ' + placeholders + '}'

    return (
        "### You are an expert in Argument Mining. "
        "You are given a text which contains numbered argument components "
        "enclosed by <AC></AC> tags. "
        f"Your task is to classify each argument component in the text as either {label_choices}. "
        f"You must return a list of argument component types, strictly of length {nr_acs}, "
        f"in following JSON format: {answer_schema} "
        f'where each element "component_type (str)" is replaced by either {label_choices}. \n'
    )

In [38]:
def replace_substring_with_position(main_string, substring):
    """Number every occurrence of a tag inside ``main_string``.

    Each non-overlapping occurrence of ``substring`` (scanned left to right)
    is replaced by ``substring`` minus its last character, followed by the
    1-based occurrence number and ``">"`` — e.g. the second ``"<AC>"``
    becomes ``"<AC2>"`` and the second ``"</AC>"`` becomes ``"</AC2>"``.

    Parameters
    ----------
    main_string : str
        Text to process.
    substring : str
        Tag to number; its final character is dropped and re-added as ``">"``.

    Returns
    -------
    str
        The text with numbered tags. If ``substring`` is empty the text is
        returned unchanged (the former ``find`` loop never advanced in that
        case and did not terminate).
    """
    if not substring:
        return main_string

    stem = substring[:-1]

    # str.split yields exactly the text between successive non-overlapping
    # matches, so occurrence k sits right before rest[k - 1].
    head, *rest = main_string.split(substring)

    pieces = [head]
    for position, tail in enumerate(rest, start=1):
        pieces.append(f"{stem}{position}>")
        pieces.append(tail)

    return "".join(pieces)


In [39]:
def build_input(paragraph):
    """Turn a marked-up essay into the prompt's input field.

    ``<prompt>`` tags are first renamed to ``<topic>``; then the topic and
    paragraph tags are stripped, and the ``<AC>`` / ``</AC>`` tags are numbered
    by occurrence. The result is prefixed with a fixed header.
    """
    # Applied strictly in this order (the rename must precede the stripping).
    substitutions = (
        ("<prompt> ", "<topic> "),
        (" </prompt>", " </topic>"),
        # Remove the entries below to keep topic/paragraph tags in the text.
        ("<topic> ", ""),
        (" </topic>", ""),
        ("<para-intro> ", ""),
        (" </para-intro>", ""),
        ("<para-body> ", ""),
        (" </para-body>", ""),
        ("<para-conclusion> ", ""),
        (" </para-conclusion>", ""),
    )
    for old, new in substitutions:
        paragraph = paragraph.replace(old, new)

    # Opening tags first, then closing tags, each numbered independently.
    for tag in ("<AC>", "</AC>"):
        paragraph = replace_substring_with_position(paragraph, tag)

    return f"### Here is the essay text: {paragraph}"

In [40]:
def build_answer(ac_types):
    """Serialise an essay's label list, unchanged, as the expected JSON answer."""
    payload = {"component_types": ac_types}
    return json.dumps(payload)

In [41]:
essays_df

Unnamed: 0_level_0,Unnamed: 1_level_0,para_text,AC_types,split,Essay_AC_types
essay_id,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,TRAIN,<prompt> Should students be taught to compete ...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TRAIN,"[MajorClaim, Claim, Premise, Premise, Premise,..."
1,TRAIN,<prompt> More people are migrating to other co...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN,"[MajorClaim, Premise, Premise, Premise, Premis..."
2,TRAIN,<prompt> International tourism is now more com...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN,"[MajorClaim, Premise, Premise, Premise, Claim,..."
3,TEST,<prompt> Will newspapers become a thing of the...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TEST,"[MajorClaim, Premise, Premise, Premise, Claim,..."
4,TEST,"<prompt> Government budget focus , young child...","[]['MajorClaim']['Premise', 'Premise', 'Premis...",TEST,"[MajorClaim, Premise, Premise, Premise, Claim,..."
...,...,...,...,...,...
397,TEST,<prompt> The maintenance of traditional skills...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TEST,"[MajorClaim, Claim, Premise, Premise, Claim, P..."
398,TRAIN,<prompt> University education restriction </pr...,"[][]['Claim', 'Premise', 'Premise', 'Premise']...",TRAIN,"[Claim, Premise, Premise, Premise, Premise, Cl..."
399,TRAIN,<prompt> Police force carries guns - significa...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN,"[MajorClaim, Premise, Premise, Premise, Premis..."
400,TRAIN,<prompt> Gun control and increasing violence <...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TRAIN,"[MajorClaim, Claim, Premise, Premise, Claim, P..."


In [42]:
# data_file_train is deliberately NOT reset here: the PE training essays are
# appended to the records collected in the previous sections.
#
# The row yielded by iterrows() is used directly. The previous version took
# index[0] (the essay_id *label*) and used it as a positional .iloc index into
# the unfiltered frame, which returns a different essay as soon as essay ids
# are not exactly 0..N-1 in row order.
for _, essay in essays_df[essays_df["split"] == "TRAIN"].iterrows():

    instruction = write_instruction(len(essay.Essay_AC_types))
    question = build_input(essay.para_text)
    answer = build_answer(essay.Essay_AC_types)

    data_file_train.append( formatting_fct(instruction, question, answer, mode="train") )

In [43]:
# Test records for the PE essays.
data_file_test_pe = []

# The row yielded by iterrows() is used directly. The previous version took
# index[0] (the essay_id *label*) and used it as a positional .iloc index into
# the unfiltered frame, which returns a different essay as soon as essay ids
# are not exactly 0..N-1 in row order.
for _, essay in essays_df[essays_df["split"] == "TEST"].iterrows():

    instruction = write_instruction(len(essay.Essay_AC_types))
    question = build_input(essay.para_text)
    answer = build_answer(essay.Essay_AC_types)

    # mode="train" is passed on purpose-looking grounds: with any other mode
    # formatting_fct blanks the "output" field, so this keeps the gold answers
    # in the test records.
    data_file_test_pe.append( formatting_fct(instruction, question, answer, mode="train") )

In [44]:
len(data_file_train)

1252

In [45]:
len(data_file_test_neo), len(data_file_test_gla), len(data_file_test_mix), len(data_file_test_cdcp), len(data_file_test_pe)

(100, 100, 100, 150, 80)

## Save JSON Files

In [46]:
data_file_train[0]

{'instruction': '### You are an expert in Argument Mining. You are given a text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "Major Claim", "Claim", "Premise", "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length 8, in following JSON format: {"component_types": ["component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)"]} where each element "component_type (str)" is replaced by either "Major Claim", "Claim", "Premise", "fact", "policy", "reference", "testimony" or "value". \n',
 'input': '### Here is the abstract text:  Single-agent therapy with bicalutamide, a nonsteroidal antiandrogen, was compared with castration, either surgical or medical, in patients with untreated Stage D2 prostate

In [47]:
data_file_train[550]

{'instruction': '### You are an expert in Argument Mining. You are given a text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "Major Claim", "Claim", "Premise", "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length 14, in following JSON format: {"component_types": ["component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)"]} where each element "component_type (str)" is replaced by either "Major Claim", "Claim", "Premise", "fact", "policy", "reference", "testimony" or "value". \n',
 'input': '### Here is the text: <AC1>Any collector who uses a roboca

In [48]:
data_file_train[1150]

{'instruction': '### You are an expert in Argument Mining. You are given a text which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument component in the text as either "Major Claim", "Claim", "Premise", "fact", "policy", "reference", "testimony" or "value". You must return a list of argument component types, strictly of length 12, in following JSON format: {"component_types": ["component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)", "component_type (str)"]} where each element "component_type (str)" is replaced by either "Major Claim", "Claim", "Premise", "fact", "policy", "reference", "testimony" or "value". \n',
 'input': "### Here is the essay text: Is it necessary to teach children handwriting ?In this age of modern technolog

In [49]:
# Write the combined training records (abstRCT + CDCP + PE) as one JSON array.
file_path = "mega_acc_train.json"

with open(file_path, 'w') as out_file:
    json.dump(data_file_train, out_file)

In [51]:
# Write the abstRCT "neo" test records as one JSON array.
file_path = "abstRCT_acc_test_neo.json"

with open(file_path, 'w') as out_file:
    json.dump(data_file_test_neo, out_file)

In [52]:
# Write the abstRCT "gla" test records as one JSON array.
file_path = "abstRCT_acc_test_gla.json"

with open(file_path, 'w') as out_file:
    json.dump(data_file_test_gla, out_file)

In [53]:
# Write the abstRCT "mix" test records as one JSON array.
file_path = "abstRCT_acc_test_mix.json"

with open(file_path, 'w') as out_file:
    json.dump(data_file_test_mix, out_file)

In [54]:
# Write the CDCP test records as one JSON array.
file_path = "CDCP_acc_test.json"

with open(file_path, 'w') as out_file:
    json.dump(data_file_test_cdcp, out_file)

In [55]:
# Write the PE test records as one JSON array.
file_path = "PE_ATC_essay_wo_tags_test.json"

with open(file_path, 'w') as out_file:
    json.dump(data_file_test_pe, out_file)