# Prepare dataset (jsonl file)

- Prepare PE datasets for llama factory.

- Argument Type Classification (essay wise), with/without paragraph tags

- We create the data files: `PE_ATC_train.json`, `PE_ATC_test.json`

## Libraries

In [159]:
import os
import ast
import json
import random
import pickle
import pandas as pd

In [160]:
random.seed(42)

## Load Data

In [161]:
data_dir = os.path.join(os.getcwd(), "datasets")

In [162]:
pe_df = pd.read_csv(os.path.join(data_dir, "PE_data.csv"))

In [163]:
# pe_df.isna().sum()

In [164]:
# pe_df

### Load train-test-split CSV 

In [165]:
df_split = pd.read_csv("datasets/train-test-split.csv", sep=";")

In [166]:
# df_split

In [167]:
pe_df['split'] = pe_df['essay_id'].map(df_split['SET'])

In [168]:
# pe_df

In [169]:
def get_ac_count(x):

    return len(ast.literal_eval(x.AC_types))

In [170]:
pe_df["AC_count"] = pe_df.apply(lambda x: get_ac_count(x), axis=1)

In [171]:
# pe_df

In [172]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

In [173]:
pe_df

Unnamed: 0,essay_id,para_id,para_types,para_text,adu_spans,ac_spans,ai_spans,AC_types,AR_pairs,AR_types,split,AC_count
0,0,0,prompt,<prompt> Should students be taught to compete ...,[],[],[],[],[],[],TRAIN,0
1,0,1,intro,<para-intro> It is always said that competitio...,"[(76, 97)]","[(86, 97)]","[(76, 85)]",['MajorClaim'],[],[],TRAIN,1
2,0,2,body,"<para-body> First of all , <AC> through cooper...","[(1, 25), (26, 55), (56, 99), (100, 123)]","[(5, 25), (27, 55), (57, 99), (101, 123)]","[(1, 4), (26, 26), (56, 56), (100, 100)]","['Claim', 'Premise', 'Premise', 'Premise']","[(0, 1), (0, 2), (0, 3)]","['Support', 'Support', 'Support']",TRAIN,4
3,0,3,body,"<para-body> On the other hand , <AC> the signi...","[(1, 22), (24, 37), (39, 63), (76, 139), (155,...","[(6, 22), (30, 37), (41, 63), (77, 139), (156,...","[(1, 5), (24, 29), (39, 40), (76, 76), (155, 1...","['Premise', 'Claim', 'Premise', 'Premise', 'Cl...","[(1, 0), (4, 2), (4, 3)]","['Support', 'Support', 'Support']",TRAIN,5
4,0,4,conclusion,"<para-conclusion> Consequently , no matter fro...","[(1, 40)]","[(25, 40)]","[(1, 24)]",['MajorClaim'],[],[],TRAIN,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2230,401,2230,prompt,<prompt> A greater proportion of the budget sh...,[],[],[],[],[],[],TRAIN,0
2231,401,2231,intro,"<para-intro> In today ' s world , the concept ...","[(26, 51)]","[(33, 51)]","[(26, 32)]",['MajorClaim'],[],[],TRAIN,1
2232,401,2232,body,<para-body> <AC> The first reason why educatio...,"[(0, 22), (24, 56), (58, 71), (72, 98)]","[(1, 22), (27, 56), (60, 71), (74, 98)]","[(0, 0), (24, 26), (58, 59), (72, 73)]","['Premise', 'Premise', 'Claim', 'Premise']","[(2, 0), (2, 1), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4
2233,401,2233,body,<para-body> The second reason why <AC> governm...,"[(1, 25), (27, 53), (55, 88), (90, 128)]","[(5, 25), (31, 53), (57, 88), (93, 128)]","[(1, 4), (27, 30), (55, 56), (90, 92)]","['Claim', 'Premise', 'Premise', 'Premise']","[(2, 1), (0, 2), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4


In [174]:
def get_ar_pair_count(x):

    return len(ast.literal_eval(x.AR_pairs))

In [175]:
pe_df["AR_count"] = pe_df.apply(lambda x: get_ar_pair_count(x), axis=1)

In [176]:
# Remove paragraphs with 0 ACs

In [177]:
#pe_df = pe_df[pe_df.AC_count > 0].reset_index() 

In [178]:
pe_df

Unnamed: 0,essay_id,para_id,para_types,para_text,adu_spans,ac_spans,ai_spans,AC_types,AR_pairs,AR_types,split,AC_count,AR_count
0,0,0,prompt,<prompt> Should students be taught to compete ...,[],[],[],[],[],[],TRAIN,0,0
1,0,1,intro,<para-intro> It is always said that competitio...,"[(76, 97)]","[(86, 97)]","[(76, 85)]",['MajorClaim'],[],[],TRAIN,1,0
2,0,2,body,"<para-body> First of all , <AC> through cooper...","[(1, 25), (26, 55), (56, 99), (100, 123)]","[(5, 25), (27, 55), (57, 99), (101, 123)]","[(1, 4), (26, 26), (56, 56), (100, 100)]","['Claim', 'Premise', 'Premise', 'Premise']","[(0, 1), (0, 2), (0, 3)]","['Support', 'Support', 'Support']",TRAIN,4,3
3,0,3,body,"<para-body> On the other hand , <AC> the signi...","[(1, 22), (24, 37), (39, 63), (76, 139), (155,...","[(6, 22), (30, 37), (41, 63), (77, 139), (156,...","[(1, 5), (24, 29), (39, 40), (76, 76), (155, 1...","['Premise', 'Claim', 'Premise', 'Premise', 'Cl...","[(1, 0), (4, 2), (4, 3)]","['Support', 'Support', 'Support']",TRAIN,5,3
4,0,4,conclusion,"<para-conclusion> Consequently , no matter fro...","[(1, 40)]","[(25, 40)]","[(1, 24)]",['MajorClaim'],[],[],TRAIN,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230,401,2230,prompt,<prompt> A greater proportion of the budget sh...,[],[],[],[],[],[],TRAIN,0,0
2231,401,2231,intro,"<para-intro> In today ' s world , the concept ...","[(26, 51)]","[(33, 51)]","[(26, 32)]",['MajorClaim'],[],[],TRAIN,1,0
2232,401,2232,body,<para-body> <AC> The first reason why educatio...,"[(0, 22), (24, 56), (58, 71), (72, 98)]","[(1, 22), (27, 56), (60, 71), (74, 98)]","[(0, 0), (24, 26), (58, 59), (72, 73)]","['Premise', 'Premise', 'Claim', 'Premise']","[(2, 0), (2, 1), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4,3
2233,401,2233,body,<para-body> The second reason why <AC> governm...,"[(1, 25), (27, 53), (55, 88), (90, 128)]","[(5, 25), (31, 53), (57, 88), (93, 128)]","[(1, 4), (27, 30), (55, 56), (90, 92)]","['Claim', 'Premise', 'Premise', 'Premise']","[(2, 1), (0, 2), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4,3


In [179]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

In [180]:
essays_df=pe_df.groupby(["essay_id", "split"]).agg({"para_text": "".join, "AC_types": "".join})#.iloc[0]["AC_types"]

In [181]:
essays_df

Unnamed: 0_level_0,Unnamed: 1_level_0,para_text,AC_types
essay_id,split,Unnamed: 2_level_1,Unnamed: 3_level_1
0,TRAIN,<prompt> Should students be taught to compete ...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'..."
1,TRAIN,<prompt> More people are migrating to other co...,"[]['MajorClaim']['Premise', 'Premise', 'Premis..."
2,TRAIN,<prompt> International tourism is now more com...,"[]['MajorClaim']['Premise', 'Premise', 'Premis..."
3,TEST,<prompt> Will newspapers become a thing of the...,"[]['MajorClaim']['Premise', 'Premise', 'Premis..."
4,TEST,"<prompt> Government budget focus , young child...","[]['MajorClaim']['Premise', 'Premise', 'Premis..."
...,...,...,...
397,TEST,<prompt> The maintenance of traditional skills...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'..."
398,TRAIN,<prompt> University education restriction </pr...,"[][]['Claim', 'Premise', 'Premise', 'Premise']..."
399,TRAIN,<prompt> Police force carries guns - significa...,"[]['MajorClaim']['Premise', 'Premise', 'Premis..."
400,TRAIN,<prompt> Gun control and increasing violence <...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'..."


In [182]:
essays_df['split'] = [x[1] for x in essays_df.index]

In [183]:
essays_df

Unnamed: 0_level_0,Unnamed: 1_level_0,para_text,AC_types,split
essay_id,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,TRAIN,<prompt> Should students be taught to compete ...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TRAIN
1,TRAIN,<prompt> More people are migrating to other co...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN
2,TRAIN,<prompt> International tourism is now more com...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN
3,TEST,<prompt> Will newspapers become a thing of the...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TEST
4,TEST,"<prompt> Government budget focus , young child...","[]['MajorClaim']['Premise', 'Premise', 'Premis...",TEST
...,...,...,...,...
397,TEST,<prompt> The maintenance of traditional skills...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TEST
398,TRAIN,<prompt> University education restriction </pr...,"[][]['Claim', 'Premise', 'Premise', 'Premise']...",TRAIN
399,TRAIN,<prompt> Police force carries guns - significa...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN
400,TRAIN,<prompt> Gun control and increasing violence <...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TRAIN


In [184]:
def process_ac_types(x):

    x = x.AC_types
    x = x.replace("[]", "")
    x = x.replace("][",", ") 
    x = ast.literal_eval(x)

    return x

In [185]:
essays_df["Essay_AC_types"] = essays_df.apply(lambda x: process_ac_types(x), axis=1)

In [186]:
essays_df

Unnamed: 0_level_0,Unnamed: 1_level_0,para_text,AC_types,split,Essay_AC_types
essay_id,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,TRAIN,<prompt> Should students be taught to compete ...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TRAIN,"[MajorClaim, Claim, Premise, Premise, Premise,..."
1,TRAIN,<prompt> More people are migrating to other co...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN,"[MajorClaim, Premise, Premise, Premise, Premis..."
2,TRAIN,<prompt> International tourism is now more com...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN,"[MajorClaim, Premise, Premise, Premise, Claim,..."
3,TEST,<prompt> Will newspapers become a thing of the...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TEST,"[MajorClaim, Premise, Premise, Premise, Claim,..."
4,TEST,"<prompt> Government budget focus , young child...","[]['MajorClaim']['Premise', 'Premise', 'Premis...",TEST,"[MajorClaim, Premise, Premise, Premise, Claim,..."
...,...,...,...,...,...
397,TEST,<prompt> The maintenance of traditional skills...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TEST,"[MajorClaim, Claim, Premise, Premise, Claim, P..."
398,TRAIN,<prompt> University education restriction </pr...,"[][]['Claim', 'Premise', 'Premise', 'Premise']...",TRAIN,"[Claim, Premise, Premise, Premise, Premise, Cl..."
399,TRAIN,<prompt> Police force carries guns - significa...,"[]['MajorClaim']['Premise', 'Premise', 'Premis...",TRAIN,"[MajorClaim, Premise, Premise, Premise, Premis..."
400,TRAIN,<prompt> Gun control and increasing violence <...,"[]['MajorClaim']['Claim', 'Premise', 'Premise'...",TRAIN,"[MajorClaim, Claim, Premise, Premise, Claim, P..."


## Prepare prompt

In [187]:
def formatting_fct(instruction="", input="", output="", mode="train"):
    
    prompt_d ={
            
        "instruction": f"""{instruction}""",
        "input": f"""{input}""",
        "output": f"""{output if mode=='train' else ''}"""
            
        }
    
    return prompt_d

In [188]:
instruction = """### You are an expert in Argument Mining. You are given an essay which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument components in the essay as either "MajorClaim", "Claim" or "Premise". You must return a list of argument component types in following JSON format: {"component_types": [component_type (str), component_type (str), ..., component_type (str)]}
"""

In [189]:
print(instruction)

### You are an expert in Argument Mining. You are given an essay which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument components in the essay as either "MajorClaim", "Claim" or "Premise". You must return a list of argument component types in following JSON format: {"component_types": [component_type (str), component_type (str), ..., component_type (str)]}



In [190]:
def replace_substring_with_position(main_string, substring):
    result = ""
    start = 0
    current_index = 0
    position = 0  # Initialize position counter
    
    while current_index < len(main_string):
        current_index = main_string.find(substring, current_index)
        if current_index == -1:
            break
        
        # Append part of the string before the current match
        result += main_string[start:current_index]
        
        # Append the replacement
        sstring = substring[:-1]
        # ac_type = ac_types[position]
        
        result += f"{sstring}{position}>"
        
        # Update the start to be the end of the current match
        start = current_index + len(substring)
        current_index = start
        
        # Increment the position counter
        position += 1
    
    # Append any remaining part of the string
    result += main_string[start:]
    
    return result


In [191]:
def build_input(paragraph):
    
    paragraph = paragraph.replace("<prompt> ", "<topic> ")
    paragraph = paragraph.replace(" </prompt>", " </topic>")
    # comment next lines for with paragraph tags
    paragraph = paragraph.replace("<topic> ", "")
    paragraph = paragraph.replace(" </topic>", "")
    paragraph = paragraph.replace("<para-intro> ", "")
    paragraph = paragraph.replace(" </para-intro>", "")
    paragraph = paragraph.replace("<para-body> ", "")
    paragraph = paragraph.replace(" </para-body>", "")
    paragraph = paragraph.replace("<para-conclusion> ", "")
    paragraph = paragraph.replace(" </para-conclusion>", "")
    
    
    paragraph = replace_substring_with_position(paragraph, "<AC>")
    paragraph = replace_substring_with_position(paragraph, "</AC>")

    
    question = f"""### Here is the essay text: {paragraph}"""
    
    return question

In [192]:
def build_answer(ac_types):

    
    return json.dumps({"component_types": ac_types})

In [193]:
# print(formatting_fct(instruction, question, answer, mode="train"))

In [194]:
essays_df.split.value_counts()

split
TRAIN    322
TEST      80
Name: count, dtype: int64

## Prepare data files

### Train set

In [195]:
data_file_train = []

for index, _ in essays_df[essays_df["split"] == "TRAIN"].iterrows():
    i = index[0]
    question = build_input(essays_df.iloc[i].para_text)
    answer = build_answer(essays_df.iloc[i].Essay_AC_types)
    
    data_file_train.append( formatting_fct(instruction, question, answer, mode="train") )

In [196]:
len(data_file_train)

322

In [197]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given an essay which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument components in the essay as either "MajorClaim", "Claim" or "Premise". You must return a list of argument component types in following JSON format: {"component_types": [component_type (str), component_type (str), ..., component_type (str)]}\n', 'input': "### Here is the essay text: Should students be taught to compete or to cooperate ?It is always said that competition can effectively promote the development of economy . In order to survive in the competition , companies continue to improve their products and service , and as a result , the whole society prospers . However , when we discuss the issue of competition or cooperation , what we are concerned about is not the whole society , but the development of an individual ' s whole life . From this point of view , I firmly believe that <AC0> we sh

In [198]:
print(data_file_train[2]["output"])

{"component_types": ["MajorClaim", "Premise", "Premise", "Premise", "Claim", "Premise", "Premise", "Premise", "Claim", "MajorClaim"]}


In [199]:
len(data_file_train)

322

In [200]:
type(json.loads(data_file_train[2]["output"])["component_types"])

list

### Validation set (optional)

In [37]:
# val_length = int(1417/10)

In [38]:
# val_length

In [39]:
# data_file_val = random.sample(data_file_train, val_length)

In [40]:
# len(data_file_val)

In [41]:
# data_file_train = [message for message in data_file_train if message not in data_file_val]

In [42]:
# len(data_file_train)

In [43]:
# data_file_val = []

# for i, _ in df[df["essay_file"].isin(val_essays_l)].iterrows():
    
#     question = build_question(df.iloc[i].argument_component)
#     answer = build_answer(df.iloc[i].label)
    
#     data_file_val.append( formatting_fct(my_task_description, question, answer, mode="train") )

In [44]:
#len(data_file_val)

In [45]:
# for i in range(3):
    
#     print(data_file_val[i])
#     print()

### Test set

In [201]:
data_file_test = []

for index, _ in essays_df[essays_df.split == "TEST"].iterrows():    
    i = index[0]
    question = build_input(essays_df.iloc[i].para_text)
    answer = build_answer(essays_df.iloc[i].Essay_AC_types)
    
    data_file_test.append( formatting_fct(instruction, question, answer, mode="train") )

In [202]:
len(data_file_test)

80

In [203]:
for i in range(3):
    
    print(data_file_test[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given an essay which contains numbered argument components enclosed by <AC></AC> tags. Your task is to classify each argument components in the essay as either "MajorClaim", "Claim" or "Premise". You must return a list of argument component types in following JSON format: {"component_types": [component_type (str), component_type (str), ..., component_type (str)]}\n', 'input': "### Here is the essay text: Will newspapers become a thing of the past ?The internet has been more and more popular for recent years , providing people with a huge source of information . As a result of this , print media such as newspapers have experienced a dramatic decline in the number of readers . Some people , however , still believe that they can exist for long time ; others disagree , arguing that <AC0> newspapers have lost their competitive advantage to sustain their prolonged existence </AC0> . Personally , I am inclined to agree with th

## Save `jsonl` files

In [204]:
file_path = os.path.join(os.getcwd(), "../datasets/PE_ATC_essay_wo_tags_train.json")

with open(file_path, 'w') as file:
    
    json.dump(data_file_train, file)

In [205]:
file_path = os.path.join(os.getcwd(), "../datasets/PE_ATC_essay_wo_tags_test.json")

with open(file_path, 'w') as file:
    
    json.dump(data_file_test, file)