# Prepare dataset (json file)

- Prepare PE datasets for llama factory.

- Paragraph-level link identification and link type classification (LI-LTC joint), with/without paragraph tags

## Libraries

In [1]:
import os
import ast
import json
import random
import pickle
import pandas as pd

In [2]:
# Seed Python's built-in RNG so any use of `random` is repeatable across runs.
random.seed(42)

## Load Data

In [3]:
# Dataset folder, resolved against the current working directory of the kernel.
data_dir = os.path.join(os.getcwd(), "datasets")

In [4]:
# Load the paragraph-level table. List-like columns (e.g. AC_types) are read as
# plain strings and are parsed later with ast.literal_eval.
pe_df = pd.read_csv(os.path.join(data_dir, "PE_data.csv"))

In [5]:
# pe_df.isna().sum()

In [6]:
pe_df

Unnamed: 0,essay_id,para_id,para_types,para_text,adu_spans,ac_spans,ai_spans,AC_types,AR_pairs,AR_types
0,0,0,prompt,<prompt> Should students be taught to compete ...,[],[],[],[],[],[]
1,0,1,intro,<para-intro> It is always said that competitio...,"[(76, 97)]","[(86, 97)]","[(76, 85)]",['MajorClaim'],[],[]
2,0,2,body,"<para-body> First of all , <AC> through cooper...","[(1, 25), (26, 55), (56, 99), (100, 123)]","[(5, 25), (27, 55), (57, 99), (101, 123)]","[(1, 4), (26, 26), (56, 56), (100, 100)]","['Claim', 'Premise', 'Premise', 'Premise']","[(0, 1), (0, 2), (0, 3)]","['Support', 'Support', 'Support']"
3,0,3,body,"<para-body> On the other hand , <AC> the signi...","[(1, 22), (24, 37), (39, 63), (76, 139), (155,...","[(6, 22), (30, 37), (41, 63), (77, 139), (156,...","[(1, 5), (24, 29), (39, 40), (76, 76), (155, 1...","['Premise', 'Claim', 'Premise', 'Premise', 'Cl...","[(1, 0), (4, 2), (4, 3)]","['Support', 'Support', 'Support']"
4,0,4,conclusion,"<para-conclusion> Consequently , no matter fro...","[(1, 40)]","[(25, 40)]","[(1, 24)]",['MajorClaim'],[],[]
...,...,...,...,...,...,...,...,...,...,...
2230,401,2230,prompt,<prompt> A greater proportion of the budget sh...,[],[],[],[],[],[]
2231,401,2231,intro,"<para-intro> In today ' s world , the concept ...","[(26, 51)]","[(33, 51)]","[(26, 32)]",['MajorClaim'],[],[]
2232,401,2232,body,<para-body> <AC> The first reason why educatio...,"[(0, 22), (24, 56), (58, 71), (72, 98)]","[(1, 22), (27, 56), (60, 71), (74, 98)]","[(0, 0), (24, 26), (58, 59), (72, 73)]","['Premise', 'Premise', 'Claim', 'Premise']","[(2, 0), (2, 1), (2, 3)]","['Support', 'Support', 'Support']"
2233,401,2233,body,<para-body> The second reason why <AC> governm...,"[(1, 25), (27, 53), (55, 88), (90, 128)]","[(5, 25), (31, 53), (57, 88), (93, 128)]","[(1, 4), (27, 30), (55, 56), (90, 92)]","['Claim', 'Premise', 'Premise', 'Premise']","[(2, 1), (0, 2), (2, 3)]","['Support', 'Support', 'Support']"


In [7]:
set(pe_df.AC_types.values)

{"['Claim', 'Claim', 'Claim', 'MajorClaim']",
 "['Claim', 'Claim', 'MajorClaim']",
 "['Claim', 'Claim', 'Premise', 'Claim', 'Premise']",
 "['Claim', 'Claim', 'Premise', 'MajorClaim']",
 "['Claim', 'Claim', 'Premise', 'Premise', 'Premise']",
 "['Claim', 'Claim', 'Premise', 'Premise']",
 "['Claim', 'Claim', 'Premise']",
 "['Claim', 'Claim']",
 "['Claim', 'MajorClaim', 'Claim', 'Claim']",
 "['Claim', 'MajorClaim', 'Claim']",
 "['Claim', 'MajorClaim']",
 "['Claim', 'Premise', 'Claim', 'Claim', 'MajorClaim']",
 "['Claim', 'Premise', 'Claim', 'Premise', 'MajorClaim']",
 "['Claim', 'Premise', 'Claim', 'Premise', 'Premise', 'Premise']",
 "['Claim', 'Premise', 'Claim', 'Premise', 'Premise']",
 "['Claim', 'Premise', 'Claim']",
 "['Claim', 'Premise', 'MajorClaim']",
 "['Claim', 'Premise', 'Premise', 'Claim', 'Premise', 'Claim']",
 "['Claim', 'Premise', 'Premise', 'Claim', 'Premise', 'Premise', 'Premise']",
 "['Claim', 'Premise', 'Premise', 'Claim', 'Premise', 'Premise']",
 "['Claim', 'Premise', '

### Load train-test-split CSV 

In [8]:
# Read the split file (semicolon-separated) from the same data_dir as
# PE_data.csv instead of a second, hard-coded relative path.
df_split = pd.read_csv(os.path.join(data_dir, "train-test-split.csv"), sep=";")

In [9]:
# df_split

In [10]:
# Series.map with a Series argument looks each essay_id up in df_split's index
# (the default 0..n-1 index produced by read_csv) and returns the matching SET value.
# NOTE(review): this assumes row i of the split CSV describes essay_id i — confirm.
pe_df['split'] = pe_df['essay_id'].map(df_split['SET'])

In [11]:
# pe_df

In [12]:
def get_ac_count(x):
    """Return the number of argument components listed in a row.

    `x.AC_types` is expected to be the string form of a Python list
    (for example "['Claim', 'Premise']"); it is parsed with
    `ast.literal_eval` and the length of the resulting list is returned.
    """
    component_types = ast.literal_eval(x.AC_types)
    n_components = len(component_types)
    return n_components

In [13]:
# Row-wise count of argument components; get_ac_count already takes a row,
# so it is passed to apply directly.
pe_df["AC_count"] = pe_df.apply(get_ac_count, axis=1)

In [14]:
# pe_df

In [15]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

In [16]:
# Remove paragraphs with 0 ACs

In [17]:
# pe_df = pe_df[pe_df.AC_count > 0].reset_index() 
# we want model to learn that when there are 0 ACs, there should be 0 ARs

In [18]:
# pe_df

In [19]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

## Prepare prompt

In [20]:
def formatting_fct(instruction="", input="", output="", mode="train"):
    """Assemble one instruction / input / output record as a dict.

    Every field is rendered to a string. The "output" field carries the
    given `output` only when `mode` equals "train"; for any other mode it
    is an empty string.
    """
    if mode == 'train':
        target_text = f"{output}"
    else:
        target_text = ""

    return {
        "instruction": f"{instruction}",
        "input": f"{input}",
        "output": target_text,
    }

In [21]:
instruction = """### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. Your task is to classify the argument components as well as to identify and classify argument relations between argument components in the paragraph. For each argument component, its AC type (str) is either "MajorClaim", "Claim" or "Premise". For each argument relation (target AC (int), source AC (int)), its link type (str) is either "Support" or "Attack". You must return two lists in following JSON format: {"list_component_types": [AC type (str), ..., AC type (str)], "list_argument_relations_and_types": [[target AC (int), source AC (int), link type (str)], ..., [target AC (int), source AC (int), link type (str)]]}
"""

In [22]:
print(instruction)

### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. Your task is to classify the argument components as well as to identify and classify argument relations between argument components in the paragraph. For each argument component, its AC type (str) is either "MajorClaim", "Claim" or "Premise". For each argument relation (target AC (int), source AC (int)), its link type (str) is either "Support" or "Attack". You must return two lists in following JSON format: {"list_component_types": [AC type (str), ..., AC type (str)], "list_argument_relations_and_types": [[target AC (int), source AC (int), link type (str)], ..., [target AC (int), source AC (int), link type (str)]]}



In [23]:
def replace_substring_with_position(main_string, substring, ac_types):
    """Number every occurrence of a tag inside a string.

    Each occurrence of `substring` (e.g. "<AC>" or "</AC>") is replaced by the
    same tag with its 0-based occurrence index inserted before the final
    character: the first "<AC>" becomes "<AC0>", the second "<AC1>", and so on.

    Parameters
    ----------
    main_string : str
        Text to scan.
    substring : str
        Tag to number; must be non-empty.
    ac_types : list
        Kept for interface compatibility only. The replacement text does not
        use it, so its length no longer has to match the number of tags
        (the previous unused lookup raised IndexError when it was shorter).

    Returns
    -------
    str
        `main_string` with every occurrence of `substring` numbered.

    Raises
    ------
    ValueError
        If `substring` is empty (an empty tag would match at every position).
    """
    if not substring:
        raise ValueError("substring must be a non-empty string")

    # Everything but the closing character, e.g. "<AC" for "<AC>".
    tag_prefix = substring[:-1]

    # Collect pieces and join once instead of growing a string in the loop.
    pieces = []
    start = 0
    position = 0

    while True:
        match_index = main_string.find(substring, start)
        if match_index == -1:
            break

        # Text between the previous match and this one, then the numbered tag.
        pieces.append(main_string[start:match_index])
        pieces.append(f"{tag_prefix}{position}>")

        # Continue searching right after the tag just replaced.
        start = match_index + len(substring)
        position += 1

    # Whatever follows the last match (or the whole string if there was none).
    pieces.append(main_string[start:])

    return "".join(pieces)


In [24]:
def build_input(paragraph, ac_types, keep_paragraph_tags=False):
    """Build the model-input text for one paragraph.

    Parameters
    ----------
    paragraph : str
        Paragraph text containing "<AC>" / "</AC>" markers and, optionally, a
        "<prompt>" or "<para-...>" tag pair.
    ac_types : str
        String form of the list of component types; parsed with
        `ast.literal_eval` and forwarded to `replace_substring_with_position`.
    keep_paragraph_tags : bool, default False
        When False (the default, matching the "without paragraph tags"
        variant), the topic / para-intro / para-body / para-conclusion tags
        are stripped from the text. When True they are kept, with "<prompt>"
        renamed to "<topic>". This replaces manually commenting out the
        tag-stripping lines.

    Returns
    -------
    str
        "### Here is the paragraph text: " followed by the paragraph in which
        every "<AC>" / "</AC>" carries its 0-based occurrence index.
    """
    paragraph = paragraph.replace("<prompt> ", "<topic> ")
    paragraph = paragraph.replace(" </prompt>", " </topic>")

    if not keep_paragraph_tags:
        # Remove each opening tag (with its trailing space) and each closing
        # tag (with its leading space), in the same order as before.
        for tag in ("topic", "para-intro", "para-body", "para-conclusion"):
            paragraph = paragraph.replace(f"<{tag}> ", "")
            paragraph = paragraph.replace(f" </{tag}>", "")

    # Parse once and reuse for both the opening and the closing tags.
    parsed_ac_types = ast.literal_eval(ac_types)
    paragraph = replace_substring_with_position(paragraph, "<AC>", parsed_ac_types)
    paragraph = replace_substring_with_position(paragraph, "</AC>", parsed_ac_types)

    return f"""### Here is the paragraph text: {paragraph}"""

In [25]:
def build_answer(ac_types, ar_pairs, ar_types):
    """Serialise the labels of one paragraph as a JSON string.

    All three arguments are string forms of Python lists and are parsed with
    `ast.literal_eval`. Each relation pair is combined with its link type into
    a three-element list `[pair[0], pair[1], link_type]`; pairs and types are
    matched positionally with `zip`.

    Returns a JSON object with the keys "list_component_types" and
    "list_argument_relations_and_types".
    """
    relations = []
    for pair, link_type in zip(ast.literal_eval(ar_pairs), ast.literal_eval(ar_types)):
        relations.append([pair[0], pair[1], link_type])

    component_types = list(ast.literal_eval(ac_types))

    payload = {
        "list_component_types": component_types,
        "list_argument_relations_and_types": relations,
    }
    return json.dumps(payload)

In [26]:
# print(formatting_fct(instruction, question, answer, mode="train"))

In [27]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

## Prepare data files

### Train set

In [28]:
data_file_train = []

# Use the row yielded by iterrows() directly. iterrows() returns index *labels*,
# so passing them to .iloc (positional) is only correct while pe_df keeps its
# default 0..n-1 index, and it re-fetches the same row several times.
for _, row in pe_df[pe_df["split"] == "TRAIN"].iterrows():

    question = build_input(row.para_text, row.AC_types)
    answer = build_answer(row.AC_types, row.AR_pairs, row.AR_types)

    data_file_train.append(formatting_fct(instruction, question, answer, mode="train"))

In [29]:
len(data_file_train)

1796

In [30]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. Your task is to classify the argument components as well as to identify and classify argument relations between argument components in the paragraph. For each argument component, its AC type (str) is either "MajorClaim", "Claim" or "Premise". For each argument relation (target AC (int), source AC (int)), its link type (str) is either "Support" or "Attack". You must return two lists in following JSON format: {"list_component_types": [AC type (str), ..., AC type (str)], "list_argument_relations_and_types": [[target AC (int), source AC (int), link type (str)], ..., [target AC (int), source AC (int), link type (str)]]}\n', 'input': '### Here is the paragraph text: Should students be taught to compete or to cooperate ?', 'output': '{"list_component_types": [], "list_argument_relations_and_types": []}'}

{'instruction': '### You are an expert in 

In [31]:
json.loads(data_file_train[2]["output"])["list_argument_relations_and_types"]

[[0, 1, 'Support'], [0, 2, 'Support'], [0, 3, 'Support']]

In [32]:
len(data_file_train)

1796

### Test set

In [33]:
data_file_test = []

# Use the row yielded by iterrows() directly. iterrows() returns index *labels*,
# so passing them to .iloc (positional) is only correct while pe_df keeps its
# default 0..n-1 index, and it re-fetches the same row several times.
for _, row in pe_df[pe_df.split == "TEST"].iterrows():

    question = build_input(row.para_text, row.AC_types)
    answer = build_answer(row.AC_types, row.AR_pairs, row.AR_types)

    # mode="train" makes formatting_fct fill the "output" field, so the test
    # records also carry their answers.
    data_file_test.append(formatting_fct(instruction, question, answer, mode="train"))

In [34]:
len(data_file_train), len(data_file_test)

(1796, 439)

In [35]:
for i in range(3):
    
    print(data_file_test[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. Your task is to classify the argument components as well as to identify and classify argument relations between argument components in the paragraph. For each argument component, its AC type (str) is either "MajorClaim", "Claim" or "Premise". For each argument relation (target AC (int), source AC (int)), its link type (str) is either "Support" or "Attack". You must return two lists in following JSON format: {"list_component_types": [AC type (str), ..., AC type (str)], "list_argument_relations_and_types": [[target AC (int), source AC (int), link type (str)], ..., [target AC (int), source AC (int), link type (str)]]}\n', 'input': '### Here is the paragraph text: Will newspapers become a thing of the past ?', 'output': '{"list_component_types": [], "list_argument_relations_and_types": []}'}

{'instruction': '### You are an expert in Argument M

## Save `json` files

In [36]:
len(data_file_train)

1796

In [37]:
# Write the training records as one JSON array (json.dump of a list).
# The path is built from data_dir, like the loading cells, and the encoding is
# explicit so the file does not depend on the platform default.
file_path = os.path.join(data_dir, "PE_ATC-LI-LTC_paragraph_wo_tags_train.json")

with open(file_path, 'w', encoding="utf-8") as file:

    json.dump(data_file_train, file)

In [38]:
# Write the test records as one JSON array (json.dump of a list).
# The path is built from data_dir, like the loading cells, and the encoding is
# explicit so the file does not depend on the platform default.
file_path = os.path.join(data_dir, "PE_ATC-LI-LTC_paragraph_wo_tags_test.json")

with open(file_path, 'w', encoding="utf-8") as file:

    json.dump(data_file_test, file)

In [39]:
# file_path = os.path.join(os.getcwd(), "datasets/PE_LI_val.json")

# with open(file_path, 'w') as file:
    
#     json.dump(data_file_val, file)