# Prepare dataset (JSON files)

- Prepare PE datasets for llama factory.

- Stance classification


## Libraries

In [1]:
import os
import ast
import json
import random
import pickle
import pandas as pd

In [2]:
random.seed(42)

## Load Data

In [3]:
data_dir = os.path.join(os.getcwd(), "datasets")

In [4]:
pe_df = pd.read_csv(os.path.join(data_dir, "PE_data.csv"))

In [5]:
# pe_df.isna().sum()

In [6]:
# pe_df

### Load train-test-split CSV 

In [7]:
# Official train/test assignment, one row per essay (semicolon-separated file).
# Resolved through `data_dir` like the other dataset file loaded above.
df_split = pd.read_csv(os.path.join(data_dir, "train-test-split.csv"), sep=";")

In [12]:
df_split.head(10)

Unnamed: 0,ID,SET
0,essay001,TRAIN
1,essay002,TRAIN
2,essay003,TRAIN
3,essay004,TEST
4,essay005,TEST
5,essay006,TEST
6,essay007,TRAIN
7,essay008,TRAIN
8,essay009,TRAIN
9,essay010,TRAIN


In [9]:
# Attach the TRAIN/TEST label to each paragraph. `Series.map` with a Series
# argument looks values up by index label, so `essay_id` is matched against
# df_split's row index (presumably row 0 = essay001 = essay_id 0 — TODO confirm).
pe_df['split'] = pe_df['essay_id'].map(df_split['SET'])

# An essay_id missing from df_split's index becomes NaN and would silently drop
# out of both the TRAIN and TEST filters used later, so fail loudly instead.
assert pe_df['split'].notna().all(), "some essay_id values have no TRAIN/TEST label"

In [11]:
pe_df.head(50)

Unnamed: 0,essay_id,para_id,para_types,para_text,adu_spans,ac_spans,ai_spans,AC_types,AR_pairs,AR_types,split
0,0,0,prompt,<prompt> Should students be taught to compete ...,[],[],[],[],[],[],TRAIN
1,0,1,intro,<para-intro> It is always said that competitio...,"[(76, 97)]","[(86, 97)]","[(76, 85)]",['MajorClaim'],[],[],TRAIN
2,0,2,body,"<para-body> First of all , <AC> through cooper...","[(1, 25), (26, 55), (56, 99), (100, 123)]","[(5, 25), (27, 55), (57, 99), (101, 123)]","[(1, 4), (26, 26), (56, 56), (100, 100)]","['Claim', 'Premise', 'Premise', 'Premise']","[(0, 1), (0, 2), (0, 3)]","['Support', 'Support', 'Support']",TRAIN
3,0,3,body,"<para-body> On the other hand , <AC> the signi...","[(1, 22), (24, 37), (39, 63), (76, 139), (155,...","[(6, 22), (30, 37), (41, 63), (77, 139), (156,...","[(1, 5), (24, 29), (39, 40), (76, 76), (155, 1...","['Premise', 'Claim', 'Premise', 'Premise', 'Cl...","[(1, 0), (4, 2), (4, 3)]","['Support', 'Support', 'Support']",TRAIN
4,0,4,conclusion,"<para-conclusion> Consequently , no matter fro...","[(1, 40)]","[(25, 40)]","[(1, 24)]",['MajorClaim'],[],[],TRAIN
5,1,5,prompt,<prompt> More people are migrating to other co...,[],[],[],[],[],[],TRAIN
6,1,6,intro,<para-intro> The last 50 years have seen an in...,"[(54, 77)]","[(59, 77)]","[(54, 58)]",['MajorClaim'],[],[],TRAIN
7,1,7,body,"<para-body> Firstly , <AC> maintaining one ’ s...","[(1, 24), (25, 60), (61, 82), (83, 107), (109,...","[(3, 24), (26, 60), (62, 82), (84, 107), (115,...","[(1, 2), (25, 25), (61, 61), (83, 83), (109, 1...","['Premise', 'Premise', 'Premise', 'Premise', '...","[(4, 0), (4, 1), (4, 2), (4, 3)]","['Support', 'Support', 'Support', 'Support']",TRAIN
8,1,8,body,"<para-body> Secondly , <AC> it is crucial to k...","[(1, 13), (14, 34), (36, 55), (56, 86), (87, 1...","[(3, 13), (15, 34), (39, 55), (57, 86), (88, 1...","[(1, 2), (14, 14), (36, 38), (56, 56), (87, 87...","['Premise', 'Premise', 'Premise', 'Premise', '...","[(5, 0), (0, 1), (5, 2), (5, 3), (5, 4)]","['Support', 'Support', 'Support', 'Support', '...",TRAIN
9,1,9,conclusion,"<para-conclusion> To conclude , although <AC> ...","[(1, 23), (25, 54)]","[(5, 23), (32, 54)]","[(1, 4), (25, 31)]","['Claim', 'MajorClaim']",[],[],TRAIN


In [11]:
def get_ac_count(x):
    """Count the entries listed in a row's ``AC_types`` field.

    Args:
        x: Object exposing an ``AC_types`` attribute that holds the string
            form of a Python literal sequence (e.g. ``"['Claim', 'Premise']"``).

    Returns:
        int: Number of elements in the parsed sequence.
    """
    parsed_types = ast.literal_eval(x.AC_types)
    return len(parsed_types)

In [12]:
# Row-wise application: each row is handed to get_ac_count as-is.
pe_df["AC_count"] = pe_df.apply(get_ac_count, axis=1)

In [13]:
# pe_df

In [14]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

In [15]:
pe_df

Unnamed: 0,essay_id,para_id,para_types,para_text,adu_spans,ac_spans,ai_spans,AC_types,AR_pairs,AR_types,split,AC_count
0,0,0,prompt,<prompt> Should students be taught to compete ...,[],[],[],[],[],[],TRAIN,0
1,0,1,intro,<para-intro> It is always said that competitio...,"[(76, 97)]","[(86, 97)]","[(76, 85)]",['MajorClaim'],[],[],TRAIN,1
2,0,2,body,"<para-body> First of all , <AC> through cooper...","[(1, 25), (26, 55), (56, 99), (100, 123)]","[(5, 25), (27, 55), (57, 99), (101, 123)]","[(1, 4), (26, 26), (56, 56), (100, 100)]","['Claim', 'Premise', 'Premise', 'Premise']","[(0, 1), (0, 2), (0, 3)]","['Support', 'Support', 'Support']",TRAIN,4
3,0,3,body,"<para-body> On the other hand , <AC> the signi...","[(1, 22), (24, 37), (39, 63), (76, 139), (155,...","[(6, 22), (30, 37), (41, 63), (77, 139), (156,...","[(1, 5), (24, 29), (39, 40), (76, 76), (155, 1...","['Premise', 'Claim', 'Premise', 'Premise', 'Cl...","[(1, 0), (4, 2), (4, 3)]","['Support', 'Support', 'Support']",TRAIN,5
4,0,4,conclusion,"<para-conclusion> Consequently , no matter fro...","[(1, 40)]","[(25, 40)]","[(1, 24)]",['MajorClaim'],[],[],TRAIN,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2230,401,2230,prompt,<prompt> A greater proportion of the budget sh...,[],[],[],[],[],[],TRAIN,0
2231,401,2231,intro,"<para-intro> In today ' s world , the concept ...","[(26, 51)]","[(33, 51)]","[(26, 32)]",['MajorClaim'],[],[],TRAIN,1
2232,401,2232,body,<para-body> <AC> The first reason why educatio...,"[(0, 22), (24, 56), (58, 71), (72, 98)]","[(1, 22), (27, 56), (60, 71), (74, 98)]","[(0, 0), (24, 26), (58, 59), (72, 73)]","['Premise', 'Premise', 'Claim', 'Premise']","[(2, 0), (2, 1), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4
2233,401,2233,body,<para-body> The second reason why <AC> governm...,"[(1, 25), (27, 53), (55, 88), (90, 128)]","[(5, 25), (31, 53), (57, 88), (93, 128)]","[(1, 4), (27, 30), (55, 56), (90, 92)]","['Claim', 'Premise', 'Premise', 'Premise']","[(2, 1), (0, 2), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4


In [16]:
def get_ar_pair_count(x):
    """Count the entries listed in a row's ``AR_pairs`` field.

    Args:
        x: Object exposing an ``AR_pairs`` attribute that holds the string
            form of a Python literal sequence (e.g. ``"[(0, 1), (0, 2)]"``).

    Returns:
        int: Number of elements in the parsed sequence.
    """
    parsed_pairs = ast.literal_eval(x.AR_pairs)
    return len(parsed_pairs)

In [17]:
# Row-wise application: each row is handed to get_ar_pair_count as-is.
pe_df["AR_count"] = pe_df.apply(get_ar_pair_count, axis=1)

In [18]:
# Remove paragraphs with 0 ACs

In [19]:
#pe_df = pe_df[pe_df.AC_count > 0].reset_index() 

In [20]:
pe_df

Unnamed: 0,essay_id,para_id,para_types,para_text,adu_spans,ac_spans,ai_spans,AC_types,AR_pairs,AR_types,split,AC_count,AR_count
0,0,0,prompt,<prompt> Should students be taught to compete ...,[],[],[],[],[],[],TRAIN,0,0
1,0,1,intro,<para-intro> It is always said that competitio...,"[(76, 97)]","[(86, 97)]","[(76, 85)]",['MajorClaim'],[],[],TRAIN,1,0
2,0,2,body,"<para-body> First of all , <AC> through cooper...","[(1, 25), (26, 55), (56, 99), (100, 123)]","[(5, 25), (27, 55), (57, 99), (101, 123)]","[(1, 4), (26, 26), (56, 56), (100, 100)]","['Claim', 'Premise', 'Premise', 'Premise']","[(0, 1), (0, 2), (0, 3)]","['Support', 'Support', 'Support']",TRAIN,4,3
3,0,3,body,"<para-body> On the other hand , <AC> the signi...","[(1, 22), (24, 37), (39, 63), (76, 139), (155,...","[(6, 22), (30, 37), (41, 63), (77, 139), (156,...","[(1, 5), (24, 29), (39, 40), (76, 76), (155, 1...","['Premise', 'Claim', 'Premise', 'Premise', 'Cl...","[(1, 0), (4, 2), (4, 3)]","['Support', 'Support', 'Support']",TRAIN,5,3
4,0,4,conclusion,"<para-conclusion> Consequently , no matter fro...","[(1, 40)]","[(25, 40)]","[(1, 24)]",['MajorClaim'],[],[],TRAIN,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230,401,2230,prompt,<prompt> A greater proportion of the budget sh...,[],[],[],[],[],[],TRAIN,0,0
2231,401,2231,intro,"<para-intro> In today ' s world , the concept ...","[(26, 51)]","[(33, 51)]","[(26, 32)]",['MajorClaim'],[],[],TRAIN,1,0
2232,401,2232,body,<para-body> <AC> The first reason why educatio...,"[(0, 22), (24, 56), (58, 71), (72, 98)]","[(1, 22), (27, 56), (60, 71), (74, 98)]","[(0, 0), (24, 26), (58, 59), (72, 73)]","['Premise', 'Premise', 'Claim', 'Premise']","[(2, 0), (2, 1), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4,3
2233,401,2233,body,<para-body> The second reason why <AC> governm...,"[(1, 25), (27, 53), (55, 88), (90, 128)]","[(5, 25), (31, 53), (57, 88), (93, 128)]","[(1, 4), (27, 30), (55, 56), (90, 92)]","['Claim', 'Premise', 'Premise', 'Premise']","[(2, 1), (0, 2), (2, 3)]","['Support', 'Support', 'Support']",TRAIN,4,3


In [21]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

## Prepare prompt

In [22]:
def formatting_fct(instruction="", input="", output="", mode="train"):
    """Assemble one instruction/input/output record.

    Args:
        instruction: Task description placed under the ``"instruction"`` key.
        input: Example-specific text placed under the ``"input"`` key.
        output: Expected answer; kept only when ``mode`` is ``"train"``.
        mode: ``"train"`` keeps ``output``; any other value blanks it.

    Returns:
        dict: Keys ``"instruction"``, ``"input"`` and ``"output"``, each
        holding the string-formatted value.
    """
    # Only training-mode records carry the expected answer.
    if mode == 'train':
        target = output
    else:
        target = ''

    return dict(
        instruction=f"{instruction}",
        input=f"{input}",
        output=f"{target}",
    )

In [23]:
# Task description reused verbatim as the "instruction" field of every example.
# The closing triple quote sits on its own line, so the string ends with "\n".
# NOTE(review): build_input emits numbered tags (<AC0> ... </AC0>), whereas this
# text mentions plain <AC></AC> tags — confirm the wording is intended.
instruction = """### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a list of pairs of related argument components in the form: [(target AC (int), source AC (int)), (target AC (int), source AC (int)), ..., (target AC (int), source AC (int))]. Your task is to classify each pair of related argument components in the list as either "Support" or "Attack". You must return a list of relation types in following JSON format: {"relation_types": [relation_type (str), relation_type (str), ..., relation_type (str)]}
"""

In [24]:
print(instruction)

### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a list of pairs of related argument components in the form: [(target AC (int), source AC (int)), (target AC (int), source AC (int)), ..., (target AC (int), source AC (int))]. Your task is to classify each pair of related argument components in the list as either "Support" or "Attack". You must return a list of relation types in following JSON format: {"relation_types": [relation_type (str), relation_type (str), ..., relation_type (str)]}



In [25]:
def replace_substring_with_position(main_string, substring, ac_types):
    """Number every occurrence of a tag by its order of appearance.

    Each non-overlapping occurrence of ``substring`` (scanned left to right)
    is replaced by ``substring`` with its last character swapped for
    ``"<k>>"``, where ``k`` is the 0-based occurrence index. For example
    ``"<AC>"`` becomes ``"<AC0>"``, ``"<AC1>"``, ... and ``"</AC>"`` becomes
    ``"</AC0>"``, ``"</AC1>"``, ...

    Args:
        main_string: Text to scan.
        substring: Tag to number; must be non-empty.
        ac_types: Unused. Kept so existing call sites stay valid; the
            replacement no longer embeds the component type, so its length
            does not have to match the number of tags.

    Returns:
        str: ``main_string`` with every occurrence numbered; unchanged when
        ``substring`` does not occur.

    Raises:
        ValueError: If ``substring`` is empty (it could never advance the scan).
    """
    if not substring:
        raise ValueError("substring must be a non-empty tag")

    # Everything but the closing character, e.g. "<AC" or "</AC".
    tag_prefix = substring[:-1]

    # str.split cuts at the same non-overlapping, left-to-right matches a
    # manual find() loop would visit; piece k+1 follows occurrence k.
    pieces = main_string.split(substring)

    numbered = [pieces[0]]
    for position, tail in enumerate(pieces[1:]):
        numbered.append(f"{tag_prefix}{position}>")
        numbered.append(tail)

    return "".join(numbered)


In [26]:
def build_input(paragraph, ac_types, ar_pairs):
    """Build the ``input`` text of one example.

    The prompt/paragraph wrapper markers are stripped from ``paragraph``, the
    ``<AC>`` / ``</AC>`` tags are numbered via
    ``replace_substring_with_position``, and the result is combined with
    ``ar_pairs`` into the question text.

    Args:
        paragraph: Paragraph text containing wrapper markers and AC tags.
        ac_types: String form of a Python literal list of component types.
        ar_pairs: Related-component pairs; interpolated into the text as-is.

    Returns:
        str: The formatted question.
    """
    # Applied strictly in this order: prompt markers are first renamed to topic
    # markers, then topic and paragraph markers are removed. Drop the removal
    # entries from this table to keep the corresponding tags in the text.
    marker_rewrites = (
        ("<prompt> ", "<topic> "),
        (" </prompt>", " </topic>"),
        ("<topic> ", ""),
        (" </topic>", ""),
        ("<para-intro> ", ""),
        (" </para-intro>", ""),
        ("<para-body> ", ""),
        (" </para-body>", ""),
        ("<para-conclusion> ", ""),
        (" </para-conclusion>", ""),
    )
    for old_marker, new_marker in marker_rewrites:
        paragraph = paragraph.replace(old_marker, new_marker)

    # Number opening tags first, then closing tags.
    component_types = ast.literal_eval(ac_types)
    for tag in ("<AC>", "</AC>"):
        paragraph = replace_substring_with_position(paragraph, tag, component_types)

    return f"""### Here is the paragraph text: {paragraph}\n###Here is the list of pairs of related argument components in this paragraph: {ar_pairs}"""

In [27]:
def build_answer(ar_types):
    """Serialise the relation types of one example as a JSON string.

    Args:
        ar_types: String form of a Python literal list of relation types
            (e.g. ``"['Support', 'Attack']"``).

    Returns:
        str: JSON object with the parsed list under ``"relation_types"``.
    """
    relation_labels = ast.literal_eval(ar_types)
    payload = {"relation_types": relation_labels}
    return json.dumps(payload)

In [28]:
# print(formatting_fct(instruction, question, answer, mode="train"))

In [29]:
pe_df.split.value_counts()

split
TRAIN    1796
TEST      439
Name: count, dtype: int64

## Prepare data files

### Train set

In [30]:
data_file_train = []

# Use the row yielded by iterrows() directly. The index value it yields is a
# *label*; feeding it to the positional `.iloc` only works while pe_df has a
# pristine 0..n-1 index and would pick wrong rows after any filtering.
for _, row in pe_df[pe_df["split"] == "TRAIN"].iterrows():

    question = build_input(row["para_text"], row["AC_types"], row["AR_pairs"])
    answer = build_answer(row["AR_types"])

    data_file_train.append( formatting_fct(instruction, question, answer, mode="train") )

In [31]:
len(data_file_train)

1796

In [32]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a list of pairs of related argument components in the form: [(target AC (int), source AC (int)), (target AC (int), source AC (int)), ..., (target AC (int), source AC (int))]. Your task is to classify each pair of related argument components in the list as either "Support" or "Attack". You must return a list of relation types in following JSON format: {"relation_types": [relation_type (str), relation_type (str), ..., relation_type (str)]}\n', 'input': '### Here is the paragraph text: Should students be taught to compete or to cooperate ?\n###Here is the list of pairs of related argument components in this paragraph: []', 'output': '{"relation_types": []}'}

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a

In [33]:
print(data_file_train[674])

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a list of pairs of related argument components in the form: [(target AC (int), source AC (int)), (target AC (int), source AC (int)), ..., (target AC (int), source AC (int))]. Your task is to classify each pair of related argument components in the list as either "Support" or "Attack". You must return a list of relation types in following JSON format: {"relation_types": [relation_type (str), relation_type (str), ..., relation_type (str)]}\n', 'input': '### Here is the paragraph text: In short , I can say for certain that <AC0> spending money on improving public transportation is necessary </AC0> because <AC1> it is the key to protect our world , creating a healthy and safe environment for people to live in </AC1> .\n###Here is the list of pairs of related argument components in this paragraph: []', 'output': '{"relation_ty

In [34]:
print(data_file_train[2]["output"])

{"relation_types": ["Support", "Support", "Support"]}


In [35]:
len(data_file_train)

1796

In [36]:
type(json.loads(data_file_train[2]["output"])["relation_types"])

list

### Validation set (optional)

In [37]:
# val_length = int(1417/10)

In [38]:
# val_length

In [39]:
# data_file_val = random.sample(data_file_train, val_length)

In [40]:
# len(data_file_val)

In [41]:
# data_file_train = [message for message in data_file_train if message not in data_file_val]

In [42]:
# len(data_file_train)

In [43]:
# data_file_val = []

# for i, _ in df[df["essay_file"].isin(val_essays_l)].iterrows():
    
#     question = build_question(df.iloc[i].argument_component)
#     answer = build_answer(df.iloc[i].label)
    
#     data_file_val.append( formatting_fct(my_task_description, question, answer, mode="train") )

In [44]:
#len(data_file_val)

In [45]:
# for i in range(3):
    
#     print(data_file_val[i])
#     print()

### Test set

In [46]:
data_file_test = []

# Use the row yielded by iterrows() directly. The index value it yields is a
# *label*; feeding it to the positional `.iloc` only works while pe_df has a
# pristine 0..n-1 index and would pick wrong rows after any filtering.
for _, row in pe_df[pe_df["split"] == "TEST"].iterrows():

    question = build_input(row["para_text"], row["AC_types"], row["AR_pairs"])
    answer = build_answer(row["AR_types"])

    # mode="train" keeps the gold answer in the test records as well
    # (presumably needed as the reference at evaluation time — confirm).
    data_file_test.append( formatting_fct(instruction, question, answer, mode="train") )

In [47]:
len(data_file_test)

439

In [48]:
for i in range(3):
    
    print(data_file_test[i])
    print()

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a list of pairs of related argument components in the form: [(target AC (int), source AC (int)), (target AC (int), source AC (int)), ..., (target AC (int), source AC (int))]. Your task is to classify each pair of related argument components in the list as either "Support" or "Attack". You must return a list of relation types in following JSON format: {"relation_types": [relation_type (str), relation_type (str), ..., relation_type (str)]}\n', 'input': '### Here is the paragraph text: Will newspapers become a thing of the past ?\n###Here is the list of pairs of related argument components in this paragraph: []', 'output': '{"relation_types": []}'}

{'instruction': '### You are an expert in Argument Mining. You are given a paragraph which contains argument components enclosed by <AC></AC> tags. You are also given a list of p

## Save `json` files

In [49]:
# Write all training examples as one JSON array.
file_path = os.path.join(os.getcwd(), "datasets/PE_LTC_paragraph_wo_tags_train.json")

with open(file_path, mode='w') as out_file:
    json.dump(data_file_train, out_file)

In [50]:
# Write all test examples as one JSON array.
file_path = os.path.join(os.getcwd(), "datasets/PE_LTC_paragraph_wo_tags_test.json")

with open(file_path, mode='w') as out_file:
    json.dump(data_file_test, out_file)

In [51]:
# file_path = os.path.join(os.getcwd(), "datasets/PE_LI_val.json")

# with open(file_path, 'w') as file:
    
#     json.dump(data_file_val, file)