In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm

In [64]:
# Input locations: brat annotation files (.ann) and the matching essay text files (.txt).
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
ann_files_directory, text_files_directory = (
    "/Utilisateurs/umushtaq/am_reasoning/raw_files/ann_files",
    "/Utilisateurs/umushtaq/am_reasoning/raw_files/txt_files",
)

In [65]:
# Row accumulators, one per record kind; each is turned into a DataFrame further down.
components_data, relations_data, stances_data, essays_data = [], [], [], []

In [66]:
# Collect one row per "T" (component) line found in the annotation files.
# Start from an empty list so that re-running this cell does not append duplicate rows.
components_data = []

# Keep only annotation files (skips strays such as ".DS_Store") and sort them,
# because os.listdir() returns entries in arbitrary order.
ann_filenames = sorted(name for name in os.listdir(ann_files_directory) if name.endswith(".ann"))

for filename in tqdm(ann_filenames, desc="Parsing annotation files for components ..."):

    print("processing: " + filename)
    file_path = os.path.join(ann_files_directory, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Only "T" lines are handled here; other line kinds are parsed in separate cells.
            if not line.startswith("T"):
                continue
            # Parsed as: "<id>\t<type> <start> <end>\t<text>"
            parts = line.strip().split("\t")
            t_id = parts[0]  # e.g. T1
            # Split the middle field once: type (e.g. MajorClaim, Claim, Premise) and both bounds.
            t_type, t_s_bound, t_e_bound = parts[1].split(" ")[:3]
            text = parts[2]
            # The leading "T" tag is kept because a later cell drops the first column by position.
            components_data.append(["T", filename, t_id, t_type, t_s_bound, t_e_bound, text])

Parsing annotation files for components ...: 100%|██████████| 402/402 [00:00<00:00, 2438.34it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [67]:
# Collect one row per "R" (relation) line found in the annotation files.
# Start from an empty list so that re-running this cell does not append duplicate rows.
relations_data = []

# Keep only annotation files (skips strays such as ".DS_Store") and sort them,
# because os.listdir() returns entries in arbitrary order.
ann_filenames = sorted(name for name in os.listdir(ann_files_directory) if name.endswith(".ann"))

for filename in tqdm(ann_filenames, desc="Parsing annotation files for relations ... "):

    print("processing: " + filename)
    file_path = os.path.join(ann_files_directory, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Only "R" lines are handled here; other line kinds are parsed in separate cells.
            if not line.startswith("R"):
                continue
            # Whitespace-split, parsed as: "<id> <relation> <name>:<source id> <name>:<target id>"
            parts = line.strip().split()
            relation_type = parts[1]  # "supports" or "attacks"
            arg1 = parts[2].split(":")[1]  # id after the colon of the first argument
            arg2 = parts[3].split(":")[1]  # id after the colon of the second argument
            # The leading "R" tag is kept because a later cell drops the first column by position.
            relations_data.append(["R", filename, arg1, arg2, relation_type])

Parsing annotation files for relations ... : 100%|██████████| 402/402 [00:00<00:00, 3861.86it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [68]:
# Collect one row per "A" (stance attribute) line found in the annotation files.
# Start from an empty list so that re-running this cell does not append duplicate rows.
stances_data = []

# Keep only annotation files (skips strays such as ".DS_Store") and sort them,
# because os.listdir() returns entries in arbitrary order.
ann_filenames = sorted(name for name in os.listdir(ann_files_directory) if name.endswith(".ann"))

for filename in tqdm(ann_filenames, desc="Parsing annotation files for stances ... "):

    print("processing: " + filename)
    file_path = os.path.join(ann_files_directory, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Only "A" lines are handled here; other line kinds are parsed in separate cells.
            if not line.startswith("A"):
                continue
            # The stance value is the last whitespace-separated token of the line.
            # NOTE(review): the id of the annotated component is not stored, so stances are
            # only tied to components by their order in the file - confirm this is intended.
            stance_value = line.strip().split()[-1]
            # The leading "A" tag is kept because a later cell drops the first column by position.
            stances_data.append(["A", filename, stance_value])

Parsing annotation files for stances ... : 100%|██████████| 402/402 [00:00<00:00, 3482.36it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [69]:
# Load every essay text file as [annotation file name, title, body].
# Start from an empty list so that re-running this cell does not append duplicate rows.
essays_data = []

# Keep only text files (skips strays such as ".DS_Store") and sort them,
# because os.listdir() returns entries in arbitrary order.
txt_filenames = sorted(name for name in os.listdir(text_files_directory) if name.endswith(".txt"))

for filename in tqdm(txt_filenames, desc="Parsing text files for esssays ... "):

    print("processing: " + filename)
    file_path = os.path.join(text_files_directory, filename)
    # Essays are keyed by the annotation file name so they can be merged on "file_name" later.
    filename_ann = filename.replace(".txt", ".ann")

    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # First line is the title, everything after it is the body.
    # An empty file yields empty strings instead of raising IndexError.
    essay_title = lines[0].strip() if lines else ""
    essay_text = "".join(lines[1:]).strip()

    essays_data.append([filename_ann, essay_title, essay_text])

Parsing text files for esssays ... : 100%|██████████| 402/402 [00:00<00:00, 3904.13it/s]

processing: essay001.txt
processing: essay002.txt
processing: essay003.txt
processing: essay004.txt
processing: essay005.txt
processing: essay006.txt
processing: essay007.txt
processing: essay008.txt
processing: essay009.txt
processing: essay010.txt
processing: essay011.txt
processing: essay012.txt
processing: essay013.txt
processing: essay014.txt
processing: essay015.txt
processing: essay016.txt
processing: essay017.txt
processing: essay018.txt
processing: essay019.txt
processing: essay020.txt
processing: essay021.txt
processing: essay022.txt
processing: essay023.txt
processing: essay024.txt
processing: essay025.txt
processing: essay026.txt
processing: essay027.txt
processing: essay028.txt
processing: essay029.txt
processing: essay030.txt
processing: essay031.txt
processing: essay032.txt
processing: essay033.txt
processing: essay034.txt
processing: essay035.txt
processing: essay036.txt
processing: essay037.txt
processing: essay038.txt
processing: essay039.txt
processing: essay040.txt





In [70]:
# Turn each list of parsed rows into a DataFrame (columns are still positional: 0, 1, 2, ...).
components_df, relations_df, stances_df, essays_df = (
    pd.DataFrame(records)
    for records in (components_data, relations_data, stances_data, essays_data)
)

In [71]:
essays_df

Unnamed: 0,0,1,2
0,essay001.ann,Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...
397,essay398.ann,We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


In [72]:
# Name the component columns. The first column only holds the constant "T" line tag, so it is dropped.
# Guarded so that re-running this cell is a no-op instead of dropping a real column
# and then failing on the column-count mismatch.
component_columns = ["file_name", "argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"]
if list(components_df.columns) != component_columns:
    components_df = components_df.drop(columns=components_df.columns[0])
    components_df.columns = component_columns

In [73]:
# Name the relation columns. The first column only holds the constant "R" line tag, so it is dropped.
# Guarded so that re-running this cell is a no-op instead of dropping a real column
# and then failing on the column-count mismatch.
relation_columns = ["file_name", "source_arg", "target_arg", "relation_type"]
if list(relations_df.columns) != relation_columns:
    relations_df = relations_df.drop(columns=relations_df.columns[0])
    relations_df.columns = relation_columns

In [76]:
# Name the stance columns. The first column only holds the constant "A" line tag, so it is dropped.
# Guarded so that re-running this cell is a no-op instead of dropping a real column
# and then failing on the column-count mismatch.
stance_columns = ["file_name", "stance_type"]
if list(stances_df.columns) != stance_columns:
    stances_df = stances_df.drop(columns=stances_df.columns[0])
    stances_df.columns = stance_columns

In [77]:
# Essay rows were built without a leading line-tag column, so the three columns are only named here.
essays_df.columns = ["file_name", "essay_title", "essay_text"]

### Group and Merge

In [78]:
# One row per annotation file; every component column is collapsed into a list.
# sort=False keeps the files in their order of first appearance.
component_list_columns = ["argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"]
comp_grouped_df = (
    components_df
    .groupby(["file_name"], sort=False)
    .agg(dict.fromkeys(component_list_columns, list))
    .reset_index()
)

In [79]:
# One row per annotation file; every relation column is collapsed into a list.
# sort=False keeps the files in their order of first appearance.
relation_list_columns = ["source_arg", "target_arg", "relation_type"]
rels_grouped_df = (
    relations_df
    .groupby(["file_name"], sort=False)
    .agg(dict.fromkeys(relation_list_columns, list))
    .reset_index()
)

In [80]:
# One row per annotation file; the stance values are collapsed into a list.
# sort=False keeps the files in their order of first appearance.
stances_by_file = stances_df.groupby(["file_name"], sort=False)
stance_grouped_df = stances_by_file.agg({"stance_type": list}).reset_index()

In [81]:
# Join components, relations, stances and essay texts on the annotation file name.
# merge() defaults to an inner join, so only files present in all four frames are kept.
pe_df = comp_grouped_df
for right_df in (rels_grouped_df, stance_grouped_df, essays_df):
    pe_df = pe_df.merge(right_df, on="file_name")

In [82]:
# Rename by column name instead of by position. This yields the same names as before,
# but does not depend on the merge order and does not fail with a length mismatch
# when the cell is re-run after later cells have added columns.
# "essay_title" and "essay_text" keep their names.
pe_df = pe_df.rename(columns={
    'file_name': 'ann_file_name',
    'argument_id': 'argument_ids',
    'argument_type': 'argument_types',
    'arg_bound_1': 'arg_start_bounds',
    'arg_bound2': 'arg_end_bounds',
    'argument_component': 'argument_components',
    'source_arg': 'source_args',
    'target_arg': 'target_args',
    'relation_type': 'relation_types',
    'stance_type': 'stance_types',
})

In [83]:
pe_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,stance_types,essay_title,essay_text
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...","[For, Against, For]",Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...","[For, For, Against]",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...","[For, For]",International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...","[Against, For, For]",International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...","[For, For, For, Against]",Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...","[For, For, For, For]",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","[For, For, For]","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...","[For, For]",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...","[For, For]",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


In [84]:
def claim_sanity(row):
    """Return 1 when the row has exactly as many "Claim" components as stance values, else 0.

    Reads ``row.argument_types`` (iterable of type labels) and ``row.stance_types``
    (sized collection of stance values).
    """
    claim_count = sum(1 for kind in row.argument_types if kind == "Claim")
    if claim_count == len(row.stance_types):
        return 1
    return 0

In [85]:
# Flag (1/0) whether each essay has one stance per "Claim" component.
pe_df['sanity_claims'] = pe_df.apply(claim_sanity, axis=1)

In [87]:
pe_df['sanity_claims'].value_counts()

sanity_claims
1    402
Name: count, dtype: int64

In [88]:
### Sanity Checks

In [89]:
def is_ordered(row):
    """Tell whether the numeric parts of ``row.argument_ids`` are in non-decreasing order.

    The first run of digits of every id (e.g. 12 for "T12") is compared with its successor.
    """
    numbers = []
    for arg_id in row.argument_ids:
        match = re.search(r'\d+', arg_id)
        numbers.append(int(match.group()))  # type: ignore
    return all(prev <= nxt for prev, nxt in zip(numbers, numbers[1:]))

In [90]:
# True when the component ids of an essay appear in ascending numeric order.
pe_df['sanity_check_1'] = pe_df.apply(is_ordered, axis=1)

In [91]:
pe_df['sanity_check_1'].value_counts()

sanity_check_1
True     354
False     48
Name: count, dtype: int64

In [92]:
# Essays whose component ids are not listed in ascending order.
pe_df[~pe_df['sanity_check_1']]

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,stance_types,essay_title,essay_text,sanity_claims,sanity_check_1
9,essay010.ann,"[T3, T4, T5, T1, T2, T6, T7, T8, T9, T10, T11,...","[MajorClaim, Claim, Premise, Claim, Premise, P...","[328, 484, 399, 1244, 1086, 637, 732, 822, 947...","[397, 578, 473, 1353, 1231, 713, 820, 945, 107...","[governments spend more money on buses, trains...","[T5, T2, T8, T6, T7, T13, T11, T10, T14]","[T4, T1, T9, T9, T9, T12, T12, T12, T12]","[supports, supports, supports, supports, suppo...","[For, For, For, Against, For, For]",Should governments spend more money on improvi...,Nowadays transportation has attracted much soc...,1,False
11,essay012.ann,"[T3, T4, T5, T6, T7, T1, T2, T8, T9, T10, T11,...","[Claim, Premise, Premise, Premise, Premise, Cl...","[390, 469, 702, 899, 961, 1151, 1845, 1901, 10...","[467, 696, 897, 959, 1035, 1247, 1899, 1980, 1...",[email can be count as one of the most benefic...,"[T4, T5, T9, T7, T6, T13, T12, T11, T10]","[T3, T3, T1, T1, T1, T14, T14, T14, T14]","[supports, supports, supports, supports, suppo...","[For, For, For, For]",Advance in transportation and communication li...,"Nowadays, everyone can see the effect of techn...",1,False
13,essay014.ann,"[T1, T2, T3, T4, T5, T7, T8, T9, T10, T11, T12...","[Claim, Premise, Premise, Premise, Premise, Ma...","[419, 522, 611, 733, 811, 1519, 1607, 1137, 12...","[504, 609, 731, 790, 929, 1585, 1679, 1252, 12...",[students at schools and universities still le...,"[T2, T3, T4, T5, T12, T13, T10, T14, T9, T11]","[T1, T1, T1, T1, T11, T11, T9, T6, T6, T6]","[supports, supports, supports, supports, suppo...","[For, For, Against]",Lessons with teachers versus others sources,Over the last half century the change in the l...,1,False
15,essay016.ann,"[T1, T2, T3, T5, T4, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, MajorClaim, Claim, Premise...","[1530, 1579, 320, 800, 540, 591, 705, 885, 949...","[1577, 1639, 447, 876, 581, 703, 782, 947, 101...",[there is some disadvantages of animals' profi...,"[T6, T4, T7, T11, T9, T8, T13, T12]","[T4, T5, T5, T10, T8, T10, T10, T10]","[supports, supports, supports, supports, attac...","[Against, Against, For]",Using animals for the benefit of the human beings,With the rapid development of the standard of ...,1,False
22,essay023.ann,"[T3, T1, T2, T4, T5, T6, T7, T9, T10, T11, T12...","[MajorClaim, MajorClaim, Claim, Claim, Premise...","[216, 1831, 1959, 410, 541, 686, 853, 1002, 10...","[302, 1948, 2014, 497, 669, 837, 967, 1047, 12...",[I do not think these disadvantages will outwe...,"[T5, T6, T7, T10, T12, T13, T15]","[T4, T4, T4, T9, T11, T11, T14]","[supports, supports, attacks, supports, suppor...","[Against, Against, Against, Against, For]",Effects of mobile phones,"Nowadays, the popularity of mobile phones has ...",1,False
25,essay026.ann,"[T3, T4, T5, T6, T7, T8, T9, T10, T1, T2, T11,...","[MajorClaim, Claim, Claim, Premise, Premise, P...","[193, 104, 911, 501, 683, 752, 979, 1137, 1745...","[383, 183, 977, 676, 736, 903, 1041, 1295, 185...",[it is obvious that prepared food can bring ab...,"[T8, T7, T6, T10, T12, T13]","[T7, T5, T5, T9, T11, T11]","[supports, supports, supports, supports, suppo...","[Against, For, For, Against, For]",Prepared Food,"Nowadays, more and more people begin to select...",1,False
29,essay030.ann,"[T12, T13, T14, T15, T1, T2, T3, T4, T5, T6, T7]","[Claim, Premise, MajorClaim, Claim, MajorClaim...","[1123, 1236, 1342, 1400, 148, 318, 421, 572, 7...","[1232, 1301, 1391, 1558, 316, 400, 570, 720, 8...",[this will encourage all the parents to think ...,"[T13, T4, T5, T7]","[T12, T3, T6, T6]","[supports, supports, supports, supports]","[For, Against, For, For, For]",Government and education,Primary and secondary education provide a basi...,1,False
40,essay041.ann,"[T2, T3, T4, T5, T6, T1, T7, T8, T9]","[Claim, Premise, Premise, Premise, Premise, Cl...","[388, 649, 508, 791, 882, 1295, 1213, 966, 1396]","[506, 772, 634, 880, 951, 1383, 1282, 1202, 1677]",[students can learn invaluable experiences thr...,"[T4, T8, T7, T3, T5, T6]","[T3, T1, T1, T5, T2, T1]","[supports, supports, supports, supports, suppo...","[For, For]",Benefits of students' unpaid work,The purpose of this essay is to discuss what c...,1,False
45,essay046.ann,"[T3, T4, T5, T6, T2, T7, T8, T9]","[Claim, Premise, Premise, Premise, MajorClaim,...","[706, 827, 955, 909, 1683, 1542, 1228, 1315]","[795, 904, 1207, 953, 1778, 1666, 1313, 1488]","[A university, in common sense, is a medium fo...","[T6, T9, T4, T5, T8]","[T4, T7, T3, T3, T7]","[supports, supports, supports, supports, suppo...","[For, For]",True function of a university,Universities have quickly become the places to...,1,False
58,essay059.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, MajorClaim, Claim, Cl...","[252, 1667, 1757, 1871, 589, 452, 318, 386, 84...","[316, 1733, 1869, 1954, 652, 581, 384, 450, 90...","[strict laws, like capital punishment, are ess...","[T6, T8, T7, T10, T11, T12, T16, T14, T18, T19...","[T5, T5, T5, T9, T9, T13, T15, T15, T17, T17, ...","[supports, supports, supports, supports, suppo...","[For, For, For, For, Against]",Capital punishment - necessary or not,Crime is always punishable. Retribution is the...,1,False


In [93]:
def get_source_acs(row):
    """Return the component texts of the relation sources, in relation order.

    Each id in ``row.source_args`` is located in ``row.argument_ids`` and the entry of
    ``row.argument_components`` at the same position is taken.
    Raises ValueError when an id is not present in ``row.argument_ids``.
    """
    known_ids = row.argument_ids
    texts = row.argument_components
    return [texts[known_ids.index(source_id)] for source_id in row.source_args]

In [94]:
get_source_acs(pe_df.iloc[0])

['What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others',
 'During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred',
 'All of these skills help them to get on well with other people and will benefit them for the whole life',
 'Take Olympic games which is a form of competition for instance, it is hard to imagine how an athlete could win the game without the training of his or her coach, and the help of other professional staffs such as the people who take care of his diet, and those who are in charge of the medical care',
 'when we consider about the question that how to win the game, we always find that we need the cooperation',
 'the significance of competition is that how to become more excellence to gain the victory']

In [95]:
def get_target_acs(row):
    """Return the component texts of the relation targets, in relation order.

    Each id in ``row.target_args`` is located in ``row.argument_ids`` and the entry of
    ``row.argument_components`` at the same position is taken.
    Raises ValueError when an id is not present in ``row.argument_ids``.
    """
    known_ids = row.argument_ids
    texts = row.argument_components
    return [texts[known_ids.index(target_id)] for target_id in row.target_args]

In [96]:
get_target_acs(pe_df.iloc[0])

['through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
 'through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
 'through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
 'without the cooperation, there would be no victory of competition',
 'without the cooperation, there would be no victory of competition',
 'competition makes the society more effective']

In [97]:
pe_df.iloc[0]

ann_file_name                                               essay001.ann
argument_ids              [T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]
argument_types         [MajorClaim, MajorClaim, Claim, Premise, Premi...
arg_start_bounds       [503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...
arg_end_bounds         [575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...
argument_components    [we should attach more importance to cooperati...
source_args                                    [T4, T5, T6, T10, T9, T8]
target_args                                   [T3, T3, T3, T11, T11, T7]
relation_types         [supports, supports, supports, supports, suppo...
stance_types                                         [For, Against, For]
essay_title            Should students be taught to compete or to coo...
essay_text             It is always said that competition can effecti...
sanity_claims                                                          1
sanity_check_1                                     

In [98]:
pe_df.iloc[0]['argument_components']

['we should attach more importance to cooperation during primary education',
 "a more cooperative attitudes towards life is more profitable in one's success",
 'through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
 'What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others',
 'During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred',
 'All of these skills help them to get on well with other people and will benefit them for the whole life',
 'competition makes the society more effective',
 'the significance of competition is that how to become more excellence to gain the victory',
 'when we consider about the question that how to win the game, we always find that 

In [99]:
# Text of the source component of every relation, per essay.
pe_df['source_acs'] = pe_df.apply(get_source_acs, axis=1)

In [100]:
# Text of the target component of every relation, per essay.
pe_df['target_acs'] = pe_df.apply(get_target_acs, axis=1)

In [101]:
pe_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,stance_types,essay_title,essay_text,sanity_claims,sanity_check_1,source_acs,target_acs
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...","[For, Against, For]",Should students be taught to compete or to coo...,It is always said that competition can effecti...,1,True,[What we acquired from team work is not only h...,"[through cooperation, children can learn about..."
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...","[For, For, Against]",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...,1,True,[they need a connection back to their country ...,"[it is crucial to keep one’s identity, keeping..."
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...","[For, For]",International tourism is now more common than ...,The last decade has seen an increasing number ...,1,True,"[Without this support and profit from tourism,...",[tourism has survived many non-tangible cultur...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...","[Against, For, For]",International tourism is now more common than ...,The last 50 years have seen a significant incr...,1,True,[This was due to the lack of adequate controls...,[international tourism can create negative imp...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...","[For, For, For, Against]",Living and studying overseas,It is every student's desire to study at a goo...,1,True,[Compared to the peers studying in the home co...,[studying at an overseas university gives indi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...","[For, For, For, For]",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...,1,True,[Most of male students tend to use their left ...,[boys and girls have the diversity in psycholo...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","[For, For, For]","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua...",1,True,[The first impression of the celebrities seems...,[This gives children the idea that it is not n...
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...","[For, For]",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s...",1,True,[decreasing the number of patients in the heal...,[society should be educated and became aware o...
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...","[For, For]",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ...",1,True,[the role models within a family play a signif...,[The first reason why the father's role should...


In [102]:
def build_pairs(row):
    """Turn one essay's relations into 1-based (source, target, relation) triples.

    ``row.source_acs``, ``row.target_acs`` and ``row.relation_types`` are read
    in parallel; each component text is replaced by its 1-based position in
    ``row.argument_components``.

    The lookup uses ``list.index``, so a text that occurs more than once
    resolves to its first position, and a text that is missing from
    ``argument_components`` raises ``ValueError``.
    """
    components = row.argument_components

    def position(component_text):
        # 1-based position of the first component with this exact text.
        return components.index(component_text) + 1

    relations = zip(row.source_acs, row.target_acs, row.relation_types)
    return [(position(src), position(tgt), rel) for src, tgt, rel in relations]

In [103]:
# Spot-check build_pairs on a single essay (the row at position 370).
build_pairs(pe_df.iloc[370])

[(5, 4, 'attacks'),
 (2, 6, 'supports'),
 (3, 6, 'supports'),
 (4, 6, 'attacks'),
 (8, 7, 'supports'),
 (7, 10, 'supports')]

In [104]:
# Store the (source, target, relation) index triples of every essay row.
pe_df['relation_pairs'] = pe_df.apply(build_pairs, axis=1)

In [105]:
# Total number of relation triples over all essays.
sum(len(pairs) for pairs in pe_df.relation_pairs)

3832

In [106]:
# Total number of relation labels over all essays.
sum(len(labels) for labels in pe_df.relation_types)

3832

In [107]:
# Total number of stance labels over all essays.
sum(len(stances) for stances in pe_df.stance_types)

1506

In [108]:
# Display the frame to inspect the relation_pairs column.
pe_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,stance_types,essay_title,essay_text,sanity_claims,sanity_check_1,source_acs,target_acs,relation_pairs
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...","[For, Against, For]",Should students be taught to compete or to coo...,It is always said that competition can effecti...,1,True,[What we acquired from team work is not only h...,"[through cooperation, children can learn about...","[(4, 3, supports), (5, 3, supports), (6, 3, su..."
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...","[For, For, Against]",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...,1,True,[they need a connection back to their country ...,"[it is crucial to keep one’s identity, keeping...","[(9, 8, supports), (11, 10, supports), (12, 10..."
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...","[For, For]",International tourism is now more common than ...,The last decade has seen an increasing number ...,1,True,"[Without this support and profit from tourism,...",[tourism has survived many non-tangible cultur...,"[(10, 8, supports), (9, 8, supports), (6, 3, s..."
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...","[Against, For, For]",International tourism is now more common than ...,The last 50 years have seen a significant incr...,1,True,[This was due to the lack of adequate controls...,[international tourism can create negative imp...,"[(8, 4, supports), (7, 4, supports), (6, 4, su..."
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...","[For, For, For, Against]",Living and studying overseas,It is every student's desire to study at a goo...,1,True,[Compared to the peers studying in the home co...,[studying at an overseas university gives indi...,"[(6, 3, supports), (7, 4, attacks), (8, 7, att..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...","[For, For, For, For]",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...,1,True,[Most of male students tend to use their left ...,[boys and girls have the diversity in psycholo...,"[(3, 8, supports), (4, 8, supports), (5, 8, su..."
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","[For, For, For]","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua...",1,True,[The first impression of the celebrities seems...,[This gives children the idea that it is not n...,"[(4, 3, supports), (5, 3, supports), (6, 7, su..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...","[For, For]",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s...",1,True,[decreasing the number of patients in the heal...,[society should be educated and became aware o...,"[(4, 3, supports), (5, 3, supports), (6, 3, su..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...","[For, For]",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ...",1,True,[the role models within a family play a signif...,[The first reason why the father's role should...,"[(3, 5, supports), (7, 6, supports), (10, 8, s..."


In [134]:
def c_mc_pairs(row):
    """Pair every Claim with every MajorClaim and attach the claim's stance.

    Positions are 1-based indices into ``row.argument_types``. The k-th
    component typed ``"Claim"`` (in list order) receives
    ``row.stance_types[k]``, so ``stance_types`` is expected to hold one entry
    per Claim in the same order; surplus stance entries are ignored.

    Returns:
        A list of ``(claim_pos, major_claim_pos, stance)`` tuples ordered by
        claim and then by major claim. The list is empty when the essay has no
        Claim or no MajorClaim.

    Raises:
        IndexError: if the essay has MajorClaims and fewer stances than Claims.
    """
    arg_types = row.argument_types
    stances = row.stance_types

    claim_positions = [pos for pos, kind in enumerate(arg_types, start=1) if kind == "Claim"]
    major_positions = [pos for pos, kind in enumerate(arg_types, start=1) if kind == "MajorClaim"]

    # stances[k] is evaluated only when a (claim, major claim) pair exists, so an
    # essay without such pairs never indexes into a missing stance entry.
    return [
        (claim_pos, major_pos, stances[k])
        for k, claim_pos in enumerate(claim_positions)
        for major_pos in major_positions
    ]

In [135]:
# Spot-check c_mc_pairs on the first essay row.
c_mc_pairs(pe_df.iloc[0])

[(3, 1, 'For'),
 (3, 2, 'For'),
 (7, 1, 'Against'),
 (7, 2, 'Against'),
 (11, 1, 'For'),
 (11, 2, 'For')]

In [118]:
# Inspect all fields of the first essay row.
pe_df.iloc[0]

ann_file_name                                               essay001.ann
argument_ids              [T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]
argument_types         [MajorClaim, MajorClaim, Claim, Premise, Premi...
arg_start_bounds       [503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...
arg_end_bounds         [575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...
argument_components    [we should attach more importance to cooperati...
source_args                                    [T4, T5, T6, T10, T9, T8]
target_args                                   [T3, T3, T3, T11, T11, T7]
relation_types         [supports, supports, supports, supports, suppo...
stance_types                                         [For, Against, For]
essay_title            Should students be taught to compete or to coo...
essay_text             It is always said that competition can effecti...
sanity_claims                                                          1
sanity_check_1                                     

In [136]:
# Store the (claim, major claim, stance) triples of every essay row.
pe_df['stance_pairs'] = pe_df.apply(c_mc_pairs, axis=1)

In [137]:
# Display the frame to inspect the stance_pairs column.
pe_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,stance_types,essay_title,essay_text,sanity_claims,sanity_check_1,source_acs,target_acs,relation_pairs,stance_pairs
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...","[For, Against, For]",Should students be taught to compete or to coo...,It is always said that competition can effecti...,1,True,[What we acquired from team work is not only h...,"[through cooperation, children can learn about...","[(4, 3, supports), (5, 3, supports), (6, 3, su...","[(3, 1, For), (3, 2, For), (7, 1, Against), (7..."
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...","[For, For, Against]",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...,1,True,[they need a connection back to their country ...,"[it is crucial to keep one’s identity, keeping...","[(9, 8, supports), (11, 10, supports), (12, 10...","[(4, 1, For), (4, 2, For), (10, 1, For), (10, ..."
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...","[For, For]",International tourism is now more common than ...,The last decade has seen an increasing number ...,1,True,"[Without this support and profit from tourism,...",[tourism has survived many non-tangible cultur...,"[(10, 8, supports), (9, 8, supports), (6, 3, s...","[(3, 1, For), (3, 2, For), (8, 1, For), (8, 2,..."
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...","[Against, For, For]",International tourism is now more common than ...,The last 50 years have seen a significant incr...,1,True,[This was due to the lack of adequate controls...,[international tourism can create negative imp...,"[(8, 4, supports), (7, 4, supports), (6, 4, su...","[(3, 1, Against), (3, 2, Against), (4, 1, For)..."
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...","[For, For, For, Against]",Living and studying overseas,It is every student's desire to study at a goo...,1,True,[Compared to the peers studying in the home co...,[studying at an overseas university gives indi...,"[(6, 3, supports), (7, 4, attacks), (8, 7, att...","[(3, 1, For), (3, 2, For), (4, 1, For), (4, 2,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...","[For, For, For, For]",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...,1,True,[Most of male students tend to use their left ...,[boys and girls have the diversity in psycholo...,"[(3, 8, supports), (4, 8, supports), (5, 8, su...","[(1, 2, For), (8, 2, For), (10, 2, For), (12, ..."
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","[For, For, For]","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua...",1,True,[The first impression of the celebrities seems...,[This gives children the idea that it is not n...,"[(4, 3, supports), (5, 3, supports), (6, 7, su...","[(3, 1, For), (3, 2, For), (7, 1, For), (7, 2,..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...","[For, For]",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s...",1,True,[decreasing the number of patients in the heal...,[society should be educated and became aware o...,"[(4, 3, supports), (5, 3, supports), (6, 3, su...","[(3, 1, For), (3, 2, For), (10, 1, For), (10, ..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...","[For, For]",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ...",1,True,[the role models within a family play a signif...,[The first reason why the father's role should...,"[(3, 5, supports), (7, 6, supports), (10, 8, s...","[(5, 1, For), (5, 2, For), (8, 1, For), (8, 2,..."


In [138]:
# List the columns currently present in pe_df.
pe_df.columns

Index(['ann_file_name', 'argument_ids', 'argument_types', 'arg_start_bounds',
       'arg_end_bounds', 'argument_components', 'source_args', 'target_args',
       'relation_types', 'stance_types', 'essay_title', 'essay_text',
       'sanity_claims', 'sanity_check_1', 'source_acs', 'target_acs',
       'relation_pairs', 'stance_pairs'],
      dtype='object')

In [31]:
# Replace each argument_ids list by its first element (non-list values are kept as they are).
# NOTE(review): this overwrites the column in place, so the full per-essay ID list
# is no longer available from pe_df afterwards — confirm this is intended.
pe_df['argument_ids'] = pe_df['argument_ids'].apply(lambda x: x[0] if isinstance(x, list) else x)

In [32]:
# Numeric sort key: the integer that follows the first character of the ID (e.g. "T1" -> 1).
# Expects argument_ids to hold a single ID string per row at this point.
pe_df['sort_key'] = pe_df['argument_ids'].apply(lambda x: int(x[1:])) 

In [33]:
# Row-sorted copy of pe_df; reset_index(drop=True) renumbers the rows from 0 and
# discards the original index labels.
sorted_pe_df = pe_df.sort_values(by='sort_key').reset_index(drop=True)

In [34]:
# List the columns currently present in pe_df (now including sort_key).
pe_df.columns

Index(['ann_file_name', 'argument_ids', 'argument_types', 'arg_start_bounds',
       'arg_end_bounds', 'argument_components', 'source_args', 'target_args',
       'relation_types', 'essay_title', 'essay_text', 'sanity_check_1',
       'source_acs', 'target_acs', 'sort_key'],
      dtype='object')

In [35]:
# Per-component list columns; these are the columns that are NOT copied back
# from pe_df into sorted_pe_df.
columns_to_sort = ['argument_types', 'arg_start_bounds',
       'arg_end_bounds', 'argument_components']

In [36]:
# Overwrite every column not listed in columns_to_sort with the column from pe_df.
# NOTE(review): the assignment aligns on index labels, and sorted_pe_df's index was
# reset after sorting. The copied columns therefore follow pe_df's row order while
# the columns_to_sort columns keep the sorted order, so a row can end up combining
# values from different essays. Verify before relying on sorted_pe_df.
for col in pe_df.columns:
    if col not in columns_to_sort:
        sorted_pe_df[col] = pe_df[col]

In [37]:
# Remove the helper sort_key column from the result.
sorted_pe_df = sorted_pe_df.drop(columns=['sort_key'])

In [38]:
# Display the result for inspection.
sorted_pe_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,essay_title,essay_text,sanity_check_1,source_acs,target_acs
0,essay001.ann,T1,"[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...",Should students be taught to compete or to coo...,It is always said that competition can effecti...,True,[What we acquired from team work is not only h...,"[through cooperation, children can learn about..."
1,essay002.ann,T1,"[Claim, Claim, Premise, MajorClaim, Claim, Pre...","[87, 163, 242, 24, 445, 324, 838, 926, 988, 10...","[161, 232, 322, 85, 698, 443, 924, 983, 1082, ...",[The big cities have lots of advantages and fa...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...,True,[they need a connection back to their country ...,"[it is crucial to keep one’s identity, keeping..."
2,essay003.ann,T1,"[MajorClaim, Claim, Premise, Premise, Premise,...","[217, 137, 500, 599, 838, 733, 908, 406, 971, ...","[338, 215, 597, 690, 906, 822, 961, 498, 1071,...",[I personally disagree with this notion that t...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last decade has seen an increasing number ...,True,"[Without this support and profit from tourism,...",[tourism has survived many non-tangible cultur...
3,essay004.ann,T1,"[MajorClaim, Claim, Premise, Premise, Claim, P...","[530, 618, 826, 891, 1084, 1163, 1275, 1551, 1...","[575, 818, 880, 1007, 1156, 1273, 1464, 1672, ...",[being present at classes seems a wiser option...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last 50 years have seen a significant incr...,True,[This was due to the lack of adequate controls...,[international tourism can create negative imp...
4,essay005.ann,T1,"[Claim, MajorClaim, Claim, Premise, Premise, P...","[284, 183, 410, 508, 686, 899, 1030, 1202, 133...","[350, 277, 506, 684, 897, 1009, 1184, 1337, 14...",[it can improve the efficiency and reduce the ...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...",Living and studying overseas,It is every student's desire to study at a goo...,True,[Compared to the peers studying in the home co...,[studying at an overseas university gives indi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,T1,"[Claim, Premise, Premise, Premise, MajorClaim,...","[706, 827, 955, 909, 1683, 1542, 1228, 1315]","[795, 904, 1207, 953, 1778, 1666, 1313, 1488]","[A university, in common sense, is a medium fo...","[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...,True,[Most of male students tend to use their left ...,[boys and girls have the diversity in psycholo...
398,essay399.ann,T1,"[MajorClaim, MajorClaim, Claim, Claim, Premise...","[216, 1831, 1959, 410, 541, 686, 853, 1002, 10...","[302, 1948, 2014, 497, 669, 837, 967, 1047, 12...",[I do not think these disadvantages will outwe...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua...",True,[The first impression of the celebrities seems...,[This gives children the idea that it is not n...
399,essay400.ann,T1,"[MajorClaim, Claim, Claim, Premise, Premise, P...","[193, 104, 911, 501, 683, 752, 979, 1137, 1745...","[383, 183, 977, 676, 736, 903, 1041, 1295, 185...",[it is obvious that prepared food can bring ab...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s...",True,[decreasing the number of patients in the heal...,[society should be educated and became aware o...
400,essay401.ann,T1,"[MajorClaim, Claim, Premise, Claim, Premise, P...","[328, 484, 399, 1244, 1086, 637, 732, 822, 947...","[397, 578, 473, 1353, 1231, 713, 820, 945, 107...","[governments spend more money on buses, trains...","[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ...",True,[the role models within a family play a signif...,[The first reason why the father's role should...


In [None]:
def sort_arg_ids(row):
    """Return the positions of ``row.argument_ids`` ordered by numeric ID.

    Each ID is expected to be one prefix character followed by digits
    (e.g. ``"T12"``); ordering uses the integer after that first character.

    Returns:
        A list of 0-based indices into ``row.argument_ids``; applying it to the
        row's per-component lists reorders them consistently.
    """
    arg_ids = row.argument_ids
    sorted_indices = sorted(range(len(arg_ids)), key=lambda i: int(arg_ids[i][1:]))
    return sorted_indices

In [None]:
### Sanity Check 2

In [62]:
def sanity_check(row):
    """Check that the number in the last argument ID equals the component count.

    The last entry of ``row.argument_ids`` is parsed as letters followed by
    digits (e.g. ``"T11"``) and the digits are compared with
    ``len(row.argument_components)``.

    Returns:
        ``True`` when the two numbers agree. ``False`` when they differ, when
        ``argument_ids`` is empty, or when the last ID does not have the
        letters-then-digits form.
    """
    arg_ids = row.argument_ids
    if len(arg_ids) == 0:
        return False

    id_match = re.match(r"([A-Za-z]+)(\d+)", arg_ids[-1])
    if id_match is None:
        # Malformed ID: report a failed check instead of raising on None.
        return False

    return int(id_match.group(2)) == len(row.argument_components)

In [63]:
# Record the outcome of sanity_check for every essay row.
pe_df['sanity_check'] = pe_df.apply(sanity_check, axis=1)

In [66]:
# Show the essays whose sanity_check value is False.
pe_df.loc[pe_df['sanity_check'].eq(False)]

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,essay_title,essay_text,sanity_check
22,essay023.ann,"[T3, T1, T2, T4, T5, T6, T7, T9, T10, T11, T12...","[MajorClaim, MajorClaim, Claim, Claim, Premise...","[216, 1831, 1959, 410, 541, 686, 853, 1002, 10...","[302, 1948, 2014, 497, 669, 837, 967, 1047, 12...",[I do not think these disadvantages will outwe...,"[T5, T6, T7, T10, T12, T13, T15]","[T4, T4, T4, T9, T11, T11, T14]","[supports, supports, attacks, supports, suppor...",Effects of mobile phones,"Nowadays, the popularity of mobile phones has ...",False
29,essay030.ann,"[T12, T13, T14, T15, T1, T2, T3, T4, T5, T6, T7]","[Claim, Premise, MajorClaim, Claim, MajorClaim...","[1123, 1236, 1342, 1400, 148, 318, 421, 572, 7...","[1232, 1301, 1391, 1558, 316, 400, 570, 720, 8...",[this will encourage all the parents to think ...,"[T13, T4, T5, T7]","[T12, T3, T6, T6]","[supports, supports, supports, supports]",Government and education,Primary and secondary education provide a basi...,False
45,essay046.ann,"[T3, T4, T5, T6, T2, T7, T8, T9]","[Claim, Premise, Premise, Premise, MajorClaim,...","[706, 827, 955, 909, 1683, 1542, 1228, 1315]","[795, 904, 1207, 953, 1778, 1666, 1313, 1488]","[A university, in common sense, is a medium fo...","[T6, T9, T4, T5, T8]","[T4, T7, T3, T3, T7]","[supports, supports, supports, supports, suppo...",True function of a university,Universities have quickly become the places to...,False
183,essay184.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[301, 1668, 1595, 1726, 1101, 453, 505, 661, 7...","[444, 1724, 1662, 1770, 1159, 503, 645, 739, 9...",[it is much more gratifying to stay side by si...,"[T7, T8, T9, T10, T11, T14, T13, T15, T12, T16...","[T6, T6, T6, T6, T6, T13, T5, T5, T5, T5, T5]","[supports, supports, supports, supports, suppo...",Live football game at a stadium,Have you ever traveled along to support your h...,False
192,essay193.ann,"[T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12...","[Claim, Premise, Premise, Premise, Claim, Prem...","[370, 468, 509, 602, 787, 857, 940, 988, 1055,...","[466, 507, 600, 785, 855, 938, 986, 1053, 1149...",[The use of public transportation is the best ...,"[T3, T4, T5, T7, T8, T9, T10, T11, T13, T14]","[T2, T2, T2, T6, T6, T6, T6, T6, T12, T12]","[supports, supports, supports, supports, suppo...",Public transportation keeps society from the d...,The development of public transportation is th...,False
258,essay259.ann,"[T1, T2, T4, T5, T6, T7, T8, T9, T10, T11, T12...","[MajorClaim, MajorClaim, Premise, Premise, Pre...","[407, 1531, 648, 573, 817, 906, 493, 1001, 110...","[477, 1616, 815, 646, 883, 991, 571, 1088, 115...",[working at home is worse and less efficient t...,"[T5, T4, T6, T7, T11, T10, T12, T13]","[T8, T8, T8, T8, T10, T9, T9, T9]","[supports, supports, supports, supports, suppo...",Working at home vs Working in a company,"Due to the convenience of current technology, ...",False
265,essay266.ann,"[T1, T2, T3, T5, T6, T7, T8, T9, T10, T11, T12...","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[480, 2018, 2196, 623, 745, 880, 939, 1041, 12...","[544, 2082, 2310, 727, 869, 937, 1039, 1226, 1...",[the advertising is not the main cause of unhe...,"[T5, T6, T7, T8, T9, T10, T13, T14, T15, T16]","[T11, T11, T6, T7, T11, T11, T12, T12, T12, T12]","[supports, attacks, supports, supports, suppor...",The advertising is the main cause of unhealthy...,Advertising is becoming part of our modern soc...,False
296,essay297.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Premise, Pre...","[277, 2063, 462, 535, 667, 868, 377, 969, 906,...","[331, 2140, 525, 652, 858, 896, 460, 1015, 967...",[online classes should never replace classroom...,"[T4, T3, T5, T6, T11, T10, T9, T14, T13, T15, ...","[T3, T7, T7, T7, T8, T8, T8, T12, T12, T12, T16]","[supports, supports, supports, supports, suppo...",Classroom learnng vs online education,There are some people who believe that distanc...,False
321,essay322.ann,"[T1, T2, T3, T4, T5, T7, T8, T9, T10, T11, T12...","[Claim, Claim, MajorClaim, Claim, Claim, Claim...","[222, 293, 1414, 1465, 1539, 354, 513, 594, 66...","[278, 351, 1463, 1527, 1639, 511, 592, 663, 72...",[The historic buildings show the history of ou...,"[T8, T9, T10, T11, T12, T14, T16, T13]","[T7, T7, T7, T7, T7, T13, T15, T15]","[supports, supports, supports, supports, suppo...",Replacing old buildings is important for any c...,It is important for any city to replace old bu...,False
357,essay358.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, Premise, Premise, Premise, Premis...","[1630, 560, 635, 706, 773, 835, 998, 921, 467,...","[1724, 613, 704, 771, 833, 919, 1076, 990, 549...",[the option of working or studying from home i...,"[T6, T8, T3, T2, T4, T5, T12, T11, T10, T9]","[T5, T5, T2, T7, T7, T7, T14, T14, T14, T7]","[supports, supports, supports, supports, suppo...",The option to work or study from home is advan...,Undoubtedly this is an era of internet which i...,False


In [58]:
# Inspect all fields of the essay row at position 29.
pe_df.iloc[29]

ann_file_name                                               essay030.ann
argument_ids            [T12, T13, T14, T15, T1, T2, T3, T4, T5, T6, T7]
argument_types         [Claim, Premise, MajorClaim, Claim, MajorClaim...
arg_start_bounds       [1123, 1236, 1342, 1400, 148, 318, 421, 572, 7...
arg_end_bounds         [1232, 1301, 1391, 1558, 316, 400, 570, 720, 8...
argument_components    [this will encourage all the parents to think ...
source_args                                            [T13, T4, T5, T7]
target_args                                            [T12, T3, T6, T6]
relation_types                  [supports, supports, supports, supports]
essay_title                                     Government and education
essay_text             Primary and secondary education provide a basi...
sanity_check                                                       False
Name: 29, dtype: object

In [19]:
# Load the train/test assignment file (semicolon-separated).
split_df = pd.read_csv(
    "/Utilisateurs/umushtaq/am_reasoning/data_files/train-test-split.csv",
    sep=";",
)

In [20]:
# Display the loaded split table for inspection.
split_df

Unnamed: 0,ID,SET
0,essay001,TRAIN
1,essay002,TRAIN
2,essay003,TRAIN
3,essay004,TEST
4,essay005,TEST
...,...,...
397,essay398,TEST
398,essay399,TRAIN
399,essay400,TRAIN
400,essay401,TRAIN


In [21]:
# Attach the train/test label by essay ID instead of by row position, so the result
# does not depend on pe_df and split_df being in the same order.
# ann_file_name is "<essay id>.ann"; split_df.ID holds the bare essay id.
pe_df['split'] = (
    pe_df['ann_file_name']
    .str.replace(".ann", "", regex=False)
    .map(dict(zip(split_df["ID"], split_df["SET"])))
)
assert pe_df['split'].notna().all(), "some essays have no train/test label"

In [23]:
# Persist the assembled dataset as CSV.
# NOTE(review): to_csv writes the row index as an extra leading column by default, and
# list-valued cells are written in their string form, so they will not come back as
# lists from read_csv without extra parsing — confirm downstream readers handle this.
pe_df.to_csv("/Utilisateurs/umushtaq/am_reasoning/data_files/pe_dataset_new.csv")

In [24]:
# Display the per-component table for inspection.
components_df

Unnamed: 0,file_name,argument_id,argument_type,arg_bound_1,arg_bound2,argument_component
0,essay001.ann,T1,MajorClaim,503,575,we should attach more importance to cooperatio...
1,essay001.ann,T2,MajorClaim,2154,2231,a more cooperative attitudes towards life is m...
2,essay001.ann,T3,Claim,591,714,"through cooperation, children can learn about ..."
3,essay001.ann,T4,Premise,716,851,What we acquired from team work is not only ho...
4,essay001.ann,T5,Premise,853,1086,"During the process of cooperation, children ca..."
...,...,...,...,...,...,...
6084,essay402.ann,T11,Premise,1275,1339,indirectly they will learn how to socialize ea...
6085,essay402.ann,T12,Premise,1341,1388,That will make children getting lots of friends
6086,essay402.ann,T13,Premise,1393,1436,they can contribute positively to community
6087,essay402.ann,T14,Premise,1448,1525,playing sport makes children getting healthy a...


In [20]:
# Drop the first column by position.
# NOTE(review): presumably this removes a leftover index column from a CSV round-trip — confirm.
# The drop is positional, so re-running this cell removes the next column as well.
components_df = components_df.drop(columns=components_df.columns[0])

Parsing annotation files: 100%|██████████| 402/402 [00:00<00:00, 2360.83it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [28]:
# Display the per-relation table for inspection.
relations_df

Unnamed: 0,file_name,source_arg,target_arg,relation_type
0,essay001.ann,T4,T3,supports
1,essay001.ann,T5,T3,supports
2,essay001.ann,T6,T3,supports
3,essay001.ann,T10,T11,supports
4,essay001.ann,T9,T11,supports
...,...,...,...,...
3827,essay402.ann,T13,T4,supports
3828,essay402.ann,T9,T4,supports
3829,essay402.ann,T10,T4,supports
3830,essay402.ann,T14,T4,supports


In [25]:
# Drop the first column by position.
# NOTE(review): presumably this removes a leftover index column from a CSV round-trip — confirm.
# The drop is positional, so re-running this cell removes the next column as well.
relations_df = relations_df.drop(columns=relations_df.columns[0])

In [27]:
# Assign the four column labels positionally; the frame must have exactly four
# columns at this point or pandas raises a length-mismatch error.
relation_column_names = ["file_name", "source_arg", "target_arg", "relation_type"]
relations_df.columns = relation_column_names

In [104]:
# Rebuild the essay records from scratch so that re-running this cell does not
# append a second copy of every essay to `essays_data`.
essays_data = []

# Only .txt files are essays; sorting makes the row order independent of the
# order in which the operating system happens to list the directory.
text_filenames = sorted(
    name for name in os.listdir(text_files_directory) if name.endswith(".txt")
)

for filename in tqdm(text_filenames, desc="Parsing text files"):

    print("processing: " + filename)
    file_path = os.path.join(text_files_directory, filename)
    # Key each essay by the name of its annotation file ("<essay>.ann") so it can
    # be joined with the component / relation tables on file_name. Only the
    # extension is swapped; a ".txt" elsewhere in the name is left untouched.
    filename_ann = os.path.splitext(filename)[0] + ".ann"

    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # The first line is the essay title, everything after it is the body.
    # An empty file yields an empty title/body instead of an IndexError.
    essay_title = lines[0].strip() if lines else ""
    essay_text = "".join(lines[1:]).strip()

    essays_data.append([filename_ann, essay_title, essay_text])

Parsing text files: 100%|██████████| 402/402 [00:00<00:00, 6202.45it/s]

processing: essay001.txt
processing: essay002.txt
processing: essay003.txt
processing: essay004.txt
processing: essay005.txt
processing: essay006.txt
processing: essay007.txt
processing: essay008.txt
processing: essay009.txt
processing: essay010.txt
processing: essay011.txt
processing: essay012.txt
processing: essay013.txt
processing: essay014.txt
processing: essay015.txt
processing: essay016.txt
processing: essay017.txt
processing: essay018.txt
processing: essay019.txt
processing: essay020.txt
processing: essay021.txt
processing: essay022.txt
processing: essay023.txt
processing: essay024.txt
processing: essay025.txt
processing: essay026.txt
processing: essay027.txt
processing: essay028.txt
processing: essay029.txt
processing: essay030.txt
processing: essay031.txt
processing: essay032.txt
processing: essay033.txt
processing: essay034.txt
processing: essay035.txt
processing: essay036.txt
processing: essay037.txt
processing: essay038.txt
processing: essay039.txt
processing: essay040.txt





In [105]:
# Turn the accumulated [file name, title, body] records into a DataFrame.
# No column names are given here, so pandas assigns default integer labels.
essays_df = pd.DataFrame(data=essays_data)

In [106]:
essays_df

Unnamed: 0,0,1,2
0,essay001.ann,Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...
397,essay398.ann,We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


In [107]:
# Assign descriptive column labels positionally; the frame must have exactly
# three columns or pandas raises a length-mismatch error.
essay_column_names = ["file_name", "essay_title", "essay_text"]
essays_df.columns = essay_column_names

In [108]:
essays_df

Unnamed: 0,file_name,essay_title,essay_text
0,essay001.ann,Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...
397,essay398.ann,We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


### Group the DataFrames by file, then merge

In [109]:
# "file_name", "argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"

In [110]:
# Collapse the one-row-per-component table into one row per annotation file:
# each listed column becomes a Python list holding that file's values.
# sort=False keeps the files in order of first appearance instead of sorting keys.
component_list_columns = [
    'argument_id',
    'argument_type',
    'arg_bound_1',
    'arg_bound2',
    'argument_component',
]
comp_grouped_df = (
    components_df
    .groupby(['file_name'], sort=False)
    .agg({column: list for column in component_list_columns})
    .reset_index()
)

In [111]:
comp_grouped_df

Unnamed: 0,file_name,argument_id,argument_type,arg_bound_1,arg_bound2,argument_component
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...
...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...


In [112]:
# Collapse the one-row-per-relation table into one row per annotation file:
# source_arg, target_arg and relation_type each become a list of that file's values.
# sort=False keeps the files in order of first appearance instead of sorting keys.
relation_list_columns = [
    'source_arg',
    'target_arg',
    'relation_type',
]
rels_grouped_df = (
    relations_df
    .groupby(['file_name'], sort=False)
    .agg({column: list for column in relation_list_columns})
    .reset_index()
)

In [113]:
rels_grouped_df

Unnamed: 0,file_name,source_arg,target_arg,relation_type
0,essay001.ann,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo..."
1,essay002.ann,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo..."
2,essay003.ann,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo..."
3,essay004.ann,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo..."
4,essay005.ann,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support..."
...,...,...,...,...
397,essay398.ann,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo..."
398,essay399.ann,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo..."
399,essay400.ann,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo..."
400,essay401.ann,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo..."


### MERGE

In [114]:
# Join the per-file component lists, relation lists and essay texts on the
# annotation file name. These are inner joins, so a file missing from any of the
# three tables is dropped from the result.
# Each table is expected to hold exactly one row per file; validate="one_to_one"
# makes pandas raise a MergeError instead of silently multiplying rows when a
# file_name is duplicated (e.g. because a parsing cell was executed twice).
merged_df = (
    comp_grouped_df
    .merge(rels_grouped_df, on="file_name", how="inner", validate="one_to_one")
    .merge(essays_df, on="file_name", how="inner", validate="one_to_one")
)

In [115]:
merged_df

Unnamed: 0,file_name,argument_id,argument_type,arg_bound_1,arg_bound2,argument_component,source_arg,target_arg,relation_type,essay_title,essay_text
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...",Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...",Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


In [122]:
merged_df.columns

Index(['file_name', 'argument_id', 'argument_type', 'arg_bound_1',
       'arg_bound2', 'argument_component', 'source_arg', 'target_arg',
       'relation_type', 'essay_title', 'essay_text'],
      dtype='object')

In [123]:
# Pluralise the list-valued columns and mark the key as the annotation file name.
# Renaming by label (instead of assigning a positional list) does not depend on
# column order and stays re-runnable: labels that are already renamed, and any
# columns added later, are simply left untouched rather than triggering a
# length-mismatch error. essay_title and essay_text keep their names.
merged_df = merged_df.rename(columns={
    'file_name': 'ann_file_name',
    'argument_id': 'argument_ids',
    'argument_type': 'argument_types',
    'arg_bound_1': 'arg_start_bounds',
    'arg_bound2': 'arg_end_bounds',
    'argument_component': 'argument_components',
    'source_arg': 'source_args',
    'target_arg': 'target_args',
    'relation_type': 'relation_types',
})

In [124]:
merged_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,essay_title,essay_text
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...",Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...",Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


#### Sanity checks: list columns in each row must have matching lengths

In [126]:
def egal_1(row):
    """Return 1 if the five per-component list fields of `row` all have equal length, else 0.

    Compares argument_ids, argument_types, arg_start_bounds, arg_end_bounds and
    argument_components, in that order, stopping at the first mismatch.
    """
    expected_length = len(row.argument_ids)
    for field in (
        "argument_types",
        "arg_start_bounds",
        "arg_end_bounds",
        "argument_components",
    ):
        if len(getattr(row, field)) != expected_length:
            return 0
    return 1

In [127]:
# Flag each row with 1/0 depending on whether its component list columns line up.
merged_df['egal_1'] = merged_df.apply(egal_1, axis=1)

In [128]:
merged_df['egal_1'].value_counts()

egal_1
1    402
Name: count, dtype: int64

In [129]:
def egal_2(row):
    """Return 1 if the three per-relation list fields of `row` all have equal length, else 0.

    Compares source_args, target_args and relation_types, in that order,
    stopping at the first mismatch.
    """
    expected_length = len(row.source_args)
    for field in ("target_args", "relation_types"):
        if len(getattr(row, field)) != expected_length:
            return 0
    return 1

In [130]:
# Flag each row with 1/0 depending on whether its relation list columns line up.
merged_df['egal_2'] = merged_df.apply(egal_2, axis=1)

In [131]:
merged_df['egal_2'].value_counts()

egal_2
1    402
Name: count, dtype: int64