In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Input locations for the raw corpus: annotation files and the essay text files.
# NOTE(review): these are absolute, machine-specific paths; adjust them before running elsewhere.
ann_files_directory = "/Utilisateurs/umushtaq/am_reasoning/raw_files/ann_files"
text_files_directory = "/Utilisateurs/umushtaq/am_reasoning/raw_files/txt_files"

In [3]:
# Row accumulators filled by the parsing cells; each one later becomes a DataFrame.
components_data, relations_data, essays_data = [], [], []

In [4]:
# Parse every annotation file and collect one row per text-bound ("T") annotation.
# Rebuilding the list here keeps the cell idempotent: re-running it does not
# append a second copy of every component.
components_data = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible row order.
# Only *.ann entries are kept so stray items (e.g. .ipynb_checkpoints) are never opened.
ann_filenames = sorted(name for name in os.listdir(ann_files_directory) if name.endswith(".ann"))

for filename in tqdm(ann_filenames, desc="Parsing annotation files for components ..."):

    print("processing: " + filename)
    file_path = os.path.join(ann_files_directory, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Only "T" lines carry components; fields are tab-separated:
            # ID <TAB> "Type start end" <TAB> covered text
            if not line.startswith("T"):
                continue
            parts = line.strip().split("\t")
            t_id = parts[0]  # annotation ID, e.g. T1
            span_fields = parts[1].split(" ")
            t_type = span_fields[0]  # e.g. MajorClaim, Claim, Premise
            t_s_bound = span_fields[1]  # start offset, kept as a string
            t_e_bound = span_fields[2]  # end offset, kept as a string
            text = parts[2]
            # The leading "T" marker column is dropped again when the DataFrame is cleaned.
            components_data.append(["T", filename, t_id, t_type, t_s_bound, t_e_bound, text])

Parsing annotation files for components ...: 100%|██████████| 402/402 [00:00<00:00, 5050.85it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [5]:
# Parse every annotation file and collect one row per relation ("R") annotation.
# Rebuilding the list here keeps the cell idempotent: re-running it does not
# append a second copy of every relation.
relations_data = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible row order.
# Only *.ann entries are kept so stray items (e.g. .ipynb_checkpoints) are never opened.
ann_filenames = sorted(name for name in os.listdir(ann_files_directory) if name.endswith(".ann"))

for filename in tqdm(ann_filenames, desc="Parsing annotation files for relations ... "):

    print("processing: " + filename)
    file_path = os.path.join(ann_files_directory, filename)

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Only "R" lines describe relations; whitespace-separated fields:
            # ID, relation type, Arg1:<source T id>, Arg2:<target T id>
            if not line.startswith("R"):
                continue
            parts = line.strip().split()
            relation_type = parts[1]  # "supports" or "attacks"
            arg1 = parts[2].split(":")[1]  # T id taken from the Arg1 field
            arg2 = parts[3].split(":")[1]  # T id taken from the Arg2 field
            # The leading "R" marker column is dropped again when the DataFrame is cleaned.
            relations_data.append(["R", filename, arg1, arg2, relation_type])

Parsing annotation files for relations ... : 100%|██████████| 402/402 [00:00<00:00, 5478.64it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [6]:
# Read every essay text file: the first line is the title, the remainder is the body.
# Rebuilding the list here keeps the cell idempotent: re-running it does not
# append a second copy of every essay (which would duplicate rows in the merge).
essays_data = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible row order.
# Only *.txt entries are kept so stray items (e.g. .ipynb_checkpoints) are never opened.
text_filenames = sorted(name for name in os.listdir(text_files_directory) if name.endswith(".txt"))

for filename in tqdm(text_filenames, desc="Parsing text files for esssays ... "):

    print("processing: " + filename)
    file_path = os.path.join(text_files_directory, filename)
    # Key each essay by its annotation file name so it can be joined with the parsed annotations.
    filename_ann = filename.replace(".txt", ".ann")

    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    essay_title = lines[0].strip()
    essay_text = "".join(lines[1:]).strip()

    essays_data.append([filename_ann, essay_title, essay_text])

Parsing text files for esssays ... : 100%|██████████| 402/402 [00:00<00:00, 5562.52it/s]

processing: essay001.txt
processing: essay002.txt
processing: essay003.txt
processing: essay004.txt
processing: essay005.txt
processing: essay006.txt
processing: essay007.txt
processing: essay008.txt
processing: essay009.txt
processing: essay010.txt
processing: essay011.txt
processing: essay012.txt
processing: essay013.txt
processing: essay014.txt
processing: essay015.txt
processing: essay016.txt
processing: essay017.txt
processing: essay018.txt
processing: essay019.txt
processing: essay020.txt
processing: essay021.txt
processing: essay022.txt
processing: essay023.txt
processing: essay024.txt
processing: essay025.txt
processing: essay026.txt
processing: essay027.txt
processing: essay028.txt
processing: essay029.txt
processing: essay030.txt
processing: essay031.txt
processing: essay032.txt
processing: essay033.txt
processing: essay034.txt
processing: essay035.txt
processing: essay036.txt
processing: essay037.txt
processing: essay038.txt
processing: essay039.txt
processing: essay040.txt





In [7]:
# Turn each list of parsed rows into a DataFrame; columns are positional at this point.
components_df, relations_df, essays_df = (
    pd.DataFrame(rows) for rows in (components_data, relations_data, essays_data)
)

In [8]:
# Name the component columns. The leading column only holds the constant "T" marker and is dropped.
component_columns = ["file_name", "argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"]

# Drop the marker column only while it is still present, so re-running this cell
# does not discard "file_name" and then fail on a column-count mismatch.
if len(components_df.columns) == len(component_columns) + 1:
    components_df = components_df.drop(columns=components_df.columns[0])
components_df.columns = component_columns

In [9]:
# Name the relation columns. The leading column only holds the constant "R" marker and is dropped.
relation_columns = ["file_name", "source_arg", "target_arg", "relation_type"]

# Drop the marker column only while it is still present, so re-running this cell
# does not discard "file_name" and then fail on a column-count mismatch.
if len(relations_df.columns) == len(relation_columns) + 1:
    relations_df = relations_df.drop(columns=relations_df.columns[0])
relations_df.columns = relation_columns

In [10]:
# Name the positional columns of essays_df; each row is [annotation file name, title, body text].
essays_df.columns = ["file_name", "essay_title", "essay_text"]

### Group and Merge

In [11]:
# Collapse components to one row per file: every component field becomes a list.
# sort=False keeps the files in the order they first appear in components_df.
component_fields = ["argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"]
comp_grouped_df = (
    components_df
    .groupby(["file_name"], sort=False)
    .agg({field: list for field in component_fields})
    .reset_index()
)

In [12]:
# Collapse relations to one row per file: every relation field becomes a list.
# sort=False keeps the files in the order they first appear in relations_df.
relation_fields = ["source_arg", "target_arg", "relation_type"]
rels_grouped_df = (
    relations_df
    .groupby(["file_name"], sort=False)
    .agg({field: list for field in relation_fields})
    .reset_index()
)

In [13]:
# Join the three per-file tables on file_name. These are inner joins, so a file must
# appear in all three tables to be kept.
# validate="one_to_one" makes pandas raise MergeError when a table has repeated
# file_name keys, instead of silently multiplying rows.
pe_df = (
    comp_grouped_df
    .merge(rels_grouped_df, on="file_name", validate="one_to_one")
    .merge(essays_df, on="file_name", validate="one_to_one")
)

In [14]:
# Rename by column name rather than by position, so a change in column order cannot
# mislabel the data. essay_title and essay_text keep their names.
# Re-running this cell is a no-op because rename ignores labels that are absent.
pe_df = pe_df.rename(columns={
    "file_name": "ann_file_name",
    "argument_id": "argument_ids",
    "argument_type": "argument_types",
    "arg_bound_1": "arg_start_bounds",
    "arg_bound2": "arg_end_bounds",
    "argument_component": "argument_components",
    "source_arg": "source_args",
    "target_arg": "target_args",
    "relation_type": "relation_types",
})

In [15]:
# Persist the merged per-file table as CSV.
# NOTE(review): with the default to_csv settings the index is written as an extra unnamed
# column and list-valued cells are stored as text — confirm downstream readers parse them back.
pe_df.to_csv("/Utilisateurs/umushtaq/am_reasoning/data_files/pe_dataset_new.csv")

In [24]:
# Inspect the flat components table: one row per parsed "T" annotation.
components_df

Unnamed: 0,file_name,argument_id,argument_type,arg_bound_1,arg_bound2,argument_component
0,essay001.ann,T1,MajorClaim,503,575,we should attach more importance to cooperatio...
1,essay001.ann,T2,MajorClaim,2154,2231,a more cooperative attitudes towards life is m...
2,essay001.ann,T3,Claim,591,714,"through cooperation, children can learn about ..."
3,essay001.ann,T4,Premise,716,851,What we acquired from team work is not only ho...
4,essay001.ann,T5,Premise,853,1086,"During the process of cooperation, children ca..."
...,...,...,...,...,...,...
6084,essay402.ann,T11,Premise,1275,1339,indirectly they will learn how to socialize ea...
6085,essay402.ann,T12,Premise,1341,1388,That will make children getting lots of friends
6086,essay402.ann,T13,Premise,1393,1436,they can contribute positively to community
6087,essay402.ann,T14,Premise,1448,1525,playing sport makes children getting healthy a...


In [20]:
# Drop the leading "T" marker column, but only while the frame still has its raw
# positional columns. Earlier in the notebook components_df has already been cleaned
# and renamed; without this guard a top-to-bottom run would remove "file_name" here
# and break the later groupby on "file_name".
if components_df.columns[0] != "file_name":
    components_df = components_df.drop(columns=components_df.columns[0])

Parsing annotation files: 100%|██████████| 402/402 [00:00<00:00, 2360.83it/s]

processing: essay001.ann
processing: essay002.ann
processing: essay003.ann
processing: essay004.ann
processing: essay005.ann
processing: essay006.ann
processing: essay007.ann
processing: essay008.ann
processing: essay009.ann
processing: essay010.ann
processing: essay011.ann
processing: essay012.ann
processing: essay013.ann
processing: essay014.ann
processing: essay015.ann
processing: essay016.ann
processing: essay017.ann
processing: essay018.ann
processing: essay019.ann
processing: essay020.ann
processing: essay021.ann
processing: essay022.ann
processing: essay023.ann
processing: essay024.ann
processing: essay025.ann
processing: essay026.ann
processing: essay027.ann
processing: essay028.ann
processing: essay029.ann
processing: essay030.ann
processing: essay031.ann
processing: essay032.ann
processing: essay033.ann
processing: essay034.ann
processing: essay035.ann
processing: essay036.ann
processing: essay037.ann
processing: essay038.ann
processing: essay039.ann
processing: essay040.ann





In [28]:
# Inspect the flat relations table: one row per parsed "R" annotation.
relations_df

Unnamed: 0,file_name,source_arg,target_arg,relation_type
0,essay001.ann,T4,T3,supports
1,essay001.ann,T5,T3,supports
2,essay001.ann,T6,T3,supports
3,essay001.ann,T10,T11,supports
4,essay001.ann,T9,T11,supports
...,...,...,...,...
3827,essay402.ann,T13,T4,supports
3828,essay402.ann,T9,T4,supports
3829,essay402.ann,T10,T4,supports
3830,essay402.ann,T14,T4,supports


In [25]:
# Drop the leading "R" marker column, but only while the frame still has its raw
# positional columns. Earlier in the notebook relations_df has already been cleaned
# and renamed; without this guard a top-to-bottom run would remove "file_name" here
# and the four-name column assignment that follows would fail on a length mismatch.
if relations_df.columns[0] != "file_name":
    relations_df = relations_df.drop(columns=relations_df.columns[0])

In [27]:
# Name the relation columns; the order matches the parsed rows: file, source T id, target T id, relation type.
relations_df.columns = ["file_name", "source_arg", "target_arg", "relation_type"]

In [104]:
# Read every essay text file: the first line is the title, the remainder is the body.
# essays_data is rebuilt from scratch because an earlier cell already fills it; appending
# again would store every essay twice and duplicate the file_name keys used in the merge.
essays_data = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible row order.
# Only *.txt entries are kept so stray items (e.g. .ipynb_checkpoints) are never opened.
text_filenames = sorted(name for name in os.listdir(text_files_directory) if name.endswith(".txt"))

for filename in tqdm(text_filenames, desc="Parsing text files"):

    print("processing: " + filename)
    file_path = os.path.join(text_files_directory, filename)
    # Key each essay by its annotation file name so it can be joined with the parsed annotations.
    filename_ann = filename.replace(".txt", ".ann")

    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    essay_title = lines[0].strip()
    essay_text = "".join(lines[1:]).strip()

    essays_data.append([filename_ann, essay_title, essay_text])

Parsing text files: 100%|██████████| 402/402 [00:00<00:00, 6202.45it/s]

processing: essay001.txt
processing: essay002.txt
processing: essay003.txt
processing: essay004.txt
processing: essay005.txt
processing: essay006.txt
processing: essay007.txt
processing: essay008.txt
processing: essay009.txt
processing: essay010.txt
processing: essay011.txt
processing: essay012.txt
processing: essay013.txt
processing: essay014.txt
processing: essay015.txt
processing: essay016.txt
processing: essay017.txt
processing: essay018.txt
processing: essay019.txt
processing: essay020.txt
processing: essay021.txt
processing: essay022.txt
processing: essay023.txt
processing: essay024.txt
processing: essay025.txt
processing: essay026.txt
processing: essay027.txt
processing: essay028.txt
processing: essay029.txt
processing: essay030.txt
processing: essay031.txt
processing: essay032.txt
processing: essay033.txt
processing: essay034.txt
processing: essay035.txt
processing: essay036.txt
processing: essay037.txt
processing: essay038.txt
processing: essay039.txt
processing: essay040.txt





In [105]:
# Build essays_df from the parsed essay rows; its columns are positional (0, 1, 2) until they are named.
essays_df = pd.DataFrame(essays_data)

In [106]:
# Inspect essays_df before its columns are named (they still show as 0, 1, 2).
essays_df

Unnamed: 0,0,1,2
0,essay001.ann,Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...
397,essay398.ann,We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


In [107]:
# Name the positional columns of essays_df; each row is [annotation file name, title, body text].
essays_df.columns = ["file_name", "essay_title", "essay_text"]

In [108]:
# Inspect essays_df again to confirm the column names were applied.
essays_df

Unnamed: 0,file_name,essay_title,essay_text
0,essay001.ann,Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...
397,essay398.ann,We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


### Group dfs and then merge

In [109]:
# Columns of components_df: "file_name", "argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"

In [110]:
# Collapse components to one row per file: every component field becomes a list.
# sort=False keeps the files in the order they first appear in components_df.
component_fields = ["argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"]
comp_grouped_df = (
    components_df
    .groupby(["file_name"], sort=False)
    .agg({field: list for field in component_fields})
    .reset_index()
)

In [111]:
# Inspect the grouped components: one row per file_name, each field holding a list.
comp_grouped_df

Unnamed: 0,file_name,argument_id,argument_type,arg_bound_1,arg_bound2,argument_component
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...
...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...


In [112]:
# Collapse relations to one row per file: every relation field becomes a list.
# sort=False keeps the files in the order they first appear in relations_df.
relation_fields = ["source_arg", "target_arg", "relation_type"]
rels_grouped_df = (
    relations_df
    .groupby(["file_name"], sort=False)
    .agg({field: list for field in relation_fields})
    .reset_index()
)

# source_arg	target_arg	relation_type

In [113]:
# Inspect the grouped relations: one row per file_name, each field holding a list.
rels_grouped_df

Unnamed: 0,file_name,source_arg,target_arg,relation_type
0,essay001.ann,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo..."
1,essay002.ann,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo..."
2,essay003.ann,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo..."
3,essay004.ann,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo..."
4,essay005.ann,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support..."
...,...,...,...,...
397,essay398.ann,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo..."
398,essay399.ann,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo..."
399,essay400.ann,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo..."
400,essay401.ann,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo..."


### MERGE

In [114]:
# Join the three per-file tables on file_name. These are inner joins, so a file must
# appear in all three tables to be kept.
# validate="one_to_one" makes pandas raise MergeError when a table has repeated
# file_name keys, instead of silently multiplying rows.
merged_df = (
    comp_grouped_df
    .merge(rels_grouped_df, on="file_name", validate="one_to_one")
    .merge(essays_df, on="file_name", validate="one_to_one")
)

In [115]:
# Inspect the merged table: grouped components, grouped relations and essay text per file_name.
merged_df

Unnamed: 0,file_name,argument_id,argument_type,arg_bound_1,arg_bound2,argument_component,source_arg,target_arg,relation_type,essay_title,essay_text
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...",Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...",Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


In [122]:
# List the merged column names before they are renamed.
merged_df.columns

Index(['file_name', 'argument_id', 'argument_type', 'arg_bound_1',
       'arg_bound2', 'argument_component', 'source_arg', 'target_arg',
       'relation_type', 'essay_title', 'essay_text'],
      dtype='object')

In [123]:
# Rename by column name rather than by position, so a change in column order cannot
# mislabel the data. essay_title and essay_text keep their names.
# Re-running this cell is a no-op because rename ignores labels that are absent.
merged_df = merged_df.rename(columns={
    "file_name": "ann_file_name",
    "argument_id": "argument_ids",
    "argument_type": "argument_types",
    "arg_bound_1": "arg_start_bounds",
    "arg_bound2": "arg_end_bounds",
    "argument_component": "argument_components",
    "source_arg": "source_args",
    "target_arg": "target_args",
    "relation_type": "relation_types",
})

In [124]:
# Inspect the merged table again to confirm the renamed columns.
merged_df

Unnamed: 0,ann_file_name,argument_ids,argument_types,arg_start_bounds,arg_end_bounds,argument_components,source_args,target_args,relation_types,essay_title,essay_text
0,essay001.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[503, 2154, 591, 716, 853, 1088, 1332, 1212, 1...","[575, 2231, 714, 851, 1086, 1191, 1376, 1301, ...",[we should attach more importance to cooperati...,"[T4, T5, T6, T10, T9, T8]","[T3, T3, T3, T11, T11, T7]","[supports, supports, supports, supports, suppo...",Should students be taught to compete or to coo...,It is always said that competition can effecti...
1,essay002.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Premise, Claim, Premi...","[391, 1936, 500, 1089, 626, 841, 948, 1168, 12...","[489, 2077, 624, 1156, 839, 946, 1057, 1204, 1...",[they are able to sustain their cultural ident...,"[T9, T11, T12, T13, T5, T6, T7, T3, T8]","[T8, T10, T10, T10, T4, T4, T4, T4, T10]","[supports, supports, supports, supports, suppo...",More people are migrating to other countries t...,The last 50 years have seen an increasing numb...
2,essay003.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[352, 1568, 955, 487, 615, 786, 1024, 1454, 11...","[475, 1701, 1012, 613, 784, 943, 1135, 1526, 1...",[it has contributed to the economic developmen...,"[T10, T9, T6, T5, T4, T7]","[T8, T8, T3, T3, T3, T8]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last decade has seen an increasing number ...
3,essay004.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11]","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[262, 1663, 179, 953, 1578, 417, 532, 820, 107...","[376, 1758, 239, 1031, 1624, 530, 818, 924, 11...",[this industry has affected the cultural attri...,"[T8, T7, T6, T10, T9, T11]","[T4, T4, T4, T11, T5, T5]","[supports, supports, supports, supports, suppo...",International tourism is now more common than ...,The last 50 years have seen a significant incr...
4,essay005.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[MajorClaim, MajorClaim, Claim, Claim, Claim, ...","[286, 1535, 388, 775, 1330, 566, 885, 993, 109...","[359, 1652, 564, 883, 1425, 765, 988, 1078, 11...",[one who studies overseas will gain many skill...,"[T6, T7, T8, T9, T10, T11]","[T3, T4, T7, T4, T5, T5]","[supports, attacks, attacks, supports, support...",Living and studying overseas,It is every student's desire to study at a goo...
...,...,...,...,...,...,...,...,...,...,...,...
397,essay398.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,...","[Claim, MajorClaim, Premise, Premise, Premise,...","[254, 1751, 409, 557, 647, 778, 915, 333, 1260...","[318, 1951, 541, 645, 767, 897, 1035, 407, 134...",[many different characters exist between male ...,"[T3, T4, T5, T6, T7, T13, T14, T11, T9, T15, T16]","[T8, T8, T8, T8, T8, T12, T12, T10, T10, T10, ...","[supports, supports, supports, supports, suppo...",We can not forcedly put the same numbers of ma...,There is a view that universities and colleges...
398,essay399.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[367, 1323, 527, 431, 661, 787, 966, 750, 1086...","[429, 1390, 645, 525, 737, 951, 1073, 785, 117...",[this is a worrying tread which has negative i...,"[T4, T5, T6, T8, T10]","[T3, T3, T7, T7, T9]","[supports, supports, supports, supports, suppo...","Drugs, alcohol and messy sex lives","Celebrities, for example movie stars, are usua..."
399,essay400.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Claim, Premise, Premi...","[274, 1619, 736, 541, 805, 389, 1112, 1233, 14...","[387, 1812, 796, 723, 944, 524, 1220, 1388, 16...",[governments should devote a greater portion o...,"[T4, T5, T6, T7, T9, T8]","[T3, T3, T3, T8, T8, T10]","[supports, supports, supports, supports, suppo...",A greater proportion of the budget should be a...,"In today's world, the concept of the welfare s..."
400,essay401.ann,"[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10]","[MajorClaim, MajorClaim, Premise, Premise, Cla...","[311, 1496, 502, 622, 386, 954, 1103, 818, 128...","[384, 1667, 607, 815, 488, 1087, 1278, 937, 13...",[fatherhood is a as vital part of a healty par...,"[T3, T7, T10, T4, T9, T6]","[T5, T6, T8, T3, T8, T8]","[supports, supports, supports, supports, suppo...",Fatherhood should be as present as motherhood ...,"In today's world, having and raising children ..."


#### sanity checks

In [126]:
def egal_1(row):
    """Return 1 when the five per-component list fields of ``row`` all have the same length, else 0.

    ``row`` must expose ``argument_ids``, ``argument_types``, ``arg_start_bounds``,
    ``arg_end_bounds`` and ``argument_components`` as attributes that support ``len``.
    """
    expected = len(row.argument_ids)
    # Compare every other field against the first; stop at the first mismatch.
    for field in ("argument_types", "arg_start_bounds", "arg_end_bounds", "argument_components"):
        if len(getattr(row, field)) != expected:
            return 0
    return 1

In [127]:
# Flag each row: 1 if its component list fields have matching lengths, 0 otherwise.
merged_df["egal_1"] = merged_df.apply(egal_1, axis=1)

In [128]:
# Tally the flags; every row should fall under 1 (a 0 would mean misaligned component lists).
merged_df['egal_1'].value_counts()

egal_1
1    402
Name: count, dtype: int64

In [129]:
def egal_2(row):
    """Return 1 when the three per-relation list fields of ``row`` all have the same length, else 0.

    ``row`` must expose ``source_args``, ``target_args`` and ``relation_types``
    as attributes that support ``len``.
    """
    expected = len(row.source_args)
    # Compare the remaining fields against the first; stop at the first mismatch.
    for field in ("target_args", "relation_types"):
        if len(getattr(row, field)) != expected:
            return 0
    return 1

In [130]:
# Flag each row: 1 if its relation list fields have matching lengths, 0 otherwise.
merged_df["egal_2"] = merged_df.apply(egal_2, axis=1)

In [131]:
# Tally the flags; every row should fall under 1 (a 0 would mean misaligned relation lists).
merged_df['egal_2'].value_counts()

egal_2
1    402
Name: count, dtype: int64