In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm

In [2]:
train_ann_files_directory = "/Utilisateurs/umushtaq/am_reasoning/abstRCT/raw_files/train/neoplasm_train"
dev_ann_files_directory = "/Utilisateurs/umushtaq/am_reasoning/abstRCT/raw_files/dev/neoplasm_dev"
neo_test_ann_files_directory = "/Utilisateurs/umushtaq/am_reasoning/abstRCT/raw_files/test/neoplasm_test"
mix_test_ann_files_directory = "/Utilisateurs/umushtaq/am_reasoning/abstRCT/raw_files/test/mixed_test"
gla_test_ann_files_directory = "/Utilisateurs/umushtaq/am_reasoning/abstRCT/raw_files/test/glaucoma_test"
#text_files_directory = "/Utilisateurs/umushtaq/am_reasoning/raw_files/txt_files"

In [3]:
# train_data = []
# dev_data = []
# neo_test_data = []
# mix_test_data = []
# gla_test_data = []

In [49]:
def process_files(dir):
    
    component_data = []
    relations_data = []
    
    for filename in tqdm([f for f in os.listdir(dir) if f.endswith('.ann')], desc="Parsing annotation files for components ..."):
    
        #print("processing: " + filename)
        #if filename.endswith(".ann"):  # Process only text files
        file_path = os.path.join(dir, filename)
        
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                parts = line.strip().split("\t")
                #print(parts)
                # Process "T" lines (claims/premises)
                if line.startswith("T"):
                    t_id = parts[0]  # Extract T ID (e.g., T1)
                    t_type = parts[1].split(" ")[0]  # Extract type (e.g., MajorClaim, Claim, Premise)
                    t_s_bound = parts[1].split(" ")[1]
                    t_e_bound = parts[1].split(" ")[2]
                    text = parts[2] #if len(parts) > 3 else ""  # Extract text if available
                    component_data.append(["T", filename, t_id, t_type, t_s_bound, t_e_bound, text])
                    
    for filename in tqdm([f for f in os.listdir(dir) if f.endswith('.ann')], desc="Parsing annotation files for relations ... "):
    
        #print("processing: " + filename)
        #if filename.endswith(".ann"):  # Process only text files
        file_path = os.path.join(dir, filename)
        
        with open(file_path, "r", encoding="utf-8") as file:
            has_relation = False
            for line in file:
                parts = line.strip().split("\t")
                #print(parts)
                # Process "T" lines (claims/premises)
                if line.startswith("R"):
                    has_relation = True
                    parts = line.strip().split()
                    relation_type = parts[1]  # "supports" or "attacks"
                    arg1 = parts[2].split(":")[1]  # Extract T value from Arg1
                    arg2 = parts[3].split(":")[1]  # Extract T value from Arg2
                    relations_data.append(["R", filename, arg1, arg2, relation_type])
                    
            if not has_relation:
                relations_data.append(["R", filename, None, None, None])
                    
    return component_data, relations_data

In [50]:
train_component_data, train_relations_data = process_files(train_ann_files_directory)

Parsing annotation files for components ...:   0%|          | 0/350 [00:00<?, ?it/s]

Parsing annotation files for components ...: 100%|██████████| 350/350 [00:00<00:00, 1404.74it/s]
Parsing annotation files for relations ... : 100%|██████████| 350/350 [00:00<00:00, 1353.62it/s]


In [51]:
dev_component_data, dev_relations_data = process_files(dev_ann_files_directory)

Parsing annotation files for components ...: 100%|██████████| 50/50 [00:00<00:00, 1434.53it/s]
Parsing annotation files for relations ... :   0%|          | 0/50 [00:00<?, ?it/s]

Parsing annotation files for relations ... : 100%|██████████| 50/50 [00:00<00:00, 1498.98it/s]


In [52]:
neo_test_component_data, neo_test_relations_data = process_files(neo_test_ann_files_directory)

Parsing annotation files for components ...:   0%|          | 0/100 [00:00<?, ?it/s]

Parsing annotation files for components ...: 100%|██████████| 100/100 [00:00<00:00, 1443.17it/s]
Parsing annotation files for relations ... : 100%|██████████| 100/100 [00:00<00:00, 1381.46it/s]


In [53]:
mix_test_component_data, mix_test_relations_data = process_files(mix_test_ann_files_directory)

Parsing annotation files for components ...: 100%|██████████| 100/100 [00:00<00:00, 1279.43it/s]
Parsing annotation files for relations ... : 100%|██████████| 100/100 [00:00<00:00, 1443.24it/s]


In [54]:
gla_test_component_data, gla_test_relations_data = process_files(gla_test_ann_files_directory)

Parsing annotation files for components ...: 100%|██████████| 100/100 [00:00<00:00, 1275.84it/s]
Parsing annotation files for relations ... : 100%|██████████| 100/100 [00:00<00:00, 1090.69it/s]


In [55]:
df = pd.DataFrame(train_component_data)

In [56]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,T,10561201.ann,T1,MajorClaim,1,162,A combination of mitoxantrone plus prednisone ...
1,T,10561201.ann,T2,Premise,992,1058,"At 6 weeks, both groups showed improvement in ..."
2,T,10561201.ann,T3,Premise,1063,1187,only physical functioning and pain were better...
3,T,10561201.ann,T4,Premise,1188,1447,"After 6 weeks, patients taking prednisone show..."
4,T,10561201.ann,T6,Premise,1452,1568,the improvement (> 10 units on a scale of 0 to...
...,...,...,...,...,...,...,...
2262,T,9890172.ann,T2,Premise,1260,1450,Vinorelbine-treated patients scored better tha...
2263,T,9890172.ann,T5,Premise,1451,1563,There was a statistically significant (two-sid...
2264,T,9890172.ann,T6,Premise,1564,1643,median survival increased from 21 to 28 weeks ...
2265,T,9890172.ann,T7,Premise,1644,1736,The relative hazard of death for vinorelbine-t...


In [57]:
df.columns = ["type", "filename", "ac_id", "ac_type", "ac_start_bound", "ac_end_bound", "ac"]

In [58]:
len(df.filename.unique().tolist())

350

In [59]:
train_gdf = df.groupby(["filename"], sort=False).agg({
    'type': list,
    'ac_id': list,
    "ac_type": list,
    'ac_start_bound': list,
    'ac_end_bound': list,
    'ac': list,
}).reset_index()

In [60]:
train_gdf

Unnamed: 0,filename,type,ac_id,ac_type,ac_start_bound,ac_end_bound,ac
0,10561201.ann,"[T, T, T, T, T, T, T]","[T1, T2, T3, T4, T6, T7, T8]","[MajorClaim, Premise, Premise, Premise, Premis...","[1, 992, 1063, 1188, 1452, 1569, 1769]","[162, 1058, 1187, 1447, 1568, 1768, 1945]",[A combination of mitoxantrone plus prednisone...
1,10561203.ann,"[T, T, T, T, T, T, T]","[T1, T3, T4, T5, T6, T7, T8]","[Claim, Premise, Premise, Premise, Premise, Pr...","[1, 1354, 1443, 1593, 1828, 1979, 2056]","[318, 1442, 1592, 1827, 1978, 2055, 2167]",[In endocrine therapy trials in advanced breas...
2,10653877.ann,"[T, T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8, T9]","[Claim, Premise, Premise, Premise, Premise, Pr...","[1, 957, 1204, 1297, 1465, 1612, 1865, 1938, 2...","[156, 1203, 1296, 1464, 1611, 1864, 1928, 2001...",[Treatment with cisplatin-based chemotherapy p...
3,10675381.ann,"[T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8]","[MajorClaim, Premise, Premise, Premise, Premis...","[218, 1028, 1209, 1395, 1580, 1647, 1844, 2193]","[449, 1208, 1394, 1579, 1646, 1843, 2152, 2288]",[In nonrandomized studies involving patients w...
4,10735887.ann,"[T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7]","[Premise, Premise, Premise, Premise, Claim, Pr...","[552, 678, 758, 1102, 1196, 1228, 1306]","[677, 757, 1101, 1195, 1227, 1305, 1555]",[Overall objective response (OR) rates were hi...
...,...,...,...,...,...,...,...
345,9807987.ann,"[T, T, T, T, T]","[T1, T2, T3, T4, T5]","[Premise, Premise, Premise, Premise, Claim]","[812, 1026, 1140, 1342, 1469]","[1025, 1139, 1341, 1468, 1760]","[With a median follow-up of 13 months, the ove..."
346,9849452.ann,"[T, T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8, T9]","[Premise, Premise, Premise, Premise, Premise, ...","[983, 1124, 1187, 1250, 1327, 1349, 1503, 1624...","[1123, 1186, 1249, 1326, 1348, 1502, 1623, 166...",[Patients in the chemotherapy group reported b...
347,9849454.ann,"[T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6]","[Premise, Premise, Premise, Premise, Claim, Cl...","[1149, 1215, 1381, 1507, 2258, 2523]","[1214, 1380, 1506, 1699, 2522, 2655]",[Complete remission was achieved in 91% (170/1...
348,9850014.ann,"[T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6]","[Premise, Premise, Premise, Premise, Premise, ...","[891, 1041, 1139, 1264, 1357, 1578]","[1040, 1138, 1263, 1356, 1577, 1770]",[An objective response (complete [CR] or parti...


In [61]:
r_df = pd.DataFrame(train_relations_data)

In [62]:
r_df

Unnamed: 0,0,1,2,3,4
0,R,10561201.ann,T8,T1,Support
1,R,10561201.ann,T2,T8,Support
2,R,10561201.ann,T3,T8,Support
3,R,10561201.ann,T6,T8,Support
4,R,10561201.ann,T4,T8,Support
...,...,...,...,...,...
1420,R,9890172.ann,T8,T1,Support
1421,R,9890172.ann,T5,T8,Support
1422,R,9890172.ann,T6,T8,Support
1423,R,9890172.ann,T2,T8,Support


In [63]:
r_df.columns = ["type", "filename", "source", "target", "relation_type"]

In [64]:
len(r_df.filename.unique().tolist())

350

In [65]:
x= df.filename.unique().tolist()
y = r_df.filename.unique().tolist()

In [66]:
[elem for elem in x if elem not in y]

[]

In [67]:
rtrain_gdf = r_df.groupby(["filename"], sort=False).agg({
    'type': list,
    'source': list,
    "target": list,
    'relation_type': list,
}).reset_index()

In [68]:
rtrain_gdf

Unnamed: 0,filename,type,source,target,relation_type
0,10561201.ann,"[R, R, R, R, R, R]","[T8, T2, T3, T6, T4, T7]","[T1, T8, T8, T8, T8, T1]","[Support, Support, Support, Support, Support, ..."
1,10561203.ann,"[R, R, R, R, R]","[T7, T3, T4, T5, T6]","[T8, T8, T8, T8, T8]","[Partial-Attack, Support, Support, Support, Pa..."
2,10653877.ann,"[R, R, R, R, R]","[T8, T6, T2, T4, T5]","[T9, T9, T9, T9, T9]","[Support, Support, Support, Support, Support]"
3,10675381.ann,"[R, R, R, R, R]","[T2, T4, T5, T6, T7]","[T8, T8, T8, T8, T8]","[Support, Support, Support, Support, Support]"
4,10735887.ann,"[R, R, R, R]","[T1, T3, T4, T5]","[T7, T7, T7, T7]","[Support, Support, Support, Support]"
...,...,...,...,...,...
345,9807987.ann,"[R, R, R]","[T4, T3, T1]","[T5, T5, T5]","[Support, Support, Support]"
346,9849452.ann,"[R, R, R, R, R]","[T5, T1, T3, T6, T7]","[T4, T9, T2, T9, T9]","[Partial-Attack, Support, Partial-Attack, Supp..."
347,9849454.ann,"[R, R, R, R]","[T1, T4, T3, T2]","[T6, T6, T5, T5]","[Support, Support, Support, Support]"
348,9850014.ann,"[R, R, R]","[T4, T5, T3]","[T6, T6, T6]","[Support, Support, Support]"


In [69]:
train_df = train_gdf.merge(rtrain_gdf, on="filename")

In [70]:
train_df

Unnamed: 0,filename,type_x,ac_id,ac_type,ac_start_bound,ac_end_bound,ac,type_y,source,target,relation_type
0,10561201.ann,"[T, T, T, T, T, T, T]","[T1, T2, T3, T4, T6, T7, T8]","[MajorClaim, Premise, Premise, Premise, Premis...","[1, 992, 1063, 1188, 1452, 1569, 1769]","[162, 1058, 1187, 1447, 1568, 1768, 1945]",[A combination of mitoxantrone plus prednisone...,"[R, R, R, R, R, R]","[T8, T2, T3, T6, T4, T7]","[T1, T8, T8, T8, T8, T1]","[Support, Support, Support, Support, Support, ..."
1,10561203.ann,"[T, T, T, T, T, T, T]","[T1, T3, T4, T5, T6, T7, T8]","[Claim, Premise, Premise, Premise, Premise, Pr...","[1, 1354, 1443, 1593, 1828, 1979, 2056]","[318, 1442, 1592, 1827, 1978, 2055, 2167]",[In endocrine therapy trials in advanced breas...,"[R, R, R, R, R]","[T7, T3, T4, T5, T6]","[T8, T8, T8, T8, T8]","[Partial-Attack, Support, Support, Support, Pa..."
2,10653877.ann,"[T, T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8, T9]","[Claim, Premise, Premise, Premise, Premise, Pr...","[1, 957, 1204, 1297, 1465, 1612, 1865, 1938, 2...","[156, 1203, 1296, 1464, 1611, 1864, 1928, 2001...",[Treatment with cisplatin-based chemotherapy p...,"[R, R, R, R, R]","[T8, T6, T2, T4, T5]","[T9, T9, T9, T9, T9]","[Support, Support, Support, Support, Support]"
3,10675381.ann,"[T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8]","[MajorClaim, Premise, Premise, Premise, Premis...","[218, 1028, 1209, 1395, 1580, 1647, 1844, 2193]","[449, 1208, 1394, 1579, 1646, 1843, 2152, 2288]",[In nonrandomized studies involving patients w...,"[R, R, R, R, R]","[T2, T4, T5, T6, T7]","[T8, T8, T8, T8, T8]","[Support, Support, Support, Support, Support]"
4,10735887.ann,"[T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7]","[Premise, Premise, Premise, Premise, Claim, Pr...","[552, 678, 758, 1102, 1196, 1228, 1306]","[677, 757, 1101, 1195, 1227, 1305, 1555]",[Overall objective response (OR) rates were hi...,"[R, R, R, R]","[T1, T3, T4, T5]","[T7, T7, T7, T7]","[Support, Support, Support, Support]"
...,...,...,...,...,...,...,...,...,...,...,...
345,9807987.ann,"[T, T, T, T, T]","[T1, T2, T3, T4, T5]","[Premise, Premise, Premise, Premise, Claim]","[812, 1026, 1140, 1342, 1469]","[1025, 1139, 1341, 1468, 1760]","[With a median follow-up of 13 months, the ove...","[R, R, R]","[T4, T3, T1]","[T5, T5, T5]","[Support, Support, Support]"
346,9849452.ann,"[T, T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8, T9]","[Premise, Premise, Premise, Premise, Premise, ...","[983, 1124, 1187, 1250, 1327, 1349, 1503, 1624...","[1123, 1186, 1249, 1326, 1348, 1502, 1623, 166...",[Patients in the chemotherapy group reported b...,"[R, R, R, R, R]","[T5, T1, T3, T6, T7]","[T4, T9, T2, T9, T9]","[Partial-Attack, Support, Partial-Attack, Supp..."
347,9849454.ann,"[T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6]","[Premise, Premise, Premise, Premise, Claim, Cl...","[1149, 1215, 1381, 1507, 2258, 2523]","[1214, 1380, 1506, 1699, 2522, 2655]",[Complete remission was achieved in 91% (170/1...,"[R, R, R, R]","[T1, T4, T3, T2]","[T6, T6, T5, T5]","[Support, Support, Support, Support]"
348,9850014.ann,"[T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6]","[Premise, Premise, Premise, Premise, Premise, ...","[891, 1041, 1139, 1264, 1357, 1578]","[1040, 1138, 1263, 1356, 1577, 1770]",[An objective response (complete [CR] or parti...,"[R, R, R]","[T4, T5, T3]","[T6, T6, T6]","[Support, Support, Support]"


In [115]:
# for filename in tqdm([f for f in os.listdir(train_ann_files_directory) if f.endswith('.ann')], desc="Parsing annotation files for components ..."):
    
#     print("processing: " + filename)
#     #if filename.endswith(".ann"):  # Process only text files
#     file_path = os.path.join(train_ann_files_directory, filename)
    
#     with open(file_path, "r", encoding="utf-8") as file:
#         for line in file:
#             parts = line.strip().split("\t")
#             #print(parts)
#             # Process "T" lines (claims/premises)
#             if line.startswith("T"):
#                 t_id = parts[0]  # Extract T ID (e.g., T1)
#                 t_type = parts[1].split(" ")[0]  # Extract type (e.g., MajorClaim, Claim, Premise)
#                 t_s_bound = parts[1].split(" ")[1]
#                 t_e_bound = parts[1].split(" ")[2]
#                 text = parts[2] #if len(parts) > 3 else ""  # Extract text if available
#                 train_data.append(["T", filename, t_id, t_type, t_s_bound, t_e_bound, text])

In [26]:
# for filename in tqdm(os.listdir(ann_files_directory), desc="Parsing annotation files for relations ... "):
    
#     print("processing: " + filename)
#     #if filename.endswith(".ann"):  # Process only text files
#     file_path = os.path.join(ann_files_directory, filename)
    
#     with open(file_path, "r", encoding="utf-8") as file:
#         for line in file:
#             parts = line.strip().split("\t")
#             #print(parts)
#             # Process "T" lines (claims/premises)
#             if line.startswith("R"):
#                 parts = line.strip().split()
#                 relation_type = parts[1]  # "supports" or "attacks"
#                 arg1 = parts[2].split(":")[1]  # Extract T value from Arg1
#                 arg2 = parts[3].split(":")[1]  # Extract T value from Arg2
#                 relations_data.append(["R", filename, arg1, arg2, relation_type])

In [27]:
# for filename in tqdm(os.listdir(ann_files_directory), desc="Parsing annotation files for stances ... "):
    
#     print("processing: " + filename)
#     #if filename.endswith(".ann"):  # Process only text files
#     file_path = os.path.join(ann_files_directory, filename)
    
#     with open(file_path, "r", encoding="utf-8") as file:
#         for line in file:
#             parts = line.strip().split("\t")
#             #print(parts)
#             # Process "T" lines (claims/premises)
#             if line.startswith("A"):
#                 parts = line.strip().split()
#                 relation_type = parts[-1]  # "supports" or "attacks"
#                 #arg1 = parts[2].split(":")[1]  # Extract T value from Arg1
#                 #arg2 = parts[3].split(":")[1]  # Extract T value from Arg2
#                 stances_data.append(["A", filename, relation_type])

In [28]:
# for filename in tqdm(os.listdir(text_files_directory), desc="Parsing text files for esssays ... "):
    
#     print("processing: " + filename)
#     #if filename.endswith(".ann"):  # Process only text files
#     file_path = os.path.join(text_files_directory, filename)
#     filename_ann = filename.replace(".txt", ".ann")
    
#     with open(file_path, "r", encoding="utf-8") as file:
        
#         lines = file.readlines()
#         essay_title = lines[0].strip()
#         essay_text = "".join(lines[1:]).strip()
        
#         essays_data.append([filename.replace(".txt", ".ann"), essay_title, essay_text])

In [29]:
# components_df = pd.DataFrame(components_data)
# relations_df = pd.DataFrame(relations_data)
# stances_df = pd.DataFrame(stances_data)
# essays_df = pd.DataFrame(essays_data)

In [30]:
# essays_df

In [31]:
# components_df = components_df.drop(columns=components_df.columns[0])
# components_df.columns = ["file_name", "argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"]

In [32]:
# relations_df = relations_df.drop(columns=relations_df.columns[0])
# relations_df.columns = ["file_name", "source_arg", "target_arg", "relation_type"]

In [33]:
# stances_df = stances_df.drop(columns=stances_df.columns[0])
# stances_df.columns = ["file_name", "stance_type"]

In [34]:
# essays_df.columns = ["file_name", "essay_title", "essay_text"]

<!-- ### Group and Merge -->

In [35]:
# comp_grouped_df = components_df.groupby(['file_name'], sort=False).agg({
#     'argument_id': list,
#     'argument_type': list,
#     'arg_bound_1': list,
#     'arg_bound2': list,
#     'argument_component': list,
# }).reset_index()

In [36]:
# rels_grouped_df = relations_df.groupby(['file_name'], sort=False).agg({
#     'source_arg': list,
#     'target_arg': list,
#     'relation_type': list,
# }).reset_index()

In [37]:
# stance_grouped_df = stances_df.groupby(['file_name'], sort=False).agg({
#     'stance_type': list,
# }).reset_index()

In [38]:
# pe_df = comp_grouped_df.merge(rels_grouped_df, on="file_name").merge(stance_grouped_df, on="file_name").merge(essays_df, on="file_name")

In [39]:
# pe_df.columns = ['ann_file_name', 'argument_ids', 'argument_types', 'arg_start_bounds',
#        'arg_end_bounds', 'argument_components', 'source_args', 'target_args',
#        'relation_types', 'stance_types', 'essay_title', 'essay_text']

In [40]:
# pe_df

In [41]:
# def claim_sanity(row):
    
#     return 1 if len([elem for elem in row.argument_types if elem == "Claim"]) == len(row.stance_types) else 0

In [42]:
# pe_df['sanity_claims'] = pe_df.apply(lambda row: claim_sanity(row), axis=1)

In [43]:
# pe_df['sanity_claims'].value_counts()

In [44]:
## Sanity Checks

In [45]:
# def is_ordered(row):
#     """Checks if the list is in the correct order."""
#     extracted_numbers = [int(re.search(r'\d+', item).group()) for item in row.argument_ids] # type: ignore
#     return extracted_numbers == sorted(extracted_numbers)

In [46]:
# pe_df['sanity_check_1'] = pe_df.apply(lambda row: is_ordered(row), axis=1)

In [47]:
# pe_df['sanity_check_1'].value_counts()

In [48]:
# pe_df[pe_df['sanity_check_1'] == False]

In [49]:
# def get_source_acs(row):
    
#     arg_ids = row.argument_ids    
#     source_ids = row.source_args
#     acs = row.argument_components
    
#     source_acs = []
    
#     for id in source_ids:
        
#         idx = arg_ids.index(id)
#         source_acs.append(acs[idx])
        
#     return source_acs

In [50]:
# get_source_acs(pe_df.iloc[0])

In [51]:
# def get_target_acs(row):
    
#     arg_ids = row.argument_ids    
#     target_ids = row.target_args
#     acs = row.argument_components
    
#     target_acs = []
    
#     for id in target_ids:
        
#         idx = arg_ids.index(id)
#         target_acs.append(acs[idx])
        
#     return target_acs

In [52]:
# get_target_acs(pe_df.iloc[0])

In [53]:
# pe_df.iloc[0]

In [54]:
# pe_df.iloc[0]['argument_components']

In [55]:
# pe_df['source_acs'] = pe_df.apply(lambda row: get_source_acs(row), axis=1)

In [56]:
# pe_df['target_acs'] = pe_df.apply(lambda row: get_target_acs(row), axis=1)

In [57]:
# pe_df

In [58]:
# def build_pairs(row):
    
#     source_acs = row.source_acs
#     target_acs = row.target_acs
#     essay_acs_l = row.argument_components
#     stances_l = row.relation_types
    
#     pairs = []
    
#     for s_ac, t_ac, stance in zip(source_acs, target_acs, stances_l):
        
#         pairs.append((essay_acs_l.index(s_ac)+1, essay_acs_l.index(t_ac)+1, stance))
        
#     return pairs

In [59]:
# build_pairs(pe_df.iloc[370])

In [60]:
# pe_df['relation_pairs'] = pe_df.apply(lambda row: build_pairs(row), axis=1)

In [61]:
# sum(pe_df.relation_pairs.apply(len))

In [62]:
# sum(pe_df.relation_types.apply(len))

In [63]:
# sum(pe_df.stance_types.apply(len))

In [64]:
# pe_df

In [65]:
# def c_mc_pairs(row):
    
#     arg_types = row.argument_types
#     stances = row.stance_types
    
#     pairs = [(i+1, j+1) for i, _ in enumerate(arg_types) for j, _ in enumerate(arg_types) if (arg_types[i] == "Claim" and arg_types[j] == "MajorClaim")]
    
#     triplets = []
#     i = 0
    
#     stance = stances[i]
#     source = pairs[0][0]
    
#     for pair in pairs:
        
#         new_source = pair[0]
#         if new_source != source:
#             i+=1
#             stance = stances[i]
        
#         #print((pair, stance))
#         triplets.append((pair[0], pair[1], stance))
#         source = new_source
    
#     return triplets
#     #triplets = [(pair, stance) for pair in pairs for idx, stance in enumerate(stances) if [elem for elem in arg_types if elem == "Claim"][idx]]

In [66]:
# c_mc_pairs(pe_df.iloc[0])

In [67]:
# pe_df.iloc[0]

In [68]:
# pe_df['stance_pairs'] = pe_df.apply(lambda row: c_mc_pairs(row), axis=1)

In [69]:
# pe_df

In [70]:
# pe_df.columns

In [71]:
# pe_df = pe_df.drop(columns=['sanity_claims', 'sanity_check_1'])

In [72]:
# split_df = pd.read_csv("/Utilisateurs/umushtaq/am_reasoning/data_files/train-test-split.csv", delimiter=";")

In [73]:
# split_df

In [74]:
# pe_df['split'] = split_df.SET

In [75]:
# pe_df.to_csv("/Utilisateurs/umushtaq/am_reasoning/data_files/pe_dataset_final.csv")

In [76]:
# 

In [77]:
# 

In [78]:
# 

In [79]:
# 

In [80]:
# 

In [81]:
# 

In [82]:
# components_df

In [83]:
# components_df = components_df.drop(columns=components_df.columns[0])

In [84]:
# 

In [85]:
# 

In [86]:
# 

In [87]:
# 

In [88]:
# relations_df

In [89]:
# relations_df = relations_df.drop(columns=relations_df.columns[0])

In [90]:
# relations_df.columns = ["file_name", "source_arg", "target_arg", "relation_type"]

In [91]:
# 

In [92]:
# 

In [93]:
# for filename in tqdm(os.listdir(text_files_directory), desc="Parsing text files"):
    
#     print("processing: " + filename)
#     #if filename.endswith(".ann"):  # Process only text files
#     file_path = os.path.join(text_files_directory, filename)
#     filename_ann = filename.replace(".txt", ".ann")
    
#     with open(file_path, "r", encoding="utf-8") as file:
        
#         lines = file.readlines()
#         essay_title = lines[0].strip()
#         essay_text = "".join(lines[1:]).strip()
        
#         essays_data.append([filename.replace(".txt", ".ann"), essay_title, essay_text])

In [94]:
# essays_df = pd.DataFrame(essays_data)

In [95]:
# essays_df

In [96]:
# essays_df.columns = ["file_name", "essay_title", "essay_text"]

In [97]:
# essays_df

<!-- ### Group dfs and then merge -->

In [98]:
"file_name", "argument_id", "argument_type", "arg_bound_1", "arg_bound2", "argument_component"

('file_name',
 'argument_id',
 'argument_type',
 'arg_bound_1',
 'arg_bound2',
 'argument_component')

In [99]:
# comp_grouped_df = components_df.groupby(['file_name'], sort=False).agg({
#     'argument_id': list,
#     'argument_type': list,
#     'arg_bound_1': list,
#     'arg_bound2': list,
#     'argument_component': list,
# }).reset_index()

In [100]:
# comp_grouped_df

In [101]:
# rels_grouped_df = relations_df.groupby(['file_name'], sort=False).agg({
#     'source_arg': list,
#     'target_arg': list,
#     'relation_type': list,
# }).reset_index()

# # source_arg	target_arg	relation_type

In [102]:
# rels_grouped_df

<!-- ### MERGE -->

In [103]:
# merged_df = comp_grouped_df.merge(rels_grouped_df, on="file_name").merge(essays_df, on="file_name")

In [104]:
# merged_df

In [105]:
# merged_df.columns

In [106]:
# merged_df.columns = ['ann_file_name', 'argument_ids', 'argument_types', 'arg_start_bounds',
#        'arg_end_bounds', 'argument_components', 'source_args', 'target_args',
#        'relation_types', 'essay_title', 'essay_text']

In [107]:
# merged_df

<!-- #### sanity checks -->

In [108]:
# def egal_1(row):
    
#     return 1 if (len(row.argument_ids) == len(row.argument_types) == len(row.arg_start_bounds) == len(row.arg_end_bounds) == len(row.argument_components)) else 0

In [109]:
# merged_df['egal_1'] = merged_df.apply(lambda row: egal_1(row), axis=1)

In [110]:
# merged_df['egal_1'].value_counts()

In [111]:
# def egal_2(row):
    
#     return 1 if (len(row.source_args) == len(row.target_args) == len(row.relation_types)) else 0

In [112]:
# merged_df['egal_2'] = merged_df.apply(lambda row: egal_2(row), axis=1)

In [113]:
# merged_df['egal_2'].value_counts()

In [114]:
# 