In [69]:
import os
import pandas as pd

from tqdm import tqdm
from pathlib import Path

### Paths

In [2]:
# Root of the raw annotation files; every split lives in its own sub-directory.
RAW_FILES_ROOT = "/Utilisateurs/umushtaq/am_reasoning/abstRCT/raw_files"

train_files_directory = f"{RAW_FILES_ROOT}/train/neoplasm_train"
dev_files_directory = f"{RAW_FILES_ROOT}/dev/neoplasm_dev"
neo_test_files_directory = f"{RAW_FILES_ROOT}/test/neoplasm_test"
mix_test_files_directory = f"{RAW_FILES_ROOT}/test/mixed_test"
gla_test_files_directory = f"{RAW_FILES_ROOT}/test/glaucoma_test"

### Process ANN files

In [3]:
# def process_ann_files(dir):
    
#     component_data = []
#     relations_data = []
    
#     for filename in tqdm([f for f in os.listdir(dir) if f.endswith('.ann')], desc="Parsing annotation files for components ..."):
    
#         #print("processing: " + filename)
#         #if filename.endswith(".ann"):  # Process only text files
#         file_path = os.path.join(dir, filename)
        
#         with open(file_path, "r", encoding="utf-8") as file:
#             for line in file:
#                 parts = line.strip().split("\t")
#                 #print(parts)
#                 # Process "T" lines (claims/premises)
#                 if line.startswith("T"):
#                     t_id = parts[0]  # Extract T ID (e.g., T1)
#                     t_type = parts[1].split(" ")[0]  # Extract type (e.g., MajorClaim, Claim, Premise)
#                     t_s_bound = parts[1].split(" ")[1]
#                     t_e_bound = parts[1].split(" ")[2]
#                     text = parts[2] #if len(parts) > 3 else ""  # Extract text if available
#                     component_data.append(["T", filename, t_id, t_type, t_s_bound, t_e_bound, text])
                    
#     for filename in tqdm([f for f in os.listdir(dir) if f.endswith('.ann')], desc="Parsing annotation files for relations ... "):
    
#         #print("processing: " + filename)
#         #if filename.endswith(".ann"):  # Process only text files
#         file_path = os.path.join(dir, filename)
        
#         with open(file_path, "r", encoding="utf-8") as file:
#             has_relation = False
#             for line in file:
#                 parts = line.strip().split("\t")
#                 #print(parts)
#                 # Process "T" lines (claims/premises)
#                 if line.startswith("R"):
#                     has_relation = True
#                     parts = line.strip().split()
#                     relation_type = parts[1]  # "supports" or "attacks"
#                     arg1 = parts[2].split(":")[1]  # Extract T value from Arg1
#                     arg2 = parts[3].split(":")[1]  # Extract T value from Arg2
#                     relations_data.append(["R", filename, arg1, arg2, relation_type])
                    
#             if not has_relation:
#                 relations_data.append(["R", filename, None, None, None])
                    
#     return component_data, relations_data

In [4]:
def process_ann_files(dir):
    """Parse every ``.ann`` file of a directory into component and relation rows.

    Args:
        dir: Directory to scan. Only entries whose name ends with ``.ann``
            are read; they are processed in sorted name order.

    Returns:
        A ``(component_data, relations_data)`` tuple of row lists:

        * component rows are
          ``["T", filename, id, type, start_bound, end_bound, text]``; the
          bounds are kept as the strings found in the file;
        * relation rows are ``["R", filename, arg1, arg2, relation_type]``.
          A file without any ``R`` line contributes one placeholder row
          ``["R", filename, None, None, None]`` so that the file still
          appears in the relation data.

    Raises:
        IndexError: if a ``T`` or ``R`` line has fewer fields than the
            layouts described in the body comments.
    """
    component_data = []
    relations_data = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order reproducible across machines and runs.
    ann_filenames = sorted(f for f in os.listdir(dir) if f.endswith('.ann'))

    for filename in tqdm(ann_filenames, desc="Parsing annotation files ..."):
        file_path = os.path.join(dir, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            has_relation = False
            for line in file:
                if line.startswith("T"):
                    # Component line, read as "<id>\t<type> <start> <end>\t<text>".
                    parts = line.strip().split("\t")
                    t_id = parts[0]
                    span_fields = parts[1].split(" ")
                    t_type = span_fields[0]
                    t_s_bound = span_fields[1]
                    t_e_bound = span_fields[2]
                    text = parts[2]
                    component_data.append(["T", filename, t_id, t_type, t_s_bound, t_e_bound, text])

                elif line.startswith("R"):
                    # Relation line, read as whitespace-separated
                    # "<id> <type> <role>:<source id> <role>:<target id>".
                    has_relation = True
                    parts = line.strip().split()
                    relation_type = parts[1]
                    arg1 = parts[2].split(":")[1]
                    arg2 = parts[3].split(":")[1]
                    relations_data.append(["R", filename, arg1, arg2, relation_type])

            if not has_relation:
                # Keep a placeholder so the file is not lost from the relation data.
                relations_data.append(["R", filename, None, None, None])

    return component_data, relations_data

In [5]:
# Parse the training annotation files into component rows and relation rows.
train_component_data, train_relations_data = process_ann_files(train_files_directory)

Parsing annotation files ...: 100%|██████████| 350/350 [00:00<00:00, 499.20it/s]


In [6]:
# Parse the dev annotation files into component rows and relation rows.
dev_component_data, dev_relations_data = process_ann_files(dev_files_directory)

Parsing annotation files ...: 100%|██████████| 50/50 [00:00<00:00, 549.27it/s]


In [7]:
# Parse the neoplasm test annotation files into component rows and relation rows.
neo_test_component_data, neo_test_relations_data = process_ann_files(neo_test_files_directory)

Parsing annotation files ...: 100%|██████████| 100/100 [00:00<00:00, 556.21it/s]


In [8]:
# Parse the mixed test annotation files into component rows and relation rows.
mix_test_component_data, mix_test_relations_data = process_ann_files(mix_test_files_directory)

Parsing annotation files ...:   3%|▎         | 3/100 [00:00<00:03, 29.63it/s]

Parsing annotation files ...: 100%|██████████| 100/100 [00:00<00:00, 377.88it/s]


In [9]:
# Parse the glaucoma test annotation files into component rows and relation rows.
gla_test_component_data, gla_test_relations_data = process_ann_files(gla_test_files_directory)

Parsing annotation files ...:   0%|          | 0/100 [00:00<?, ?it/s]

Parsing annotation files ...: 100%|██████████| 100/100 [00:00<00:00, 600.60it/s]


### Build DataFrames

In [10]:
def build_dataframes(components, relations):
    """Combine component rows and relation rows into one DataFrame with one row per file.

    Args:
        components: Rows of ``[type_indicator, filename, ac_id, ac_type,
            ac_start_bound, ac_end_bound, ac]``.
        relations: Rows of ``[type_indicator, filename, source, target,
            relation_type]``.

    Returns:
        A DataFrame with one row per ``filename`` that occurs in both inputs.
        Every other column holds the list of that file's values in input
        order. Both inputs have a ``type_indicator`` column, so the merge
        exposes them as ``type_indicator_x`` (components) and
        ``type_indicator_y`` (relations).
    """
    component_columns = ["type_indicator", "filename", "ac_id", "ac_type", "ac_start_bound", "ac_end_bound", "ac"]
    relation_columns = ["type_indicator", "filename", "source", "target", "relation_type"]

    # Naming the columns in the constructor (instead of assigning ``.columns``
    # afterwards) also works when a row list is empty.
    comp_df = pd.DataFrame(components, columns=component_columns)
    rel_df = pd.DataFrame(relations, columns=relation_columns)

    # Collapse to one row per file; sort=False keeps files in first-seen order.
    comp_by_file = (
        comp_df.groupby(["filename"], sort=False)
        .agg({column: list for column in component_columns if column != "filename"})
        .reset_index()
    )
    rel_by_file = (
        rel_df.groupby(["filename"], sort=False)
        .agg({column: list for column in relation_columns if column != "filename"})
        .reset_index()
    )

    # Default merge is an inner join: a file missing from either side is dropped.
    return comp_by_file.merge(rel_by_file, on="filename")
    

In [11]:
# One row per training file, with per-file lists of components and relations.
train_df = build_dataframes(train_component_data, train_relations_data)

In [12]:
# One row per dev file, with per-file lists of components and relations.
dev_df = build_dataframes(dev_component_data, dev_relations_data)

In [13]:
# Build the per-file frames of the three test splits (glaucoma, mixed, neoplasm).
gla_test_df, mix_test_df, neo_test_df = (
    build_dataframes(split_components, split_relations)
    for split_components, split_relations in (
        (gla_test_component_data, gla_test_relations_data),
        (mix_test_component_data, mix_test_relations_data),
        (neo_test_component_data, neo_test_relations_data),
    )
)

In [14]:
# (rows, columns) of each split's per-file frame.
train_df.shape, dev_df.shape, gla_test_df.shape, mix_test_df.shape, neo_test_df.shape

((350, 11), (50, 11), (100, 11), (100, 11), (100, 11))

### Build Text Files

In [15]:
def process_text_files(dir):
    """Read every ``.txt`` file of a directory and pair it with its ``.ann`` file name.

    Args:
        dir: Directory to scan. Only entries whose name ends with ``.txt``
            are read; they are processed in sorted name order.

    Returns:
        A list of ``[text_filename, ann_filename, full_text]`` rows, where
        ``ann_filename`` is ``text_filename`` with its trailing ``.txt``
        replaced by ``.ann``.
    """
    text_data = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order reproducible across machines and runs.
    txt_filenames = sorted(f for f in os.listdir(dir) if f.endswith('.txt'))

    for filename in tqdm(txt_filenames, desc="Parsing text files ..."):
        file_path = os.path.join(dir, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            full_text = file.read()

        # Swap only the trailing extension: str.replace would also rewrite a
        # ".txt" that occurs earlier in the name.
        ann_filename = filename[:-len(".txt")] + ".ann"
        text_data.append([filename, ann_filename, full_text])

    return text_data

In [16]:
# Load the raw text of every training abstract.
train_texts = process_text_files(train_files_directory)

Parsing text files ...: 100%|██████████| 350/350 [00:00<00:00, 519.94it/s]


In [17]:
# Load the raw text of every dev abstract.
dev_texts = process_text_files(dev_files_directory)

Parsing text files ...:   0%|          | 0/50 [00:00<?, ?it/s]

Parsing text files ...: 100%|██████████| 50/50 [00:00<00:00, 602.62it/s]


In [18]:
# Load the raw abstracts of the three test splits, in the order glaucoma, mixed, neoplasm.
gla_test_texts, mix_test_texts, neo_test_texts = (
    process_text_files(split_directory)
    for split_directory in (
        gla_test_files_directory,
        mix_test_files_directory,
        neo_test_files_directory,
    )
)

Parsing text files ...:   0%|          | 0/100 [00:00<?, ?it/s]

Parsing text files ...: 100%|██████████| 100/100 [00:00<00:00, 567.42it/s]
Parsing text files ...: 100%|██████████| 100/100 [00:00<00:00, 573.32it/s]
Parsing text files ...: 100%|██████████| 100/100 [00:00<00:00, 681.12it/s]


In [19]:
# Number of text files read per split.
len(train_texts), len(dev_texts), len(gla_test_texts), len(mix_test_texts), len(neo_test_texts)

(350, 50, 100, 100, 100)

### Merge texts with the annotation DataFrames

In [20]:
# Wrap each split's text rows in a DataFrame (columns are named in the next cell).
train_texts_df, dev_texts_df, gla_test_texts_df, mix_test_texts_df, neo_test_texts_df = map(
    pd.DataFrame,
    (train_texts, dev_texts, gla_test_texts, mix_test_texts, neo_test_texts),
)

In [21]:
# Give every text frame the same column names; "filename" is the .ann name
# used as the merge key against the annotation frames.
for _texts_df in (
    train_texts_df,
    dev_texts_df,
    gla_test_texts_df,
    mix_test_texts_df,
    neo_test_texts_df,
):
    _texts_df.columns = ["text_file", "filename", "abstract_text"]

In [22]:
# Attach the abstract text to each training row; merge() defaults to an inner
# join, so a file missing from either frame is dropped.
train_df_abs = train_df.merge(train_texts_df, on="filename")

In [23]:
# Attach the abstract text to each dev row; merge() defaults to an inner
# join, so a file missing from either frame is dropped.
dev_df_abs = dev_df.merge(dev_texts_df, on="filename")

In [24]:
# Attach the abstract text to each row of the three test splits.
gla_test_df_abs, mix_test_df_abs, neo_test_df_abs = (
    annotation_df.merge(texts_df, on="filename")
    for annotation_df, texts_df in (
        (gla_test_df, gla_test_texts_df),
        (mix_test_df, mix_test_texts_df),
        (neo_test_df, neo_test_texts_df),
    )
)

In [25]:
# (rows, columns) of each split after attaching the abstract text.
train_df_abs.shape, dev_df_abs.shape, gla_test_df_abs.shape, mix_test_df_abs.shape, neo_test_df_abs.shape

((350, 13), (50, 13), (100, 13), (100, 13), (100, 13))

### Adjust Relations

In [26]:
# Shapes before the relation columns are adjusted (same expression as the previous cell).
train_df_abs.shape, dev_df_abs.shape, gla_test_df_abs.shape, mix_test_df_abs.shape, neo_test_df_abs.shape

((350, 13), (50, 13), (100, 13), (100, 13), (100, 13))

In [27]:
#### Partial Attack -> Attack

In [28]:
def process_rel(row):
    """Return the row's relation labels with ``"Partial-Attack"`` mapped to ``"Attack"``.

    Every other entry of ``row.relation_type`` is passed through unchanged.
    """
    normalised_labels = []
    for label in row.relation_type:
        if label == "Partial-Attack":
            normalised_labels.append("Attack")
        else:
            normalised_labels.append(label)
    return normalised_labels

In [29]:
# Fold "Partial-Attack" into "Attack" in every split.
for _split_df in (
    train_df_abs,
    dev_df_abs,
    gla_test_df_abs,
    mix_test_df_abs,
    neo_test_df_abs,
):
    _split_df['relation_type'] = _split_df.apply(process_rel, axis=1)

In [30]:
# Inspect the training frame.
train_df_abs


Unnamed: 0,filename,type_indicator_x,ac_id,ac_type,ac_start_bound,ac_end_bound,ac,type_indicator_y,source,target,relation_type,text_file,abstract_text
0,10561201.ann,"[T, T, T, T, T, T, T]","[T1, T2, T3, T4, T6, T7, T8]","[MajorClaim, Premise, Premise, Premise, Premis...","[1, 992, 1063, 1188, 1452, 1569, 1769]","[162, 1058, 1187, 1447, 1568, 1768, 1945]",[A combination of mitoxantrone plus prednisone...,"[R, R, R, R, R, R]","[T8, T2, T3, T6, T4, T7]","[T1, T8, T8, T8, T8, T1]","[Support, Support, Support, Support, Support, ...",10561201.txt,A combination of mitoxantrone plus prednisone...
1,10561203.ann,"[T, T, T, T, T, T, T]","[T1, T3, T4, T5, T6, T7, T8]","[Claim, Premise, Premise, Premise, Premise, Pr...","[1, 1354, 1443, 1593, 1828, 1979, 2056]","[318, 1442, 1592, 1827, 1978, 2055, 2167]",[In endocrine therapy trials in advanced breas...,"[R, R, R, R, R]","[T7, T3, T4, T5, T6]","[T8, T8, T8, T8, T8]","[Attack, Support, Support, Support, Attack]",10561203.txt,In endocrine therapy trials in advanced breas...
2,10653877.ann,"[T, T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8, T9]","[Claim, Premise, Premise, Premise, Premise, Pr...","[1, 957, 1204, 1297, 1465, 1612, 1865, 1938, 2...","[156, 1203, 1296, 1464, 1611, 1864, 1928, 2001...",[Treatment with cisplatin-based chemotherapy p...,"[R, R, R, R, R]","[T8, T6, T2, T4, T5]","[T9, T9, T9, T9, T9]","[Support, Support, Support, Support, Support]",10653877.txt,Treatment with cisplatin-based chemotherapy p...
3,10675381.ann,"[T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8]","[MajorClaim, Premise, Premise, Premise, Premis...","[218, 1028, 1209, 1395, 1580, 1647, 1844, 2193]","[449, 1208, 1394, 1579, 1646, 1843, 2152, 2288]",[In nonrandomized studies involving patients w...,"[R, R, R, R, R]","[T2, T4, T5, T6, T7]","[T8, T8, T8, T8, T8]","[Support, Support, Support, Support, Support]",10675381.txt,Extracellular adenosine 5'-triphosphate (ATP)...
4,10735887.ann,"[T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7]","[Premise, Premise, Premise, Premise, Claim, Pr...","[552, 678, 758, 1102, 1196, 1228, 1306]","[677, 757, 1101, 1195, 1227, 1305, 1555]",[Overall objective response (OR) rates were hi...,"[R, R, R, R]","[T1, T3, T4, T5]","[T7, T7, T7, T7]","[Support, Support, Support, Support]",10735887.txt,"This phase III, double-blind, randomized, mul..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,9807987.ann,"[T, T, T, T, T]","[T1, T2, T3, T4, T5]","[Premise, Premise, Premise, Premise, Claim]","[812, 1026, 1140, 1342, 1469]","[1025, 1139, 1341, 1468, 1760]","[With a median follow-up of 13 months, the ove...","[R, R, R]","[T4, T3, T1]","[T5, T5, T5]","[Support, Support, Support]",9807987.txt,"In phase II studies, irinotecan is active in ..."
346,9849452.ann,"[T, T, T, T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6, T7, T8, T9]","[Premise, Premise, Premise, Premise, Premise, ...","[983, 1124, 1187, 1250, 1327, 1349, 1503, 1624...","[1123, 1186, 1249, 1326, 1348, 1502, 1623, 166...",[Patients in the chemotherapy group reported b...,"[R, R, R, R, R]","[T5, T1, T3, T6, T7]","[T4, T9, T2, T9, T9]","[Attack, Support, Attack, Support, Support]",9849452.txt,The aim of the present trial was to evaluate ...
347,9849454.ann,"[T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6]","[Premise, Premise, Premise, Premise, Claim, Cl...","[1149, 1215, 1381, 1507, 2258, 2523]","[1214, 1380, 1506, 1699, 2522, 2655]",[Complete remission was achieved in 91% (170/1...,"[R, R, R, R]","[T1, T4, T3, T2]","[T6, T6, T5, T5]","[Support, Support, Support, Support]",9849454.txt,The second International Society of Paediatri...
348,9850014.ann,"[T, T, T, T, T, T]","[T1, T2, T3, T4, T5, T6]","[Premise, Premise, Premise, Premise, Premise, ...","[891, 1041, 1139, 1264, 1357, 1578]","[1040, 1138, 1263, 1356, 1577, 1770]",[An objective response (complete [CR] or parti...,"[R, R, R]","[T4, T5, T3]","[T6, T6, T6]","[Support, Support, Support]",9850014.txt,We report results of a randomized prospective...


In [31]:
def get_src_acs(row):
    """Return the component texts of the row's relation sources.

    Each id in ``row.source`` is located in ``row.ac_id`` and the text at the
    same position of ``row.ac`` is returned. Missing ids (the placeholder used
    for files without relations) are skipped.

    Raises:
        ValueError: if a lookup fails, e.g. a source id that is not listed in
            ``row.ac_id``. The message carries ``row.name`` and the original
            error is chained.
    """
    try:
        return [
            row.ac[row.ac_id.index(source_id)]
            for source_id in row.source
            # notna() skips None and also NaN, in case pandas coerced the placeholder.
            if pd.notna(source_id)
        ]

    except Exception as e:
        raise ValueError(f"Error at index {row.name}: {e}") from e


In [32]:
def get_trg_acs(row):
    """Return the component texts of the row's relation targets.

    Each id in ``row.target`` is located in ``row.ac_id`` and the text at the
    same position of ``row.ac`` is returned. Missing ids (the placeholder used
    for files without relations) are skipped.

    Raises:
        ValueError: if a lookup fails, e.g. a target id that is not listed in
            ``row.ac_id``. The message carries ``row.name`` and the original
            error is chained.
    """
    try:
        return [
            row.ac[row.ac_id.index(target_id)]
            for target_id in row.target
            # notna() skips None and also NaN, in case pandas coerced the placeholder.
            if pd.notna(target_id)
        ]

    except Exception as e:
        raise ValueError(f"Error at index {row.name}: {e}") from e


In [33]:
def get_src_ids(row):
    """Return the 1-based position among the row's components of each relation source.

    Positions are found by looking up each id of ``row.source`` in
    ``row.ac_id``. Looking them up by component text instead (``row.ac``)
    resolves to the first occurrence whenever two components share the same
    text, which points relations at the wrong component. Missing ids (the
    placeholder used for files without relations) are skipped.

    Raises:
        ValueError: if a lookup fails, e.g. a source id that is not listed in
            ``row.ac_id``. The message carries ``row.name`` and the original
            error is chained.
    """
    try:
        return [
            row.ac_id.index(source_id) + 1
            for source_id in row.source
            # notna() skips None and also NaN, in case pandas coerced the placeholder.
            if pd.notna(source_id)
        ]

    except Exception as e:
        raise ValueError(f"Error at index {row.name}: {e}") from e


In [34]:
def get_trg_ids(row):
    """Return the 1-based position among the row's components of each relation target.

    Positions are found by looking up each id of ``row.target`` in
    ``row.ac_id``. Looking them up by component text instead (``row.ac``)
    resolves to the first occurrence whenever two components share the same
    text, which points relations at the wrong component. Missing ids (the
    placeholder used for files without relations) are skipped.

    Raises:
        ValueError: if a lookup fails, e.g. a target id that is not listed in
            ``row.ac_id``. The message carries ``row.name`` and the original
            error is chained.
    """
    try:
        return [
            row.ac_id.index(target_id) + 1
            for target_id in row.target
            # notna() skips None and also NaN, in case pandas coerced the placeholder.
            if pd.notna(target_id)
        ]

    except Exception as e:
        raise ValueError(f"Error at index {row.name}: {e}") from e


In [35]:
def build_relations(row):
    """Pair each entry of ``row.source_ids`` with the entry of ``row.target_ids`` at the same position.

    Returns a list of ``(source, target)`` tuples, truncated to the shorter
    of the two lists.
    """
    return list(zip(row.source_ids, row.target_ids))

In [36]:
# Add the relation columns to the training frame. Order matters: later columns
# are computed from earlier ones (e.g. `relations` zips `source_ids` and `target_ids`).
for _column, _builder in (
    ('source_acs', get_src_acs),
    ('target_acs', get_trg_acs),
    ('source_ids', get_src_ids),
    ('target_ids', get_trg_ids),
    ('relations', build_relations),
):
    train_df_abs[_column] = train_df_abs.apply(_builder, axis=1)

In [37]:
# Add the relation columns to the dev frame. Order matters: later columns
# are computed from earlier ones (e.g. `relations` zips `source_ids` and `target_ids`).
for _column, _builder in (
    ('source_acs', get_src_acs),
    ('target_acs', get_trg_acs),
    ('source_ids', get_src_ids),
    ('target_ids', get_trg_ids),
    ('relations', build_relations),
):
    dev_df_abs[_column] = dev_df_abs.apply(_builder, axis=1)

In [38]:
# Add the relation columns to the glaucoma test frame. Order matters: later columns
# are computed from earlier ones (e.g. `relations` zips `source_ids` and `target_ids`).
for _column, _builder in (
    ('source_acs', get_src_acs),
    ('target_acs', get_trg_acs),
    ('source_ids', get_src_ids),
    ('target_ids', get_trg_ids),
    ('relations', build_relations),
):
    gla_test_df_abs[_column] = gla_test_df_abs.apply(_builder, axis=1)

In [39]:
# Add the relation columns to the mixed test frame. Order matters: later columns
# are computed from earlier ones (e.g. `relations` zips `source_ids` and `target_ids`).
for _column, _builder in (
    ('source_acs', get_src_acs),
    ('target_acs', get_trg_acs),
    ('source_ids', get_src_ids),
    ('target_ids', get_trg_ids),
    ('relations', build_relations),
):
    mix_test_df_abs[_column] = mix_test_df_abs.apply(_builder, axis=1)

In [40]:
# Add the relation columns to the neoplasm test frame. Order matters: later columns
# are computed from earlier ones (e.g. `relations` zips `source_ids` and `target_ids`).
for _column, _builder in (
    ('source_acs', get_src_acs),
    ('target_acs', get_trg_acs),
    ('source_ids', get_src_ids),
    ('target_ids', get_trg_ids),
    ('relations', build_relations),
):
    neo_test_df_abs[_column] = neo_test_df_abs.apply(_builder, axis=1)

In [41]:
### Sanity Checks

In [66]:
def check_egalite(row):
    """Return 1 when the row's parallel lists have matching lengths, else 0.

    Two pairs are compared: ``relation_type`` against ``relations``, and
    ``ac`` against ``ac_type``.
    """
    if len(row.relation_type) != len(row.relations):
        return 0
    if len(row.ac) != len(row.ac_type):
        return 0
    return 1

In [67]:
# Flag each neoplasm test row: 1 if its parallel lists agree in length, 0 otherwise.
neo_test_df_abs['egalite'] = neo_test_df_abs.apply(check_egalite, axis=1)

In [68]:
# Count how many rows passed (1) or failed (0) the length check.
neo_test_df_abs['egalite'].value_counts()

egalite
1    100
Name: count, dtype: int64

In [45]:
### OK because the 7 flagged abstracts in train have no relations (likewise 2 in gla and 2 in mix).

### Save DFs

In [72]:
# Directory that receives the per-split CSV files written below.
DATFILES_DIR = Path("/Utilisateurs/umushtaq/am_reasoning/abstRCT/data_files")

In [73]:
# to_csv() does not create missing directories, so make sure the target exists.
DATFILES_DIR.mkdir(parents=True, exist_ok=True)

# Write one CSV per split. The DataFrame index is written as well (to_csv default).
# NOTE(review): the sanity-check cell above adds an `egalite` column to
# neo_test_df_abs only, so neo_test.csv may carry one more column than the
# other files — confirm whether that is intended.
for _csv_name, _split_df in (
    ("neo_train.csv", train_df_abs),
    ("neo_dev.csv", dev_df_abs),
    ("gla_test.csv", gla_test_df_abs),
    ("mix_test.csv", mix_test_df_abs),
    ("neo_test.csv", neo_test_df_abs),
):
    _split_df.to_csv(DATFILES_DIR / _csv_name)