In [53]:
import os
import json
import pandas as pd

## Part I: Analyse Automaxo results

In [54]:
def extract_data(base_path: str):
    """
    Extract raw triplet rows and disease-level info from TSV/JSON files.
    Returns:
        full_df (pd.DataFrame): one row per triplet
        disease_info (dict): per-disease counts and PubMed ID sets
    """
    rows = []
    disease_info = {}

    for disease in os.listdir(base_path):
        folder = os.path.join(base_path, disease)
        if not os.path.isdir(folder):
            continue

        # Count initial and filtered PubMed IDs
        initial = filtered = 0
        for fn in os.listdir(folder):
            path = os.path.join(folder, fn)
            if fn.endswith('_no_replaced.tsv'):
                initial = sum(1 for _ in open(path)) - 1
            if fn.endswith('_filtered.tsv'):
                filtered = sum(1 for _ in open(path)) - 1

        # Load JSON triplets
        json_path = os.path.join(folder, 'final_automaxo_results.json')
        if not os.path.exists(json_path):
            continue
        data = json.load(open(json_path))

        info = disease_info.setdefault(disease, {
            'initial_pubmed_count': initial,
            'filtered_pubmed_count': filtered,
            'triplet_count': 0,
            'unique_pubmed_ids': set()
        })

        for entry in data.get('triplets', []):
            t = entry['triplet']
            sources = entry.get('source', {})

            info['unique_pubmed_ids'].update(sources.keys())
            info['triplet_count'] += 1

            rows.append({
                'disease': disease,
                'maxo': t.get('maxo'),
                'hpo': t.get('hpo'),
                'mondo': t.get('mondo'),
                'non_grounded_maxo': t.get('non_grounded_maxo'),
                'non_grounded_hpo': t.get('non_grounded_hpo'),
                'non_grounded_mondo': t.get('non_grounded_mondo'),
                'potential_maxo': t.get('potential_maxo', []),
                'potential_hpo': t.get('potential_hpo', []),
                'potential_mondo': t.get('potential_mondo', []),
                'no_of_titles': sum(1 for d in sources.values() if 'title' in d),
                'no_of_abstracts': sum(1 for d in sources.values() if 'abstract' in d)
            })

    full_df = pd.DataFrame(rows)
    return full_df, disease_info

def clean_data(full_df: pd.DataFrame):
    """
    Replace literal "None" strings with actual NA values.
    """
    cols_to_clean = [
        'maxo', 'hpo', 'mondo',
        'non_grounded_maxo', 'non_grounded_hpo', 'non_grounded_mondo'
    ]
    for col in cols_to_clean:
        full_df[col] = full_df[col].replace("None", pd.NA)
    return full_df

def summarize_data(full_df: pd.DataFrame, disease_info: dict):
    """
    Summarize metrics per disease based on the cleaned full_df and disease_info.
    Returns:
        summary_df (pd.DataFrame): one row per disease with all metrics
    """
    def count_potential(sub, col):
        ids = set()
        for lst in sub[col].dropna():
            for item in lst:
                if isinstance(item, dict) and 'id' in item:
                    ids.add(item['id'])
                elif isinstance(item, str):
                    ids.add(item)
        return len(ids)

    summary_rows = []
    for disease, info in disease_info.items():
        sub = full_df[full_df['disease'] == disease]
        total_titles = sub['no_of_titles'].sum()
        total_abstracts = sub['no_of_abstracts'].sum()

        unique_maxo = sub['maxo'].nunique(dropna=True)
        total_maxo = sub['maxo'].count()
        unique_hpo = sub['hpo'].nunique(dropna=True)
        total_hpo = sub['hpo'].count()
        unique_mondo = sub['mondo'].nunique(dropna=True)
        total_mondo = sub['mondo'].count()

        ng_maxo = sub['non_grounded_maxo'].notna().sum()
        ng_hpo = sub['non_grounded_hpo'].notna().sum()
        ng_mondo = sub['non_grounded_mondo'].notna().sum()
        total_non_grounded = ng_maxo + ng_hpo + ng_mondo

        pot_maxo = count_potential(sub, 'potential_maxo')
        pot_hpo = count_potential(sub, 'potential_hpo')
        pot_mondo = count_potential(sub, 'potential_mondo')
        total_potential = pot_maxo + pot_hpo + pot_mondo

        summary_rows.append({
            'Disease': disease,
            'initial_pubmed_count': info['initial_pubmed_count'],
            'filtered_pubmed_count': info['filtered_pubmed_count'],
            'unique_pubmed_ids': len(info['unique_pubmed_ids']),
            'triplet_count': info['triplet_count'],
            'total_titles': total_titles,
            'total_abstracts': total_abstracts,
            'unique_maxo': unique_maxo,
            'total_maxo': total_maxo,
            'unique_hpo': unique_hpo,
            'total_hpo': total_hpo,
            'unique_mondo': unique_mondo,
            'total_mondo': total_mondo,
            'total_non_grounded': total_non_grounded,
            'non_grounded_maxo': ng_maxo,
            'non_grounded_hpo': ng_hpo,
            'non_grounded_mondo': ng_mondo,
            'total_potential': total_potential,
            'potential_maxo': pot_maxo,
            'potential_hpo': pot_hpo,
            'potential_mondo': pot_mondo
        })

    return pd.DataFrame(summary_rows)



In [55]:
base_path = '../data'
full_disease_analysis_df, disease_info = extract_data(base_path)
full_disease_analysis_df = clean_data(full_disease_analysis_df)
disease_summary_automaxo = summarize_data(full_disease_analysis_df, disease_info)


In [56]:
disease_summary_automaxo

Unnamed: 0,Disease,initial_pubmed_count,filtered_pubmed_count,unique_pubmed_ids,triplet_count,total_titles,total_abstracts,unique_maxo,total_maxo,unique_hpo,...,unique_mondo,total_mondo,total_non_grounded,non_grounded_maxo,non_grounded_hpo,non_grounded_mondo,total_potential,potential_maxo,potential_hpo,potential_mondo
0,dravet_syndrome,99,78,77,342,344,344,12,41,26,...,8,239,605,301,201,103,9,1,4,4
1,familial_mediterranean_fever,100,84,80,377,377,377,13,33,42,...,22,323,685,344,287,54,12,1,4,7
2,loeys_dietz_syndrome,35,34,33,139,144,144,12,18,19,...,2,85,271,121,96,54,4,1,1,2
3,donnai-barrow_syndrome,3,3,3,20,20,20,8,8,5,...,1,13,27,12,8,7,2,0,1,1
4,primary_ciliary_dyskinesia,98,76,72,325,325,325,18,31,29,...,12,244,634,294,259,81,20,5,12,3
5,camurati_engelmann,14,12,11,34,34,34,3,6,9,...,2,28,54,28,20,6,4,1,2,1
6,sickle_cell,2000,990,958,3829,3917,3917,65,452,144,...,55,2466,7899,3377,3159,1363,102,25,51,26
7,gaucher_disease,99,88,87,365,368,368,14,42,45,...,10,244,710,323,266,121,11,4,2,5
8,celiac_disease,2000,1240,1165,4718,4878,4878,67,313,219,...,90,2846,9917,4405,3640,1872,158,20,96,42
9,canavan_disease,39,20,20,76,76,76,4,13,5,...,8,45,162,63,68,31,4,1,0,3


In [57]:
full_disease_analysis_df

Unnamed: 0,disease,maxo,hpo,mondo,non_grounded_maxo,non_grounded_hpo,non_grounded_mondo,potential_maxo,potential_hpo,potential_mondo,no_of_titles,no_of_abstracts
0,dravet_syndrome,,,,add-on therapy,seizure frequency,drug-resistant focal epilepsy,[],[],[],2,2
1,dravet_syndrome,,hp:0001250,mondo:0100135,use of cannabidiol,,,[],[],[],2,2
2,dravet_syndrome,,,,access to syndrome-specific treatment options,mismanagement,dravet syndrome,[],[],"[{'id': 'MONDO:0100135', 'label': 'Dravet synd...",1,1
3,dravet_syndrome,,,,add-on stiripentol treatment,seizure frequency,refractory epilepsies,[],[],[],1,1
4,dravet_syndrome,,,,add-on therapy,gastrointestinal adverse effects,drug-resistant focal epilepsy,[],[],[],1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
15935,stickler_syndrome,maxo:0000571,,mondo:0019354,,open-bite deformity,,[],[],[],1,1
15936,stickler_syndrome,maxo:0000571,hp:0000689,mondo:0009869,,,,[],[],[],1,1
15937,stickler_syndrome,maxo:0001085,hp:0012230,mondo:0019354,,,,[],[],[],1,1
15938,stickler_syndrome,maxo:0009072,hp:0000541,,,,chronic pediatric retinal detachment,[],[],[],1,1


## Part II: Analyse Automaxoviewer results aka final expert curation 

In [59]:
import os
import glob
import pandas as pd

# Folder containing the TSV files
folder_path = 'maxo_final_curation'

# Expected core columns
expected_cols = [
    'disease_id', 'disease_name', 'source_id', 'maxo_id', 'maxo_name',
    'hpo_id', 'relation', 'evidence', 'extension_id', 'extension_name',
    'comment', 'other', 'author', 'last_updated', 'created'
]

dfs = []
for file_path in glob.glob(os.path.join(folder_path, '*.tsv')):
    # Read TSV
    df = pd.read_csv(file_path, sep='\t', dtype=str)
    # Strip whitespace from headers
    df.columns = df.columns.str.strip()
    
    # If any combined headers (e.g., 'author last_updated'), split them
    if 'author last_updated' in df.columns:
        df[['author', 'last_updated']] = df['author last_updated'].str.split(n=1, expand=True)
        df = df.drop(columns=['author last_updated'])
    
    # Ensure all expected columns exist
    for col in expected_cols:
        if col not in df.columns:
            df[col] = pd.NA
    
    # Keep only expected columns + file_name
    df = df[expected_cols]
    df['file_name'] = os.path.basename(file_path)
    
    dfs.append(df)

# Concatenate and drop any duplicate columns
maxo_final_curation_df = pd.concat(dfs, ignore_index=True)
maxo_final_curation_df = maxo_final_curation_df.loc[:, ~maxo_final_curation_df.columns.duplicated()]


In [60]:
maxo_final_curation_df

Unnamed: 0,disease_id,disease_name,source_id,maxo_id,maxo_name,hpo_id,relation,evidence,extension_id,extension_name,comment,other,author,last_updated,created,file_name
0,MONDO:0010726,Rett syndrome,PMID:29587149,MAXO:0000944,cranial electrical stimulation,HP:0000118,treats,PCS,,,,,0000-0002-0736-9199,,2025-05-10,maxo_Rett.tsv
1,MONDO:0010726,Rett syndrome,PMID:29941161,MAXO:0001346,gastrostomy,HP:0004325,unknown,PCS,,,,,0000-0002-0736-9199,,2025-05-10,maxo_Rett.tsv
2,MONDO:0010726,Rett syndrome,PMID:31469772,MAXO:0000667,anesthesiology specialist evaluation,HP:0000118,treats,PCS,,,,,0000-0002-0736-9199,,2025-05-10,maxo_Rett.tsv
3,MONDO:0010726,Rett syndrome,PMID:31568388,MAXO:0000465,aquatic exercise therapy,HP:0000118,treats,PCS,,,,,0000-0002-0736-9199,,2025-05-10,maxo_Rett.tsv
4,MONDO:0010726,Rett syndrome,PMID:34282992,MAXO:0000011,physical therapy,HP:0000118,treats,PCS,,,,,0000-0002-0736-9199,,2025-05-10,maxo_Rett.tsv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,MONDO:0018954,Loeys-Dietz syndrome,PMID:29889773,MAXO:0000479,orthopedic surgery,HP:0003302,treats,PCS,,,,,0000-0002-0736-9199,,2024-08-02,maxo_LDS.tsv
954,MONDO:0018954,Loeys-Dietz syndrome,PMID:30513314,MAXO:0010032,cardiac transplantation,HP:0001635,treats,PCS,,,,,0000-0002-0736-9199,,2024-08-02,maxo_LDS.tsv
955,MONDO:0018954,Loeys-Dietz syndrome,PMID:31842932,MAXO:0035065,dentist evaluation,HP:0000118,treats,PCS,,,,,0000-0002-0736-9199,,2024-08-02,maxo_LDS.tsv
956,MONDO:0018954,Loeys-Dietz syndrome,PMID:32164578,MAXO:0001346,gastrostomy,HP:0001508,treats,PCS,,,,,0000-0002-0736-9199,,2024-08-02,maxo_LDS.tsv


In [61]:
maxo_final_curation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   disease_id      958 non-null    object
 1   disease_name    958 non-null    object
 2   source_id       958 non-null    object
 3   maxo_id         958 non-null    object
 4   maxo_name       958 non-null    object
 5   hpo_id          958 non-null    object
 6   relation        958 non-null    object
 7   evidence        958 non-null    object
 8   extension_id    57 non-null     object
 9   extension_name  57 non-null     object
 10  comment         0 non-null      object
 11  other           0 non-null      object
 12  author          958 non-null    object
 13  last_updated    0 non-null      object
 14  created         958 non-null    object
 15  file_name       958 non-null    object
dtypes: object(16)
memory usage: 119.9+ KB


In [62]:
import pandas as pd

# Build summary DataFrame with total_annotations per disease
summary_curation = (
    maxo_final_curation_df
      .groupby(['disease_id', 'disease_name', 'file_name'], dropna=False)
      .agg(
          maxo_id_count_curated       = ('maxo_id',      'nunique'),
          hpo_id_count_curated       = ('hpo_id',       'nunique'),
          relation_count_curated      = ('relation',     'nunique'),
          total_annotations_curated   = ('disease_id',   'size'),
      )
      .reset_index()
)



In [63]:
summary_curation

Unnamed: 0,disease_id,disease_name,file_name,maxo_id_count_curated,hpo_id_count_curated,relation_count_curated,total_annotations_curated
0,MONDO:0001516,spinal muscular atrophy,maxo_SMA.tsv,6,4,2,39
1,MONDO:0001586,mucopolysaccharidosis type 1,maxo_MPS1.tsv,7,13,4,48
2,MONDO:0001734,tuberous sclerosis,maxo_TS.tsv,15,12,3,35
3,MONDO:0005130,celiac disease,maxo_Celiac.tsv,1,8,2,67
4,MONDO:0007037,Achondroplasia,maxo_hypoch.tsv,1,1,2,2
5,MONDO:0007041,Apert syndrome,maxo_Apert.tsv,11,15,3,24
6,MONDO:0007542,Camurati-Engelmann disease,maxo_CamuratiEngelman.tsv,7,5,3,10
7,MONDO:0007739,Huntington disease,maxo_Huntington.tsv,22,16,3,46
8,MONDO:0007793,hypochondroplasia,maxo_hypoch.tsv,3,3,2,4
9,MONDO:0007919,lymphatic malformation 1,maxo_Milroy.tsv,1,1,1,1


In [64]:
import pandas as pd
# Copy original DataFrames
df1 = summary_curation.copy()
df2 = disease_summary_automaxo.copy()

# 1. Create a base key from the file_name in df1
df1['base_key'] = (
    df1['file_name']
       .str.lower()
       .str.removeprefix('maxo_')
       .str.removesuffix('.tsv')
       .str.replace(r'[_\-\s]', '', regex=True)
)

# 2. Define mapping from base_key (Format A) to Disease codes (Format B)
mapping = {
    'rett': 'rett_syndrome',
    'wilson': 'wilson_disease',
    'maplesyrup': 'maple_syrup_urine_disease',
    'donnaibarrow': 'donnai-barrow_syndrome',
    'camuratiefengelmann': 'camurati_engelmann',
    'ts': 'tuberous_sclerosis',
    'apert': 'apert_syndrome',
    'prop': 'propionic_acidemia',
    'ataxiatel': 'ataxia_telangiectasia',
    'sma': 'spinal_muscular_atrophy',
    'mps1': 'mucopolysaccharidosis_type_i',
    'mfs': 'marfan_syndrome',
    'citrullinemia': 'citrullinemia',
    'noonan': 'noonan_syndrome',
    'fmf': 'familial_mediterranean_fever',
    'celiac': 'celiac_disease',
    'milroy': 'milroy_disease',
    'milroy2': 'milroy_disease',
    'brugada': 'brugada_syndrome',
    'dravet': 'dravet_syndrome',
    'stickler': 'stickler_syndrome',
    'lns': 'lesch_nyhan_syndrome',
    'canavan': 'canavan_disease',
    'fanconi': 'fanconi_anemia',
    'hypoch': 'hypochondroplasia',
    'huntington': 'huntington_disease',
    'pcd': 'primary_ciliary_dyskinesia',
    'alkapto': 'alkaptonuria',
    'sicklecell': 'sickle_cell',
    'chediak': 'chediak_higashi_syndrome',
    'gaucher': 'gaucher_disease',
    'lds': 'loeys_dietz_syndrome'
}

# 3. Map base_key to the Format B disease names
df1['Disease'] = df1['base_key'].map(mapping)

# 4. Merge the two DataFrames on the standardized Disease column
merged_curation_automaxo_df = pd.merge(
    df1,
    df2,
    on='Disease',
    how='inner',
    suffixes=('_curated', '_automaxo')
)



In [65]:
# Display merged result
merged_curation_automaxo_df


Unnamed: 0,disease_id,disease_name,file_name,maxo_id_count_curated,hpo_id_count_curated,relation_count_curated,total_annotations_curated,base_key,Disease,initial_pubmed_count,...,unique_mondo,total_mondo,total_non_grounded,non_grounded_maxo,non_grounded_hpo,non_grounded_mondo,total_potential,potential_maxo,potential_hpo,potential_mondo
0,MONDO:0001516,spinal muscular atrophy,maxo_SMA.tsv,6,4,2,39,sma,spinal_muscular_atrophy,98,...,11,226,685,313,243,129,16,2,11,3
1,MONDO:0001586,mucopolysaccharidosis type 1,maxo_MPS1.tsv,7,13,4,48,mps1,mucopolysaccharidosis_type_i,100,...,10,229,516,234,222,60,13,4,4,5
2,MONDO:0001734,tuberous sclerosis,maxo_TS.tsv,15,12,3,35,ts,tuberous_sclerosis,98,...,20,230,605,296,210,99,15,3,7,5
3,MONDO:0005130,celiac disease,maxo_Celiac.tsv,1,8,2,67,celiac,celiac_disease,2000,...,90,2846,9917,4405,3640,1872,158,20,96,42
4,MONDO:0007037,Achondroplasia,maxo_hypoch.tsv,1,1,2,2,hypoch,hypochondroplasia,13,...,6,26,80,32,33,15,1,1,0,0
5,MONDO:0007041,Apert syndrome,maxo_Apert.tsv,11,15,3,24,apert,apert_syndrome,50,...,5,130,385,165,154,66,10,3,4,3
6,MONDO:0007739,Huntington disease,maxo_Huntington.tsv,22,16,3,46,huntington,huntington_disease,226,...,18,343,912,434,358,120,22,7,10,5
7,MONDO:0007793,hypochondroplasia,maxo_hypoch.tsv,3,3,2,4,hypoch,hypochondroplasia,13,...,6,26,80,32,33,15,1,1,0,0
8,MONDO:0007919,lymphatic malformation 1,maxo_Milroy.tsv,1,1,1,1,milroy,milroy_disease,2,...,1,15,22,10,12,0,0,0,0,0
9,MONDO:0007919,lymphatic malformation 1,maxo_milroy 2.tsv,2,2,1,2,milroy2,milroy_disease,2,...,1,15,22,10,12,0,0,0,0,0


In [66]:
# Save full_df to an Excel file
full_disease_analysis_df.to_excel('full_disease_analysis_all_diseases.xlsx', index=False)

# Save disease_summary to an Excel file
disease_summary_automaxo.to_excel('disease_summary_automaxo.xlsx', index=False)

# Save maxo_final_curation_df to an Excel file
maxo_final_curation_df.to_excel('full_maxo_final_curation.xlsx', index=False)

summary_curation.to_excel('maxo_final_curation.xlsx', index=False)

merged_curation_automaxo_df.to_excel('merged_curation_automaxo.xlsx', index=False)