In [1]:
# read all the diseases 

In [None]:
import os
import json
import pandas as pd
import numpy as np
import re


In [None]:

def load_and_combine_json(base_path):
    # Initialize storage
    rows = []
    disease_info = {}

    # Traverse each disease folder
    for disease in os.listdir(base_path):
        disease_folder = os.path.join(base_path, disease)
        if not os.path.isdir(disease_folder):
            continue

        # 1) Count initial PubMed IDs from *_no_replaced.tsv
        initial_count = 0
        for fname in os.listdir(disease_folder):
            if fname.endswith('_no_replaced.tsv'):
                tsv_path = os.path.join(disease_folder, fname)
                with open(tsv_path, 'r') as tf:
                    initial_count = sum(1 for _ in tf) - 1
                break

        # 2) Load triplet JSON
        json_path = os.path.join(disease_folder, 'final_automaxo_results.json')
        if not os.path.exists(json_path):
            continue

        with open(json_path, 'r') as f:
            data = json.load(f)

        # Initialize or retrieve counters
        info = disease_info.setdefault(disease, {
            'triplet_count': 0,
            'unique_pubmed_ids': set(),
            'initial_pubmed_count': initial_count
        })

        # Process each triplet
        for entry in data.get('triplets', []):
            triplet = entry['triplet']
            sources = entry.get('source', {})

            pubmed_ids = []
            title_count = 0
            abstract_count = 0

            for pid, details in sources.items():
                pubmed_ids.append(pid)
                info['unique_pubmed_ids'].add(pid)
                if 'title' in details:
                    title_count += 1
                if 'abstract' in details:
                    abstract_count += 1

            triplet.update({
                'disease': disease,
                'count': entry.get('count', 0),
                'source': pubmed_ids,
                'no_of_titles': title_count,
                'no_of_abstracts': abstract_count,
            })

            rows.append(triplet)
            info['triplet_count'] += 1

    # Full triplet DataFrame
    df = pd.DataFrame(rows)

    # Build summary DataFrame with the four columns
    summary_rows = []
    for disease, info in disease_info.items():
        summary_rows.append({
            'Disease': disease,
            'initial_pubmed_count': info['initial_pubmed_count'],
            'unique_pubmed_ids': len(info['unique_pubmed_ids']),
            'triplet_count': info['triplet_count']
        })
    summary_df = pd.DataFrame(summary_rows)

    return df, summary_df


In [None]:

def format_disease(name):
    parts = name.split('_')
    out = []
    for p in parts:
        p_low = p.lower()
        # detect roman numerals and uppercase them
        if re.fullmatch(r'(i|ii|iii|iv|v|vi|vii|viii|ix|x)', p_low):
            out.append(p_low.upper())
        # keep “type” lowercase
        elif p_low == 'type':
            out.append('type')
        # otherwise capitalize first letter
        else:
            out.append(p_low.capitalize())
    return ' '.join(out)


In [31]:
# Usage:
base_path = '../data'
full_df, disease_summary = load_and_combine_json(base_path)
disease_summary['Disease'] = disease_summary['Disease'].apply(format_disease)


In [32]:
disease_summary

Unnamed: 0,Disease,initial_pubmed_count,unique_pubmed_ids,triplet_count
0,Dravet Syndrome,99,98,443
1,Familial Mediterranean Fever,100,92,436
2,Loeys Dietz Syndrome,35,34,141
3,Donnai-barrow Syndrome,3,3,20
4,Primary Ciliary Dyskinesia,98,93,423
5,Camurati Engelmann,14,13,42
6,Sickle Cell,2000,1858,7236
7,Gaucher Disease,99,98,408
8,Celiac Disease,2000,1842,6899
9,Canavan Disease,39,38,142


In [28]:
full_df

Unnamed: 0,maxo,maxo_label,non_grounded_maxo,potential_maxo,relationship,hpo,hpo_label,non_grounded_hpo,potential_hpo,mondo,...,non_grounded_mondo,potential_mondo,maxo_qualifier,chebi,hpo_extension,disease,count,source,no_of_titles,no_of_abstracts
0,,,add-on therapy,"[{'id': 'MAXO:0001298', 'label': 'therapy'}]",treats,,,seizure frequency,"[{'id': 'HP:0001250', 'label': 'Seizure'}, {'i...",,...,drug-resistant focal epilepsy,"[{'id': 'MONDO:0002754', 'label': 'epilepsy'},...",,chebi:228488,,dravet_syndrome,2,"[32468572, 36066395]",2,2
1,,,use of cannabidiol,[],treats,hp:0001250,seizure,,[],mondo:0100135,...,,[],,chebi:69478,,dravet_syndrome,2,"[38167335, 38183688]",2,2
2,,,access to syndrome-specific treatment options,[],prevents,,,mismanagement,"[{'id': 'HP:0001658', 'label': 'MI'}]",,...,dravet syndrome,"[{'id': 'MONDO:0002254', 'label': 'Dravet synd...",,,,dravet_syndrome,1,[34268891],1,1
3,,,acute hyperthermia-induced seizure test,[],treats,,,hyperthermia-induced seizures,"[{'id': 'HP:0001250', 'label': 'Seizure'}, {'i...",mondo:0100135,...,,[],,,,dravet_syndrome,1,[39212337],1,1
4,,,add-on stiripentol treatment,[],treats,,,seizure frequency,"[{'id': 'HP:0001250', 'label': 'Seizure'}, {'i...",,...,refractory epilepsies,"[{'id': 'MONDO:0002754', 'label': 'EP'}, {'id'...",with stiripentol,chebi:228488,,dravet_syndrome,1,[31630399],1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23781,maxo:0000571,computed tomography procedure,,[],prevents,,,open-bite deformity,[],mondo:0019354,...,,[],,,,stickler_syndrome,1,[26540157],1,1
23782,maxo:0000571,computed tomography procedure,,[],prevents,hp:0000689,dental malocclusion,,[],mondo:0009869,...,,[],,,,stickler_syndrome,1,[26540157],1,1
23783,maxo:0001085,vitrectomy,,[],treats,hp:0012230,rhegmatogenous retinal detachment,,[],mondo:0019354,...,,[],,,,stickler_syndrome,1,[20545219],1,1
23784,maxo:0009072,surgical repair,,[],treats,hp:0000541,retinal detachment,,[],,...,chronic pediatric retinal detachment,"[{'id': 'MONDO:0008090', 'label': 'retinal det...",,scleral buckling,,stickler_syndrome,1,[35972836],1,1


In [17]:
df.head()

Unnamed: 0,maxo,maxo_label,non_grounded_maxo,potential_maxo,relationship,hpo,hpo_label,non_grounded_hpo,potential_hpo,mondo,...,non_grounded_mondo,potential_mondo,maxo_qualifier,chebi,hpo_extension,disease,count,source,no_of_titles,no_of_abstracts
0,,,add-on therapy,"[{'id': 'MAXO:0001298', 'label': 'therapy'}]",treats,,,seizure frequency,"[{'id': 'HP:0001250', 'label': 'Seizure'}, {'i...",,...,drug-resistant focal epilepsy,"[{'id': 'MONDO:0002754', 'label': 'epilepsy'},...",,chebi:228488,,dravet_syndrome,2,"[32468572, 36066395]",2,2
1,,,use of cannabidiol,[],treats,hp:0001250,seizure,,[],mondo:0100135,...,,[],,chebi:69478,,dravet_syndrome,2,"[38167335, 38183688]",2,2
2,,,access to syndrome-specific treatment options,[],prevents,,,mismanagement,"[{'id': 'HP:0001658', 'label': 'MI'}]",,...,dravet syndrome,"[{'id': 'MONDO:0002254', 'label': 'Dravet synd...",,,,dravet_syndrome,1,[34268891],1,1
3,,,acute hyperthermia-induced seizure test,[],treats,,,hyperthermia-induced seizures,"[{'id': 'HP:0001250', 'label': 'Seizure'}, {'i...",mondo:0100135,...,,[],,,,dravet_syndrome,1,[39212337],1,1
4,,,add-on stiripentol treatment,[],treats,,,seizure frequency,"[{'id': 'HP:0001250', 'label': 'Seizure'}, {'i...",,...,refractory epilepsies,"[{'id': 'MONDO:0002754', 'label': 'EP'}, {'id'...",with stiripentol,chebi:228488,,dravet_syndrome,1,[31630399],1,1


In [18]:
# Stept 2: Counting all the NANs

# Assuming df is your DataFrame
df = df.replace(['None'], np.nan)

# Replace empty lists with NaN in columns that can contain lists
for column in df.columns:
    df[column] = df[column].apply(lambda x: np.nan if x == [] else x)



In [19]:

# Total sum of titles and abstracts
total_titles = df['no_of_titles'].sum()
total_abstracts = df['no_of_abstracts'].sum()

# Count of unique values in maxo, hpo, mondo
unique_maxo = df['maxo'].nunique()
unique_hpo = df['hpo'].nunique()
unique_mondo = df['mondo'].nunique()

# Count of total values in maxo, hpo, mondo 
total_maxo = df['maxo'].count()
total_hpo = df['hpo'].count()
total_mondo = df['mondo'].count()


# Total count of non-grounded ontologies
non_grounded_columns = [col for col in df.columns if 'non_grounded' in col]
total_non_grounded = df[non_grounded_columns].notna().sum().sum()

# Total count of potential ontologies identified
potential_columns = [col for col in df.columns if 'potential' in col]
total_potential = df[potential_columns].notna().sum().sum()


# Count non-null values in non-grounded columns for maxo, hpo, mondo
non_grounded_maxo_count = df['non_grounded_maxo'].notna().sum()
non_grounded_hpo_count = df['non_grounded_hpo'].notna().sum()
non_grounded_mondo_count = df['non_grounded_mondo'].notna().sum()

# Count non-null values in potential columns for maxo, hpo, mondo
potential_maxo_count = df['potential_maxo'].notna().sum()
potential_hpo_count = df['potential_hpo'].notna().sum()
potential_mondo_count = df['potential_mondo'].notna().sum()



# Output results
print("Total number of titles:", total_titles)
print("Total number of abstracts:", total_abstracts)
print("Unique MAXO values:", unique_maxo)
print("Total MAXO values:", total_maxo)
print("Unique HPO values:", unique_hpo)
print("Total HPO values:", total_hpo)
print("Unique MONDO values:", unique_mondo)
print("Total MONDO values:", total_mondo)
print("Total non-grounded ontologies:", total_non_grounded)


print("Non-grounded MAXO count:", non_grounded_maxo_count)
print("Non-grounded HPO count:", non_grounded_hpo_count)
print("Non-grounded MONDO count:", non_grounded_mondo_count)

print("Total potential ontologies identified:", total_potential)
print("Potential MAXO count:", potential_maxo_count)
print("Potential HPO count:", potential_hpo_count)
print("Potential MONDO count:", potential_mondo_count)



Total number of titles: 24189
Total number of abstracts: 24189
Unique MAXO values: 198
Total MAXO values: 2374
Unique HPO values: 800
Total HPO values: 4797
Unique MONDO values: 490
Total MONDO values: 15194
Total non-grounded ontologies: 48993
Non-grounded MAXO count: 21412
Non-grounded HPO count: 18989
Non-grounded MONDO count: 8592
Total potential ontologies identified: 19018
Potential MAXO count: 6239
Potential HPO count: 6124
Potential MONDO count: 6655
