# Phenopacket store statistics

This notebook performs a quality assessment and calculates descriptive statistics for a phenopacket-store release.

Note: 

We recommend installing Phenopacket Store Toolkit into the notebook kernel:

```shell
python3 -m pip install phenopacket-store-toolkit[release]
```

In [7]:
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Lift the pandas display limits so that long cell values, rows and columns
# are rendered in full instead of being truncated.
for display_option in ('max_colwidth', 'max_rows', 'max_columns'):
    pd.set_option(f'display.{display_option}', None)

The input file is the ZIP file that is or will be added to each release.

The ZIP file can be generated by running:

```shell
python3 -m ppktstore package --notebook-dir notebooks --release-tag 0.1.19 --output all_phenopackets
```

assuming `phenopacket-store-toolkit` is installed in the active environment and `notebooks` points to the Phenopacket Store notebook directory.


In [11]:
import os
from ppktstore.model import PhenopacketStore

# Directory holding the phenopacket notebook folders. The default is the
# location used for the original analysis; set the PPKT_INPUT_DIR environment
# variable to run the notebook against a copy stored elsewhere without
# editing this cell.
input_dir = os.environ.get("PPKT_INPUT_DIR", "/Users/leonardo/data/4917_poly_ppkts/")
# NOTE(review): "jsons/" is presumably the sub-folder of each cohort directory
# that contains the phenopacket JSON files (the summary table below lists
# paths such as "cohortdir/jsons/...") - confirm against the ppktstore docs.
store = PhenopacketStore.from_notebook_dir(input_dir, "jsons/")

Now we can summarize statistics of the individuals described in the phenopackets, their phenotypic features, diseases, and genotypes.

In [12]:
from ppktstore.release.stats import PPKtStoreStats

# Wrap the loaded store in the statistics helper; later cells call
# `stats.get_summary_df()` and `stats.get_descriptive_stats()` on it.
stats = PPKtStoreStats(store)

The summary table, sorted by gene symbol (first five rows shown):

In [13]:
# Fetch the summary table, then order its rows by gene symbol.
summary_unsorted = stats.get_summary_df()
df = summary_unsorted.sort_values(by='gene')
# Preview the first rows (pandas' default of five).
df.head()

Unnamed: 0,patient_id,cohort,disease_id,disease,gene,allele_1,allele_2,PMID,filename
2194,Family 1 proband,cohortdir,OMIM:148600,"Keratoderma, palmoplantar, punctate type IA",AAGAB,NM_024666.5:c.505_506dup,,PMID:28239884,cohortdir/jsons/PMID_28239884_Family1proband.json
155,Family 2 proband,cohortdir,OMIM:148600,"Keratoderma, palmoplantar, punctate type IA",AAGAB,NM_024666.5:c.473del,,PMID:28239884,cohortdir/jsons/PMID_28239884_Family2proband.json
3427,Family 3 proband,cohortdir,OMIM:148600,"Keratoderma, palmoplantar, punctate type IA",AAGAB,NM_024666.5:c.870+1G>A,,PMID:28239884,cohortdir/jsons/PMID_28239884_Family3proband.json
85,II.2,cohortdir,OMIM:601718,Retinitis pigmentosa 19,ABCA4,NM_000350.3:c.1938-1G>A,,PMID:10874631,cohortdir/jsons/PMID_10874631_II2.json
886,PATIENT II.2,cohortdir,OMIM:301310,"Anemia, sideroblastic, and spinocerebellar ataxia",ABCB7,NM_001271696.3:c.1231G>C,,PMID:11118249,cohortdir/jsons/PMID_11118249_PATIENTII2.json


## Individual statistics

In [14]:
from ppktstore.release.stats import summarize_individuals

# Build a per-individual table from the store. The preview below shows the
# columns id, sex, age_in_days, age_in_years and vital_status.
individuals_df = summarize_individuals(store)
# Preview the first 10 individuals.
individuals_df.head(10)

Unnamed: 0,id,sex,age_in_days,age_in_years,vital_status
0,PMID_34722527_individual_048-051_1_Thaddeus_P__Dryja_Null RPGRIP1 Al-individual_048-051_1_Thaddeus_P__Dryja_Null RPGRIP1 Al,UNKNOWN_SEX,,,
1,PMID_23407777_23407777_P1-23407777_P1,FEMALE,44.0,0.120465,
2,"PMID_31239556_individual_22_father-individual 22, father",MALE,11322.75,31.0,
3,PMID_29469822_Family_4_II-2-Family 4 II-2,MALE,4.0,0.010951,
4,"PMID_31021519_SATB2_47_from_Zarate_et_al__2018a__Bengani_et_al-SATB2-47 from Zarate et al., 2018a; Bengani et al.",MALE,2556.75,7.0,
5,PMID_37196654_Individual_5-Individual 5,MALE,9131.25,25.0,
6,PMID_29290338_Family_UAB_R45201FN_101_individual_RS-Family UAB-R45201FN.101 individual RS,MALE,1461.0,4.0,UNKNOWN_STATUS
7,"PMID_36446582_Novara_2017_P2-Novara, 2017_P2",MALE,,,
8,PMID_29122497_29122497_P8-29122497_P8,MALE,300.0,0.821355,
9,STX_EG1010P-STX_EG1010P,UNKNOWN_SEX,1461.0,4.0,


### Summary statistics


#### Sex
The number of males and females in all case report collections.

In [15]:
# Map each reported category to the sex label used in `individuals_df.sex`.
sex_codes = {'males': 'MALE', 'females': 'FEMALE', 'unknown': 'UNKNOWN_SEX'}

# Count matches with the vectorized Series.sum() instead of Python's built-in
# sum(), which walks the Series element by element. The int() cast keeps the
# dictionary values plain Python ints, so they display as bare numbers.
sex_summary = {
    label: int((individuals_df.sex == code).sum())
    for label, code in sex_codes.items()
}
sex_summary

{'males': 1826, 'females': 1590, 'unknown': 1500}

In [16]:
# Individuals with a known sex, and all individuals regardless of sex label.
n_w_sex = sex_summary['males'] + sex_summary['females']
n_all_sex_labels = sum(sex_summary.values())

# Share of individuals with a known sex, relative to everyone.
perc_w_sex = (100 * n_w_sex) / n_all_sex_labels
# Male/female split, relative to those with a known sex only.
perc_males, perc_females = (
    (100 * sex_summary[key]) / n_w_sex for key in ('males', 'females')
)

f'{n_w_sex} ({perc_w_sex:.1f}%) had the sex specified ({perc_males:.1f}% males, {perc_females:.1f}% females)'

'3416 (69.5%) had the sex specified (53.5% males, 46.5% females)'


#### Age
The number and percentage of subjects with the age information available.

In [17]:
def _count_with_share(count, total):
    """Format `count` followed by its percentage of `total`, e.g. '1972 (40.1%)'."""
    return f'{count} ({count * 100 / total:.1f}%)'


n_individuals = len(individuals_df)
# An individual has no age when `age_in_days` is missing.
n_no_age = sum(individuals_df.age_in_days.isna())
n_w_age = n_individuals - n_no_age

age_summary = {
    'individuals with no age': _count_with_share(n_no_age, n_individuals),
    'individuals with age': _count_with_share(n_w_age, n_individuals),
}
age_summary

{'individuals with no age': '1972 (40.1%)',
 'individuals with age': '2944 (59.9%)'}

In [18]:
# Release tag shown in the 'version' row of the table. It is set here because
# no other cell of the notebook defines `release_tag`, so this cell would
# otherwise fail with a NameError on a fresh kernel. The value matches the
# tag used in the packaging command near the top of the notebook.
release_tag = "0.1.19"

stats_d = stats.get_descriptive_stats(version=release_tag)
# One row per statistic: its name in 'item', its value in 'value'.
items = [{"item": k, "value": v} for k, v in stats_d.items()]
pd.DataFrame(items)


Unnamed: 0,item,value
0,version,0.1.19
1,phenopackets,4916
2,diseases,360
3,genes,326
4,alleles,2899
5,PMIDs,706
6,individuals per gene (max),456
7,individuals per gene (min),1
8,individuals per gene (mean),15.079755
9,individuals per gene (median),4.0


In [None]:
# For GPT-4o the results are:
# (named `gpt4o_results_dir` rather than `dir` so the built-in dir() is not shadowed)
gpt4o_results_dir = '/Users/leonardo/git/malco/final_multilingual_output'
langs = ["en", "es", "fr", "de", "it", "nl", "ja", "zh", "tr", "cs"]
fn = "full_df_results.tsv"

# Read the 'term' column of each per-language result file. The frames are
# collected in a list and concatenated once: growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on every pass.
# Using a comprehension also leaves the summary frame `df` from the earlier
# cell untouched.
per_lang_terms = [
    pd.read_csv(os.path.join(gpt4o_results_dir, lang, fn), sep='\t', usecols=['term'])
    for lang in langs
]
all_dfs = pd.concat(per_lang_terms, ignore_index=True)

# Print the number of lines and the number of unique terms
print(f"Number of lines: {len(all_dfs)}")
print(f"Number of unique lines: {len(all_dfs['term'].unique())}")

Number of MONDO IDs: 233563
Number of unique terms: 6919


In [None]:
import ast

# For Meditron3 the results are:
# (named `meditron_prefix` rather than `dir` so the built-in dir() is not shadowed;
# the value is a path prefix to which "<lang><suffix>" is appended)
meditron_prefix = '/Users/leonardo/git/malco/data/results/BUP_multilingual_main/full_results/full_df_'
langs = ["en", "es", "fr", "de", "it", "nl", "ja", "zh", "tr", "cs"]
suffix = "-Meditron3_70b.tsv"

# Flat list of every 'grounded_id' found in the replies, across all languages
all_replies = []
# for each language, read the result tsv file
for lang in langs:
    # The 'scored' column holds the string representation of a list of dictionaries
    scored_column = pd.read_csv(meditron_prefix + lang + suffix, sep='\t', usecols=['scored'])['scored']
    # NOTE(review): this used to be eval(). ast.literal_eval only accepts
    # Python literals, so a malformed or malicious cell in the TSV cannot
    # execute code; it raises ValueError/SyntaxError instead.
    for scored_items in scored_column.apply(ast.literal_eval):
        # Keep the 'grounded_id' of every dictionary entry that has one
        all_replies.extend(
            item['grounded_id']
            for item in scored_items
            if isinstance(item, dict) and 'grounded_id' in item
        )

# Print the number of lines and the number of unique terms
print(f"Number of lines: {len(all_replies)}")
print(f"Number of unique lines: {len(set(all_replies))}")

Number of MONDO IDs: 311359
Number of unique terms: 3155


In [None]:
# Unique terms from the GPT-4o result files, as a set
unique_terms = set(all_dfs['term'].unique().tolist())
# Unique grounded IDs from the Meditron3 replies, as a set
unique_replies = set(all_replies)
# Union of the two sets: every term that occurs in at least one of them.
# (Previously stored in a variable named `intersection`, although the
# operation performed and reported is a union.)
union_terms = unique_terms | unique_replies
# Print the number of unique terms in the union
print(f"Number of unique lines in the union: {len(union_terms)}")

Number of unique terms in the union: 8273
