# HPOA File creator
This notebook shows how to convert a collection of phenopackets into a file that can be use for the phenotype.hpoa resource. The latter provides aggregate frequency data for each source.

For this example we first create a collection of phenopackets and then transform them into a file for the HPOA.

Our example will be [Yogev Y, et al. (2023) Limb girdle muscular disease caused by HMGCR mutation and statin myopathy treatable with mevalonolactone. Proc Natl Acad Sci U S A.  120:e2217831120 PMID: 36745799](https://pubmed.ncbi.nlm.nih.gov/36745799/).

In [1]:
import phenopackets as PPKt
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
import os, sys
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
pd.set_option('display.max_rows', None)
from collections import defaultdict

from phenopackets import Phenopacket
from google.protobuf.json_format import Parse
import json
import hpotk

from IPython.display import display, HTML
from pyphetools.creation import *
from pyphetools.visualization import *
from pyphetools.validation import *

import importlib.metadata
__version__ = importlib.metadata.version("pyphetools")
print(f"Using pyphetools version {__version__}")

Using pyphetools version 0.8.7


In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
hpo_ontology = parser.get_ontology()
PMID="PMID:36745799"
title = "Limb girdle muscular disease caused by HMGCR mutation and statin myopathy treatable with mevalonolactone"
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199", pmid=PMID, pubmed_title=title)
metadata.default_versions_with_hpo(version=hpo_version)

In [3]:
data = "data/LGMDR28.xlsx"
df = pd.read_excel(data)

In [4]:
df.head()

Unnamed: 0,INDIVIDUAL,V:2,V:5,V:8,V:9,V:12,V:13
0,SEX,F,M,M,M,F,M
1,AGE_AT_EXAMINATION,49,58,37,42,51,41
2,AGE_AT_ONSET,31,39,24,33,31,34
3,PROXIMAL_STRENGTH-UPPER_LIMB,0/5,3/5,5/5,5/5,2/5,3/5
4,PROXIMAL_STRENGTH-LOWER_LIMB,0/5,2/5,5/5,4/5,2/5,4/5


In [5]:
dft = df.transpose()
dft.columns = dft.iloc[0]
dft.drop(dft.index[0], inplace=True)
dft['patient_id'] = dft.index  # Set the new column 'patient_id' to be identical to the contents of the index
dft.head() 

INDIVIDUAL,SEX,AGE_AT_EXAMINATION,AGE_AT_ONSET,PROXIMAL_STRENGTH-UPPER_LIMB,PROXIMAL_STRENGTH-LOWER_LIMB,ATROPHY_UPPER_LIMB,ATROPHY_LOWER_LIMB,DEEP_TENDON_REFLEXES,PAIN_ON_EXERTION,AMBULATORY,...,VLDL,FASTING_BLOOD_SUGAR,"ANA,RF,C3,C4ABNORMALITIES","ANTI-SM,ANTIJO-1,ANTI-SSA/B,ANCA,AMA",ANTI-HMGCR_AB,ABNORMAL_BRAIN_IMAGING,MYOPATHIC_CHANGES_IN_EMG,ABNORMAL_NCV,COMORBIDITIES,patient_id
V:2,F,49,31,0/5,0/5,Marked,Marked,Absent,+,-,...,17(9-26),390,-,-,-,-,+,-,Insulindependentdiabetes-onsetatage19,V:2
V:5,M,58,39,3/5,2/5,Marked,Marked,Diminished,+,-,...,25(15-46),123,-,,-,-,+,(+)L4-5radiculopathy,"COPD,Diastolicdysfunction,ICRBBB,Lymphocytosis",V:5
V:8,M,37,24,5/5,5/5,-,-,+,+,+,...,19,127,-,,-,,,,,V:8
V:9,M,42,33,5/5,4/5,-,-,+,+,+,...,22(12-32),111,,,-,-,,,ICRBBB,V:9
V:12,F,51,31,2/5,2/5,Evident,Evident,Diminished,+,-,...,30(11-154),124,-,-,-,-,+,-,Single kidney,V:12


In [6]:
column_mapper_d = {}

In [7]:
# Only needed to stream construction of mappers
#result = OptionColumnMapper.autoformat(df=dft, concept_recognizer=hpo_cr, delimiter=",")
#print(result)

In [8]:
ageMapper = AgeColumnMapper.by_year(column_name='AGE_AT_EXAMINATION')
#ageColumn.preview_column(dft['AGE_AT_EXAMINATION'])

In [9]:
sexMapper = SexColumnMapper(male_symbol="M", female_symbol="F", column_name="SEX")
#sexColumn.preview_column(dft["SEX"])

In [10]:
proximal_strength_upper_limb_d = {
 '3/5': 'Proximal muscle weakness in upper limbs',
 '5/5': 'Proximal muscle weakness in upper limbs',
 '2/5': 'Proximal muscle weakness in upper limbs'}
excluded_d = {'0/5': 'Proximal muscle weakness in upper limbs'}
proximal_strength_upper_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=proximal_strength_upper_limb_d, excluded_d=excluded_d)
proximal_strength_upper_limbMapper.preview_column(dft['PROXIMAL_STRENGTH-UPPER_LIMB'])
column_mapper_d['PROXIMAL_STRENGTH-UPPER_LIMB'] = proximal_strength_upper_limbMapper

In [11]:
proximal_strength_lower_limb_d = {
 '2/5': 'Proximal muscle weakness in lower limbs',
 '5/5': 'Proximal muscle weakness in lower limbs',
 '4/5': 'Proximal muscle weakness in lower limbs'}
excluded = {'0/5': 'Proximal muscle weakness in lower limbs'}
proximal_strength_lower_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=proximal_strength_lower_limb_d, excluded_d=excluded)
proximal_strength_lower_limbMapper.preview_column(dft['PROXIMAL_STRENGTH-LOWER_LIMB'])
column_mapper_d['PROXIMAL_STRENGTH-LOWER_LIMB'] = proximal_strength_lower_limbMapper

In [12]:
atrophy_upper_limb_d = {'Marked': 'Upper limb amyotrophy',
 'Evident': 'Upper limb amyotrophy'}
atrophy_upper_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=atrophy_upper_limb_d)
atrophy_upper_limbMapper.preview_column(dft['ATROPHY_UPPER_LIMB'])
column_mapper_d['ATROPHY_UPPER_LIMB'] = atrophy_upper_limbMapper

In [13]:
atrophy_lower_limb_d = {'Marked': 'Lower limb amyotrophy',
 'Evident': 'Lower limb amyotrophy'}
atrophy_lower_limbMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=atrophy_lower_limb_d)
atrophy_lower_limbMapper.preview_column(dft['ATROPHY_LOWER_LIMB'])
column_mapper_d['ATROPHY_LOWER_LIMB'] = atrophy_lower_limbMapper

In [14]:
deep_tendon_reflexes_d = {'Absent': 'Areflexia',
 'Diminished': 'Hyporeflexia'}
deep_tendon_reflexesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=deep_tendon_reflexes_d)
deep_tendon_reflexesMapper.preview_column(dft['DEEP_TENDON_REFLEXES'])
column_mapper_d['DEEP_TENDON_REFLEXES'] = deep_tendon_reflexesMapper

In [15]:
#Exercise-induced myalgia HP:0003738
pain_on_exertion_d = {"+":"Exercise-induced myalgia"}
pain_on_exertionMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=pain_on_exertion_d)
pain_on_exertionMapper.preview_column(dft['PAIN_ON_EXERTION'])
column_mapper_d['PAIN_ON_EXERTION'] = pain_on_exertionMapper

In [16]:
# Loss of ambulation HP:0002505
ambulatory_d = {}
ambulatoryMapper = SimpleColumnMapper(hpo_id="HP:0002505", hpo_label="Loss of ambulation", observed="+", excluded="-")
ambulatoryMapper.preview_column(dft['AMBULATORY'])
column_mapper_d['AMBULATORY'] = ambulatoryMapper

In [17]:
# dft['MOBILITY_RESTRICTION'])
# Not coding

In [18]:
# Respiratory insufficiency HP:0002093
respiratory_difficulties_d = {'Ventilated_through_tracheostomy': 'Respiratory insufficiency', "+":"Respiratory insufficiency"}
excluded = {"-":"Respiratory insufficiency"}
respiratory_difficultiesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=respiratory_difficulties_d, excluded_d=excluded)
respiratory_difficultiesMapper.preview_column(dft['RESPIRATORY_DIFFICULTIES'])
column_mapper_d['RESPIRATORY_DIFFICULTIES'] = respiratory_difficultiesMapper

In [19]:
dysphagia_d = {}
dysphagiaMapper = SimpleColumnMapper(hpo_id="HP:0002015", hpo_label="Dysphagia", observed="+", excluded="-")
dysphagiaMapper.preview_column(dft['DYSPHAGIA'])
column_mapper_d['DYSPHAGIA'] = dysphagiaMapper

In [20]:
echocardiography_d = {
 'Mild diastolic_dysfunction': 'Left ventricular diastolic dysfunction',
}
echocardiographyMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=echocardiography_d)
echocardiographyMapper.preview_column(dft['ECHOCARDIOGRAPHY'])
column_mapper_d['ECHOCARDIOGRAPHY'] = echocardiographyMapper

In [21]:
cpkMapper = ConstantColumnMapper(hpo_id="HP:0003236", hpo_label='Elevated circulating creatine kinase concentration')
cpkMapper.preview_column(dft['CPK(REFERENCE_20-180_U/L)'])
column_mapper_d['CPK(REFERENCE_20-180_U/L)'] = cpkMapper

In [22]:
troponinMapper = ThresholdedColumnMapper(hpo_id="HP:0410174", 
                                         hpo_label="Increased circulating troponin T concentration", 
                                         threshold=14, 
                                         call_if_above=True)
troponinMapper.preview_column(dft['MAXIMAL TROPONIN T(0-14NG/L)'])
column_mapper_d['MAXIMAL TROPONIN T(0-14NG/L)'] = troponinMapper

In [23]:
# Not abnormal for any individual
creatinineMapper = ConstantColumnMapper(hpo_id="HP:0012100", hpo_label="Abnormal circulating creatinine concentration", excluded=True)
creatinineMapper.preview_column(dft['CREATININE'])
column_mapper_d['CREATININE'] = creatinineMapper

In [24]:
ast_d = {'34(12-106)': 'Elevated circulating aspartate aminotransferase concentration',
 '54(15-241)': 'Elevated circulating aspartate aminotransferase concentration',
 '277(68-905)': 'Elevated circulating aspartate aminotransferase concentration',
 '43(21-138)': 'Elevated circulating aspartate aminotransferase concentration',
 '98(28-566)': 'Elevated circulating aspartate aminotransferase concentration'}
excluded = {'23(19-29)': 'Elevated circulating aspartate aminotransferase concentration'}
astMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=ast_d, excluded_d=excluded)
astMapper.preview_column(dft['AST(REFERENCE_0-35U/L)'])
column_mapper_d['AST(REFERENCE_0-35U/L)'] = astMapper

In [25]:
alt_d = {'31(9-113)': 'Elevated circulating alanine aminotransferase concentration',
 '50(10-199)': 'Elevated circulating alanine aminotransferase concentration',
 '322(43-911)': 'Elevated circulating alanine aminotransferase concentration',
 '44(12-173)': 'Elevated circulating alanine aminotransferase concentration',
 '80(21-375)': 'Elevated circulating alanine aminotransferase concentration'}
excluded = {'15(11-25)': 'Elevated circulating alanine aminotransferase concentration'}
altMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=alt_d, excluded_d=excluded)
altMapper.preview_column(dft['ALT(REFERENCE_0-45U/L)'])
column_mapper_d['ALT(REFERENCE_0-45U/L)'] = altMapper

In [26]:
alkaline_phosphatase_d = {'151(108-331)': 'Elevated circulating alkaline phosphatase concentration',
 }
excluded = {'109(78-130)': 'Elevated circulating alkaline phosphatase concentration',
 '78(67-88)': 'Elevated circulating alkaline phosphatase concentration',
 '89(65-107)': 'Elevated circulating alkaline phosphatase concentration',
 '100(72-132)': 'Elevated circulating alkaline phosphatase concentration',
 '79(68-94)': 'Elevated circulating alkaline phosphatase concentration'}
alkaline_phosphataseMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=alkaline_phosphatase_d, excluded_d=excluded)
alkaline_phosphataseMapper.preview_column(dft['ALKALINE_PHOSPHATASE(REFERENCE_30-120U/L)'])
column_mapper_d['ALKALINE_PHOSPHATASE(REFERENCE_30-120U/L)'] = alkaline_phosphataseMapper

In [27]:
total_cholesterol_averageMapper = ConstantColumnMapper(hpo_id="HP:0003124", hpo_label="Hypercholesterolemia", excluded=True)
total_cholesterol_averageMapper.preview_column(dft['TOTAL_CHOLESTEROL_AVERAGE(RECOMMENDED<200MG/DL)'])
column_mapper_d['TOTAL_CHOLESTEROL_AVERAGE(RECOMMENDED<200MG/DL)'] = total_cholesterol_averageMapper

In [28]:
#Hypertriglyceridemia HP:0002155
triglycerides_d = {
 '149(55-270)': 'Hypertriglyceridemia',
 '167(77-232)': 'Hypertriglyceridemia'}
excluded = {'87(47-129)': 'Hypertriglyceridemia',
 '123(79-230)': 'Hypertriglyceridemia',
 '95.5(95-96)': 'Hypertriglyceridemia',
 '108(58-160)': 'Hypertriglyceridemia'
           }
triglyceridesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=triglycerides_d, excluded_d=excluded)
triglyceridesMapper.preview_column(dft['TRIGLYCERIDES(RECOMMENDED<150MG/DL)'])
column_mapper_d['TRIGLYCERIDES(RECOMMENDED<150MG/DL)'] = triglyceridesMapper

In [29]:
hdl_d = {'49(31-65)': 'Decreased HDL cholesterol concentration',
 '49(43-57)': 'Decreased HDL cholesterol concentration',
 '38(30-46)': 'Decreased HDL cholesterol concentration',
 '45(31-50)': 'Decreased HDL cholesterol concentration',
 '41(27-49)': 'Decreased HDL cholesterol concentration'}
hdlMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=hdl_d)
hdlMapper.preview_column(dft['HDL(RECOMMENDED>60MG/DL)'])
column_mapper_d['HDL(RECOMMENDED>60MG/DL)'] = hdlMapper

In [30]:
#Increased LDL cholesterol concentration HP:0003141
# no individual has increased LDL
ldlMapper = ConstantColumnMapper(hpo_id="HP:0003141", hpo_label="Increased LDL cholesterol concentration", excluded=True)
ldlMapper.preview_column(dft['LDL(RECOMMENDED<100MG/DL)'])
column_mapper_d['LDL(RECOMMENDED<100MG/DL)'] = ldlMapper

In [31]:
# Normal range between 2-30, we will omit since the values are not clearly abnormal
# dft['VLDL'])

In [32]:
# normal range 70-100
fasting_blood_sugar_d = {'390': 'Hyperglycemia',
 '123': 'Hyperglycemia',
 '127': 'Hyperglycemia',
 '111': 'Hyperglycemia',
 '124': 'Hyperglycemia',
 '155': 'Hyperglycemia'}
fasting_blood_sugarMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=fasting_blood_sugar_d)
fasting_blood_sugarMapper.preview_column(dft['FASTING_BLOOD_SUGAR'])
column_mapper_d['FASTING_BLOOD_SUGAR'] = fasting_blood_sugarMapper

In [33]:
ana_d = {'-': 'Antinuclear antibody positivity'}
anaMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d={},excluded_d=ana_d)
anaMapper.preview_column(dft['ANA,RF,C3,C4ABNORMALITIES'])
column_mapper_d['ANA,RF,C3,C4ABNORMALITIES'] = anaMapper

In [34]:
# negative in two datalackjong in others scant details, thius omitting
#dft['ANTI-SM,ANTIJO-1,ANTI-SSA/B,ANCA,AMA']

In [35]:
# Anti-3-hydroxy- 3-methylglutaryl-coA reductase antibody positivity
HMGCR_d = {"-": "Anti-3-hydroxy- 3-methylglutaryl-coA reductase antibody positivity"}
ana_d = {'-': 'Antinuclear antibody positivity'}
hmgcrMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d={},excluded_d=HMGCR_d)
hmgcrMapper.preview_column(dft['ANTI-HMGCR_AB'])
column_mapper_d['ANTI-HMGCR_AB'] = hmgcrMapper

In [36]:
# no abnormal finding but no details, omit
#dft['ABNORMAL_BRAIN_IMAGING']

In [37]:
myopathic_changes_in_emg_d = {'+': 'Myopathy'}
myopathic_changes_in_emgMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=myopathic_changes_in_emg_d)
myopathic_changes_in_emgMapper.preview_column(dft['MYOPATHIC_CHANGES_IN_EMG'])
column_mapper_d['MYOPATHIC_CHANGES_IN_EMG'] = myopathic_changes_in_emgMapper

In [38]:
comorbidities_d = {'Insulindependentdiabetes-onsetatage19': 'Type II diabetes mellitus',
 'COPD': 'Chronic pulmonary obstruction',
 'Diastolicdysfunction': 'Left ventricular diastolic dysfunction',
 'Lymphocytosis': 'Lymphocytosis',
 'Single kidney': 'Unilateral renal agenesis'}
comorbiditiesMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=comorbidities_d, assumeExcluded=True)
comorbiditiesMapper.preview_column(dft['COMORBIDITIES'])
column_mapper_d['COMORBIDITIES'] = comorbiditiesMapper

# Variant
The affected members of the family share this variant: NM_000859.3:c.2465G>A; p.(G822D) in HMGCR

In [39]:
vvalidator = VariantValidator(genome_build="hg38", transcript="NM_000859.3")

In [40]:
var = vvalidator.encode_hgvs("c.2465G>A")
var.set_heterozygous()

https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000859.3%3Ac.2465G>A/NM_000859.3?content-type=application%2Fjson


# Cohort mapper

In [41]:
encoder = CohortEncoder(df=dft, 
                        hpo_cr=hpo_cr, 
                        column_mapper_d=column_mapper_d, 
                        individual_column_name="patient_id", 
                        agemapper=ageMapper, 
                        sexmapper=sexMapper,
                        metadata=metadata,
                        pmid=PMID)
omim_label = "Muscular dystrophy, limb-girdle, autosomal recessive 28"
omim_id = "OMIM:620375"
disease = Disease(disease_id=omim_id, disease_label=omim_label)
encoder.set_disease(disease)

In [42]:
individuals = encoder.get_individuals()

In [43]:
for indi in individuals:
    indi.add_variant(var)

# Validation

In [44]:
cohort_validator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1, min_allele=1, min_var=1)

In [45]:
validated = cohort_validator.get_validated_individual_list()

In [46]:
qcViz = QcVisualizer(ontology=hpo_ontology)

In [47]:
display(HTML(qcViz.to_html(validated_individual_list=validated)))

In [48]:
# Visualization

In [54]:
individuals = [v.get_individual_with_clean_terms() for v in validated]
ppacket_list = [i.to_ga4gh_phenopacket(metadata=metadata.to_ga4gh()) for i in individuals]
table = PhenopacketTable(phenopacket_list=ppacket_list)
display(HTML(table.to_html()))

Individual,Disease,Genotype,Phenotypic features
V:2 (FEMALE; P49Y),"Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)",NM_000859.3:c.2465G>A (heterozygous),Upper limb amyotrophy (HP:0009129); Lower limb amyotrophy (HP:0007210); Areflexia (HP:0001284); Exercise-induced myalgia (HP:0003738); Respiratory insufficiency (HP:0002093); Elevated circulating creatine kinase concentration (HP:0003236); Increased circulating troponin T concentration (HP:0410174); Elevated circulating aspartate aminotransferase concentration (HP:0031956); Elevated circulating alanine aminotransferase concentration (HP:0031964); Elevated circulating alkaline phosphatase concentration (HP:0003155); Decreased HDL cholesterol concentration (HP:0003233); Hyperglycemia (HP:0003074); Myopathy (HP:0003198); Type II diabetes mellitus (HP:0005978)
V:5 (MALE; P58Y),"Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)",NM_000859.3:c.2465G>A (heterozygous),Proximal muscle weakness in upper limbs (HP:0008997); Proximal muscle weakness in lower limbs (HP:0008994); Upper limb amyotrophy (HP:0009129); Lower limb amyotrophy (HP:0007210); Hyporeflexia (HP:0001265); Exercise-induced myalgia (HP:0003738); Respiratory insufficiency (HP:0002093); Left ventricular diastolic dysfunction (HP:0025168); Elevated circulating creatine kinase concentration (HP:0003236); Increased circulating troponin T concentration (HP:0410174); Elevated circulating aspartate aminotransferase concentration (HP:0031956); Elevated circulating alanine aminotransferase concentration (HP:0031964); Decreased HDL cholesterol concentration (HP:0003233); Hyperglycemia (HP:0003074); Myopathy (HP:0003198); Chronic pulmonary obstruction (HP:0006510); Lymphocytosis (HP:0100827)
V:8 (MALE; P37Y),"Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)",NM_000859.3:c.2465G>A (heterozygous),Proximal muscle weakness in upper limbs (HP:0008997); Proximal muscle weakness in lower limbs (HP:0008994); Exercise-induced myalgia (HP:0003738); Loss of ambulation (HP:0002505); Respiratory insufficiency (HP:0002093); Elevated circulating creatine kinase concentration (HP:0003236); Elevated circulating aspartate aminotransferase concentration (HP:0031956); Elevated circulating alanine aminotransferase concentration (HP:0031964); Decreased HDL cholesterol concentration (HP:0003233); Hyperglycemia (HP:0003074)
V:9 (MALE; P42Y),"Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)",NM_000859.3:c.2465G>A (heterozygous),Proximal muscle weakness in upper limbs (HP:0008997); Proximal muscle weakness in lower limbs (HP:0008994); Exercise-induced myalgia (HP:0003738); Loss of ambulation (HP:0002505); Elevated circulating creatine kinase concentration (HP:0003236); Increased circulating troponin T concentration (HP:0410174); Decreased HDL cholesterol concentration (HP:0003233); Hyperglycemia (HP:0003074)
V:12 (FEMALE; P51Y),"Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)",NM_000859.3:c.2465G>A (heterozygous),Proximal muscle weakness in upper limbs (HP:0008997); Proximal muscle weakness in lower limbs (HP:0008994); Upper limb amyotrophy (HP:0009129); Lower limb amyotrophy (HP:0007210); Hyporeflexia (HP:0001265); Exercise-induced myalgia (HP:0003738); Elevated circulating creatine kinase concentration (HP:0003236); Increased circulating troponin T concentration (HP:0410174); Elevated circulating aspartate aminotransferase concentration (HP:0031956); Elevated circulating alanine aminotransferase concentration (HP:0031964); Hypertriglyceridemia (HP:0002155); Hyperglycemia (HP:0003074); Myopathy (HP:0003198); Unilateral renal agenesis (HP:0000122)
V:13 (MALE; P41Y),"Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)",NM_000859.3:c.2465G>A (heterozygous),Proximal muscle weakness in upper limbs (HP:0008997); Proximal muscle weakness in lower limbs (HP:0008994); Upper limb amyotrophy (HP:0009129); Hyporeflexia (HP:0001265); Exercise-induced myalgia (HP:0003738); Loss of ambulation (HP:0002505); Respiratory insufficiency (HP:0002093); Elevated circulating creatine kinase concentration (HP:0003236); Elevated circulating aspartate aminotransferase concentration (HP:0031956); Elevated circulating alanine aminotransferase concentration (HP:0031964); Hypertriglyceridemia (HP:0002155); Decreased HDL cholesterol concentration (HP:0003233); Hyperglycemia (HP:0003074); Myopathy (HP:0003198)


In [55]:
Individual.output_individuals_as_phenopackets(individual_list=individuals,
                                              metadata=metadata,
                                              pmid=PMID,
                                              outdir="phenopackets")

We output 6 GA4GH phenopackets to the directory phenopackets


# Transform to HPOA-style aggregate file
The HPO project provides the phenotype.hpoa file that contains aggregate HPO associations for currently over 8.5 thousand diseases. Here, we show how to take a collection of phenopackets and create a HPOA file in the format of the phenotype.hpoa file that can be added to that resource.

The strategy is to read in a collection of phenopackets (we will use the above 6 phenopackets) and to output one line for each HPO mentioned in the data.

In [56]:
creator = HpoaTableCreator(phenopacket_list=ppacket_list)
df = creator.get_dataframe()
df.head()

[pyphetools] Ingested 6 GA4GH phenopackets.
We found a total of 29 HPO terms
extracted PubMed identifier: PMID:36745799
Extracted disease: Muscular dystrophy, limb-girdle, autosomal recessive 28 (OMIM:620375)


Unnamed: 0,#diseaseID,diseaseName,phenotypeID,phenotypeName,onsetID,onsetName,frequency,sex,negation,modifier,description,publication,evidence,biocuration
0,OMIM:620375,"Muscular dystrophy, limb-girdle, autosomal recessive 28",HP:0410174,Increased circulating troponin T concentration,,,4/6,,,,,PMID:36745799,PCS,ORCID:0000-0002-0736-9199
1,OMIM:620375,"Muscular dystrophy, limb-girdle, autosomal recessive 28",HP:0003493,Antinuclear antibody positivity,,,,,,,,PMID:36745799,PCS,ORCID:0000-0002-0736-9199
2,OMIM:620375,"Muscular dystrophy, limb-girdle, autosomal recessive 28",HP:0009129,Upper limb amyotrophy,,,4/4,,,,,PMID:36745799,PCS,ORCID:0000-0002-0736-9199
3,OMIM:620375,"Muscular dystrophy, limb-girdle, autosomal recessive 28",HP:0005978,Type II diabetes mellitus,,,1/6,,,,,PMID:36745799,PCS,ORCID:0000-0002-0736-9199
4,OMIM:620375,"Muscular dystrophy, limb-girdle, autosomal recessive 28",HP:0001265,Hyporeflexia,,,3/3,,,,,PMID:36745799,PCS,ORCID:0000-0002-0736-9199


In [57]:
name = disease.id.replace(":", "_") + ".tab"
df.to_csv(name, sep="\t", index=False)

In [58]:
! cat OMIM_620375.tab  

#diseaseID	diseaseName	phenotypeID	phenotypeName	onsetID	onsetName	frequency	sex	negation	modifier	description	publication	evidence	biocuration
OMIM:620375	Muscular dystrophy, limb-girdle, autosomal recessive 28	HP:0410174	Increased circulating troponin T concentration			4/6					PMID:36745799	PCS	ORCID:0000-0002-0736-9199
OMIM:620375	Muscular dystrophy, limb-girdle, autosomal recessive 28	HP:0003493	Antinuclear antibody positivity								PMID:36745799	PCS	ORCID:0000-0002-0736-9199
OMIM:620375	Muscular dystrophy, limb-girdle, autosomal recessive 28	HP:0009129	Upper limb amyotrophy			4/4					PMID:36745799	PCS	ORCID:0000-0002-0736-9199
OMIM:620375	Muscular dystrophy, limb-girdle, autosomal recessive 28	HP:0005978	Type II diabetes mellitus			1/6					PMID:36745799	PCS	ORCID:0000-0002-0736-9199
OMIM:620375	Muscular dystrophy, limb-girdle, autosomal recessive 28	HP:0001265	Hyporeflexia			3/3					PMID:36745799	PCS	ORCID:0000-0002-0736-9199
OMIM:620375	Muscular dystrophy, limb-girdle, autosomal