<H1>Creation of phenopackets from PMID:31278393</H1>
<P>In this notebook, we show how to create phenopackets from table 1 of <a href="https://pubmed.ncbi.nlm.nih.gov/31278393/" target="__blank">Dyment DA et al. (2019) De novo substitutions of TRPM3 cause intellectual disability and epilepsy. Eur J Hum Genet. 27:1611-1618</a>. Please refer to the main tutorial notebooks for additional explanations.</P>

In [1]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import os
import sys

sys.path.insert(0, os.path.abspath('../../pyphetools'))
from pyphetools.creation import *

In [2]:
# Import HPO data
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199")
metadata.default_versions_with_hpo(version=hpo_version)

In [3]:
df = pd.read_excel('data/PMID_31278393.xlsx')

In [4]:
df

Unnamed: 0,Individual,1,2,3,4,5,6,7,8
0,cDNA (NM_020952.4),c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2810C>A
1,Polypeptide (NP_066003.3),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Pro937Gln)
2,Genomic DNA (NC_000009.11),g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73168145G>T
3,Zygosity,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous
4,Segregation,De novo,De novo,De novo,De novo,De novo,De novo,De novo,De novo
5,Clinical features,,,,,,,,
6,Gestation (weeks),38,40,42,39,38 + 3,40,39,Term
7,Perinatal history,C/S,N,N,N,N,N,C/S,C/S (repeat)
8,Birth weight (kg),NR,3.6,3.2,3.48,3.378,3.89,3.1,2.9
9,Sex,M,M,F,M,M,M,M,F


In [5]:
# Convert to row based format
dft = df.transpose()
dft.columns = dft.iloc[0]
dft.drop(dft.index[0], inplace=True)
dft.head()
# Note that the Individual is now the row index but we need it to be available as a column
# Therefore, add it as an explicit, new column
dft['patient_id'] = dft.index
dft.head()

Individual,cDNA (NM_020952.4),Polypeptide (NP_066003.3),Genomic DNA (NC_000009.11),Zygosity,Segregation,Clinical features,Gestation (weeks),Perinatal history,Birth weight (kg),Sex,...,Craniofacial gestalt,Morphological features,Other clinical features,Brain MRI,Apparent heat or pain insensitivity,Genetic investigations,aCGH,Fragile X,Other (nondiagnostic) genetic investigations,patient_id
1,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,38,C/S,NR,M,...,Nondysmorphic,"Broad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingers",C1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU),Possible mild cerebral volume loss,+ (Heat),,Normal,Normal,"ID panel (170 genes), PHF6",1
2,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,40,N,3.6,M,...,Nondysmorphic,"Short philtrum, long nose, turricephaly",EMG/NCS normal,Normal,NR,,Normal,Normal,NR,2
3,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,42,N,3.2,F,...,Nondysmorphic,NR,−,Normal,NR,,Normal,Normal,"MECP2, SMA",3
4,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,39,N,3.48,M,...,NR,"Broad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatum",Strabismus,Normal,NR,,Normal,Normal,NR,4
5,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,38 + 3,N,3.378,M,...,NR,"Broad forehead, low nasal bridge, unilateral preauricular pit, short broad thumbs","Cryptorchidism, micropenis, bilateral talipes equinovarus","Ventriculomegaly, nonspecific periventricular white matter hyperintensities",+ (Pain),,Normal,,NR,5


<h2>Column mappers</h2>

In [6]:
column_mapper_d = defaultdict(ColumnMapper)

In [7]:
# Developmental delay/intellectual disability  -- use code to intellectual disability 

severity_id = {'+ (Severe)': 'Intellectual disability, severe',
                 '+ (Moderate)': 'Intellectual disability, moderate',
               '+ (Moderate-to-severe)':'Intellectual disability, moderate'}
idMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=severity_id)
idMapper.preview_column(dft['Developmental delay/intellectual disability'])
column_mapper_d['Developmental delay/intellectual disability'] = idMapper

In [8]:
dft.columns

Index(['cDNA (NM_020952.4) ', 'Polypeptide (NP_066003.3)',
       'Genomic DNA (NC_000009.11)', 'Zygosity', 'Segregation',
       'Clinical features', 'Gestation (weeks)', 'Perinatal history',
       'Birth weight (kg)', 'Sex', 'Age (years)', 'Height (cm)', 'Weight (kg)',
       'BMI (kg/m2)', 'OFC (cm)',
       'Developmental delay/intellectual disability',
       'Ambulate independently (age achieved)', 'Any speech (age attained)',
       'Combine words/signs', 'Toilet independently (age attained)',
       'Autism-like features', 'Electrographically confirmed seizures',
       'Seizure types', 'Age of first clinical seizure',
       'Current anticonvulsant therapy', 'Age of last clinical seizure',
       'Hypotonia', 'Craniofacial gestalt', 'Morphological features',
       'Other clinical features', 'Brain MRI',
       'Apparent heat or pain insensitivity', 'Genetic investigations', 'aCGH',
       'Fragile X', 'Other (nondiagnostic) genetic investigations',
       'patient_id'],
    

In [9]:
# By inspection, all entries of this column indicate delayed ability to walk. Therefore, set constant to True
# the alternative would be to code each of the varied entries
delayedWalkColumn = SimpleColumnMapper(hpo_id='HP:0031936', hpo_label='Delayed ability to walk', constant=True)
#delayedWalkColumn.preview_column(dft['Ambulate independently (age achieved)'])
column_mapper_d['Ambulate independently (age achieved)'] = delayedWalkColumn

In [10]:
## Same comments for speech
delayedSpeechColumn = SimpleColumnMapper(hpo_id='HP:0000750', hpo_label='Delayed speech and language development', constant=True)
# delayedSpeechColumn.preview_column(dft['Any speech (age attained)'])
column_mapper_d['Any speech (age attained)'] = delayedSpeechColumn

In [11]:
## 'Autism-like features' # Autistic behavior HP:
autisticFeaturesMapper = SimpleColumnMapper(hpo_id='HP:0000729', hpo_label='Autistic behavior', observed="+", excluded="−")
#autisticFeaturesMapper.preview_column(dft['Autism-like features'])
column_mapper_d['Autism-like features'] = autisticFeaturesMapper

In [12]:
seizure_d = {'Absence': 'Typical absence seizure',
                 'Infantile spasms': 'Infantile spasms',
               'GTC':'Bilateral tonic-clonic seizure',
                'ESES': 'Status epilepticus'}
seizureMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=seizure_d)
#seizureMapper.preview_column(dft['Seizure types'])
column_mapper_d['Seizure types'] = seizureMapper

In [13]:
# Hypotonia HP:0001252 -- note that we include   + (mixed tone abnormality)  as Hypotonia
hypotoniaMapper = SimpleColumnMapper(hpo_id='HP:0001252', hpo_label='Hypotonia', 
                                     observed=['+', '+ (mixed tone abnormality)'], excluded='−')
#hypotoniaMapper.preview_column(dft['Hypotonia'])
column_mapper_d['Hypotonia'] = hypotoniaMapper

In [14]:
#dft['Morphological features']
morph_d = {
    'bulbous nasal tip': 'Bulbous nose'
}
morphologicalMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=morph_d)
morphologicalMapper.preview_column(dft['Morphological features'])

Unnamed: 0,terms
0,HP:0000414 (Bulbous nose/observed); HP:0000337 (Broad forehead/observed); HP:0000347 (Micrognathia/observed); HP:0000490 (Deeply set eye/observed); HP:0000508 (Ptosis/observed); HP:0001182 (Tapered finger/observed)
1,HP:0000262 (Turricephaly/observed); HP:0000322 (Short philtrum/observed); HP:0003189 (Long nose/observed)
2,
3,HP:0000322 (Short philtrum/observed); HP:0000337 (Broad forehead/observed); HP:0000347 (Micrognathia/observed); HP:0000490 (Deeply set eye/observed); HP:0000767 (Pectus excavatum/observed); HP:0010055 (Broad hallux/observed); HP:0011800 (Midface retrusion/observed); HP:0030084 (Clinodactyly/observed)
4,HP:0000337 (Broad forehead/observed); HP:0004467 (Preauricular pit/observed); HP:0005280 (Depressed nasal bridge/observed); HP:0011304 (Broad thumb/observed)
5,HP:0000218 (High palate/observed); HP:0000347 (Micrognathia/observed)
6,HP:0000414 (Bulbous nose/observed); HP:0000324 (Facial asymmetry/observed); HP:0000347 (Micrognathia/observed); HP:0000470 (Short neck/observed); HP:0000506 (Telecanthus/observed); HP:0000508 (Ptosis/observed)
7,HP:0000154 (Wide mouth/observed); HP:0000322 (Short philtrum/observed); HP:0000337 (Broad forehead/observed); HP:0000463 (Anteverted nares/observed); HP:0000490 (Deeply set eye/observed); HP:0000582 (Upslanted palpebral fissure/observed); HP:0000996 (Facial capillary hemangioma/observed)


In [15]:
other_d = {
    'Chiari I malformation': 'Chiari type I malformation',
    'C1 spinal stenosis':'Cervical spinal canal stenosis'
}
otherMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=other_d)
otherMapper.preview_column(dft['Other clinical features'])

Unnamed: 0,terms
0,HP:0007099 (Chiari type I malformation/observed); HP:0008445 (Cervical spinal canal stenosis/observed); HP:0000473 (Torticollis/observed); HP:0000486 (Strabismus/observed); HP:0000577 (Exotropia/observed); HP:0001357 (Plagiocephaly/observed); HP:0001762 (Talipes equinovarus/observed); HP:0002650 (Scoliosis/observed)
1,
2,
3,HP:0000486 (Strabismus/observed)
4,HP:0000028 (Cryptorchidism/observed); HP:0000054 (Micropenis/observed); HP:0001762 (Talipes equinovarus/observed)
5,HP:0001385 (Hip dysplasia/observed); HP:0001943 (Hypoglycemia/observed); HP:0002650 (Scoliosis/observed)
6,HP:0000486 (Strabismus/observed); HP:0000577 (Exotropia/observed); HP:0001763 (Pes planus/observed); HP:0002305 (Athetosis/observed)
7,HP:0000486 (Strabismus/observed); HP:0001266 (Choreoathetosis/observed); HP:0002650 (Scoliosis/observed)


In [16]:
ageMapper = AgeColumnMapper.by_year('Age (years)')
ageMapper.preview_column(dft['Age (years)'])


Unnamed: 0,original column contents,age
0,16.0,P16Y
1,4.75,P4Y9M
2,6.0,P6Y
3,5.9,P5Y11M
4,6.25,P6Y3M
5,28.0,P28Y
6,38.0,P38Y
7,8.1,P8Y1M


In [17]:
sexMapper = SexColumnMapper(male_symbol='M', female_symbol='F', column_name='Sex')
sexMapper.preview_column(dft['Sex'])

Unnamed: 0,original column contents,sex
0,M,MALE
1,M,MALE
2,F,FEMALE
3,M,MALE
4,M,MALE
5,M,MALE
6,M,MALE
7,F,FEMALE


In [18]:
genome = 'hg38'
default_genotype = 'heterozygous'
transcript='NM_020952.6' # latest version of TRPM3 transcript used in publlication (original: version 4)
# Note there is an extra space at the end of the column name
varMapper = VariantColumnMapper(assembly=genome,column_name='cDNA (NM_020952.4) ', 
                                transcript=transcript, genotype=default_genotype)
#varMapper.preview_column(column=dft['cDNA (NM_020952.4) '])

In [19]:
pmid = "PMID:31278393"
encoder = CohortEncoder(df=dft, hpo_cr=hpo_cr, column_mapper_d=column_mapper_d, 
                        individual_column_name="patient_id", 
                        agemapper=ageMapper, sexmapper=sexMapper,
                        variant_mapper=varMapper,
                        metadata=metadata,
                        pmid=pmid)


CTOR pheno elif pyphe
CTOR pheno metadata is <class 'phenopackets.schema.v2.core.meta_data_pb2.MetaData'>
CTOR pheno metadata is <class 'phenopackets.schema.v2.core.meta_data_pb2.MetaData'>


In [20]:
#individuals = encoder.get_individuals()

In [21]:
output_directory = "phenopackets"
encoder.output_phenopackets(outdir=output_directory)

https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2509G>A/NM_020952.6
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_020952.6%3Ac.2810C>A/NM_020952.6
output pheno metadata is <class 'pyphetools.creation.metadata.MetaData'>


ValueError: metadata argument must be GA4GH Phenopacket Schema MetaData but was <class 'pyphetools.creation.metadata.MetaData'>