<H1>Creation of phenopackets from PMID:31278393</H1>
<P>In this notebook, we show how to create phenopackets from Table 1 of <a href="https://pubmed.ncbi.nlm.nih.gov/31278393/" target="_blank">Dyment DA et al. (2019) De novo substitutions of TRPM3 cause intellectual disability and epilepsy. Eur J Hum Genet. 27:1611-1618</a>. Please refer to the main tutorial notebooks for additional explanations.</P>

In [1]:
# Phenopacket schema bindings plus the protobuf <-> JSON/dict conversion helpers.
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import os
import sys

# Put the directory two levels up named 'pyphetools' at the front of the module search
# path, so that the import below resolves against it first.
# NOTE(review): presumably this points at a local development checkout rather than an
# installed release -- confirm the relative path for your setup.
sys.path.insert(0, os.path.abspath('../../pyphetools'))
# NOTE(review): HpoParser, ColumnMapper, OptionColumnMapper and SimpleColumnMapper used
# below are not imported by name anywhere in this cell, so they presumably come from this
# wildcard import; explicit imports would make that dependency visible.
from pyphetools import *

In [2]:
# Import HPO data
# `hpo_cr` is the concept recognizer handed to the OptionColumnMapper instances below,
# where it turns HPO labels such as 'Typical absence seizure' into terms with HPO ids.
# `parser` is kept as a separate name because later cells may reuse it.
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()

Length of valid_node_curies 16536


In [3]:
# Load the spreadsheet for PMID:31278393 (presumably a transcription of Table 1 of the
# publication). The path is relative to the directory the notebook is run from.
df = pd.read_excel('data/PMID_31278393.xlsx')

In [4]:
# Inspect the raw table: one row per attribute and one column per individual (1-8),
# i.e. the transpose of the row-per-individual layout needed by the column mappers.
df

Unnamed: 0,Individual,1,2,3,4,5,6,7,8
0,cDNA (NM_020952.4),c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2509G>A,c.2810C>A
1,Polypeptide (NP_066003.3),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Val837Met),p.(Pro937Gln)
2,Genomic DNA (NC_000009.11),g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73213379C>T,g.73168145G>T
3,Zygosity,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous,Heterozygous
4,Segregation,De novo,De novo,De novo,De novo,De novo,De novo,De novo,De novo
5,Clinical features,,,,,,,,
6,Gestation (weeks),38,40,42,39,38 + 3,40,39,Term
7,Perinatal history,C/S,N,N,N,N,N,C/S,C/S (repeat)
8,Birth weight (kg),NR,3.6,3.2,3.48,3.378,3.89,3.1,2.9
9,Sex,M,M,F,M,M,M,M,F


In [5]:
# Reshape the table so that every individual is a row and every attribute is a column.
dft = df.transpose()
# After transposing, the first row holds the attribute names: promote it to the header,
# then remove that row from the data.
dft.columns = dft.iloc[0]
dft = dft.drop(index=dft.index[0])
# The individual identifiers now exist only as the row index. Copy them into an
# ordinary column so that they can be addressed by name like every other attribute.
dft['patient_id'] = dft.index
dft.head()

Individual,cDNA (NM_020952.4),Polypeptide (NP_066003.3),Genomic DNA (NC_000009.11),Zygosity,Segregation,Clinical features,Gestation (weeks),Perinatal history,Birth weight (kg),Sex,...,Craniofacial gestalt,Morphological features,Other clinical features,Brain MRI,Apparent heat or pain insensitivity,Genetic investigations,aCGH,Fragile X,Other (nondiagnostic) genetic investigations,patient_id
1,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,38,C/S,NR,M,...,Nondysmorphic,"Broad forehead, deeply set eyes, ptosis, bulbous nasal tip, micrognathia, prominent lobule of ear, tapering fingers",C1 spinal stenosis; Chiari I malformation; scoliosis; torticollis; plagiocephaly; thickened filum terminale; bilateral talipes equinovarus; strabismus (exotropia OU),Possible mild cerebral volume loss,+ (Heat),,Normal,Normal,"ID panel (170 genes), PHF6",1
2,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,40,N,3.6,M,...,Nondysmorphic,"Short philtrum, long nose, turricephaly",EMG/NCS normal,Normal,NR,,Normal,Normal,NR,2
3,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,42,N,3.2,F,...,Nondysmorphic,NR,−,Normal,NR,,Normal,Normal,"MECP2, SMA",3
4,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,39,N,3.48,M,...,NR,"Broad forehead, deeply set eyes, flat midface, short philtrum, micrognathia, broad halluces, fifth-finger clinodactyly, pectus excavatum",Strabismus,Normal,NR,,Normal,Normal,NR,4
5,c.2509G>A,p.(Val837Met),g.73213379C>T,Heterozygous,De novo,,38 + 3,N,3.378,M,...,NR,"Broad forehead, low nasal bridge, unilateral preauricular pit, short broad thumbs","Cryptorchidism, micropenis, bilateral talipes equinovarus","Ventriculomegaly, nonspecific periventricular white matter hyperintensities",+ (Pain),,Normal,,NR,5


<h2>Column mappers</h2>

In [6]:
# Registry of column mappers; the cells below key it by the name of the `dft` column
# each mapper applies to.
# NOTE(review): because this is defaultdict(ColumnMapper), looking up a key that was never
# registered calls ColumnMapper() with no arguments instead of raising KeyError -- confirm
# that this is the intended behavior for unmapped columns.
column_mapper_d = defaultdict(ColumnMapper)

In [7]:
# Developmental delay/intellectual disability -- encode the reported severity with the
# corresponding HPO intellectual-disability term.
# Keys are presumably the cell contents as they appear in the column; values are the HPO
# labels that `hpo_cr` is expected to resolve.
severity_id = {
    '+ (Severe)': 'Intellectual disability, severe',
    '+ (Moderate)': 'Intellectual disability, moderate',
    # NOTE(review): the intermediate grade is mapped to the moderate term -- confirm that
    # this is the intended choice.
    '+ (Moderate-to-severe)': 'Intellectual disability, moderate',
}
idMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=severity_id)
column_mapper_d['Developmental delay/intellectual disability'] = idMapper
# Keep the preview as the final expression of the cell: a notebook only displays the value
# of the last expression, so a preview placed before the registration is silently discarded.
idMapper.preview_column(dft['Developmental delay/intellectual disability'])

In [8]:
# List the column names after transposition; the keys used in `column_mapper_d` refer to
# these names. Note that the first name, 'cDNA (NM_020952.4) ', carries a trailing space.
dft.columns

Index(['cDNA (NM_020952.4) ', 'Polypeptide (NP_066003.3)',
       'Genomic DNA (NC_000009.11)', 'Zygosity', 'Segregation',
       'Clinical features', 'Gestation (weeks)', 'Perinatal history',
       'Birth weight (kg)', 'Sex', 'Age (years)', 'Height (cm)', 'Weight (kg)',
       'BMI (kg/m2)', 'OFC (cm)',
       'Developmental delay/intellectual disability',
       'Ambulate independently (age achieved)', 'Any speech (age attained)',
       'Combine words/signs', 'Toilet independently (age attained)',
       'Autism-like features', 'Electrographically confirmed seizures',
       'Seizure types', 'Age of first clinical seizure',
       'Current anticonvulsant therapy', 'Age of last clinical seizure',
       'Hypotonia', 'Craniofacial gestalt', 'Morphological features',
       'Other clinical features', 'Brain MRI',
       'Apparent heat or pain insensitivity', 'Genetic investigations', 'aCGH',
       'Fragile X', 'Other (nondiagnostic) genetic investigations',
       'patient_id'],
    

In [14]:
# Manual inspection showed that every entry in this column describes delayed walking, so
# the term is applied as a constant to all individuals. The alternative would be to code
# each of the varied free-text entries separately.
delayedWalkColumn = SimpleColumnMapper(
    hpo_id='HP:0031936',
    hpo_label='Delayed ability to walk',
    constant=True,
)
# Optional check: delayedWalkColumn.preview_column(dft['Ambulate independently (age achieved)'])
column_mapper_d['Ambulate independently (age achieved)'] = delayedWalkColumn

In [13]:
## Same approach as for walking: the speech column is coded as a constant
## 'Delayed speech and language development' for every individual.
delayedSpeechColumn = SimpleColumnMapper(hpo_id='HP:0000750', hpo_label='Delayed speech and language development', constant=True)
#delayedSpeechColumn.preview_column(dft['Any speech (age attained)'])
# The key must be the dataframe column name, which is 'Any speech (age attained)'
# ("attained", unlike the walking column, which uses "achieved").
column_mapper_d['Any speech (age attained)'] = delayedSpeechColumn

In [19]:
## 'Autism-like features' -> Autistic behavior (HP:0000729)
# "+" marks the term as observed; "−" marks it as excluded. The excluded marker is the
# non-ASCII minus sign used in the table, not the ASCII hyphen-minus "-".
autisticFeaturesMapper = SimpleColumnMapper(
    hpo_id='HP:0000729',
    hpo_label='Autistic behavior',
    observed="+",
    excluded="−",
)
# Optional check: autisticFeaturesMapper.preview_column(dft['Autism-like features'])
column_mapper_d['Autism-like features'] = autisticFeaturesMapper


In [20]:
# Inspect the raw values of this column before deciding how to map it.
dft['Electrographically confirmed seizures']

1              + 
2              + 
3              + 
4              + 
5              + 
6              + 
7    Unconfirmed 
8              + 
Name: Electrographically confirmed seizures, dtype: object

In [21]:
# Inspect the raw seizure descriptions; several entries combine more than one seizure
# type or use abbreviations (GTC, ESES), and one is 'NR'.
dft['Seizure types']

1                        Absence 
2               Infantile spasms 
3                            GTC 
4    Subclinical, including ESES 
5                             NR 
6                Absence and GTC 
7                        Absence 
8                        Absence 
Name: Seizure types, dtype: object

In [23]:
## TODO FIX THIS
# Translate the seizure descriptions/abbreviations of the 'Seizure types' column into HPO labels.
# Open issues behind the TODO:
#   * The preview returns fewer terms than there are individuals: apparently nothing is
#     produced for the combined entries 'Absence and GTC' and 'Subclinical, including ESES'.
#   * NOTE(review): ESES usually denotes an EEG pattern during sleep -- confirm that
#     'Status epilepticus' is the intended HPO term.
#   * The mapper is not registered in `column_mapper_d` in this cell.
seizure_d = {
    'Absence': 'Typical absence seizure',
    'Infantile spasms': 'Infantile spasms',
    'GTC': 'Bilateral tonic-clonic seizure',
    'ESES': 'Status epilepticus',
}
seizureMapper = OptionColumnMapper(
    concept_recognizer=hpo_cr,
    option_d=seizure_d,
)
seizureMapper.preview_column(dft['Seizure types'])

Unnamed: 0,term,status
0,Typical absence seizure (HP:0011147),observed
1,Infantile spasms (HP:0012469),observed
2,Bilateral tonic-clonic seizure (HP:0002069),observed
3,Typical absence seizure (HP:0011147),observed
4,Typical absence seizure (HP:0011147),observed
