<h1>Genotype–phenotype correlation at codon 1740 of SETD2</h1>
<p>Generate phenopackets from the data reported in <a href="https://pubmed.ncbi.nlm.nih.gov/33766796/">Chen et al., (2021)</a>.</p>

In [1]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
from pyphetools.creation import *
from pyphetools.creation.simple_column_mapper import try_mapping_columns
import numpy as np

In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199")
metadata.default_versions_with_hpo(version=hpo_version)

In [3]:
df = pd.read_table('./input/chen21_setd2.csv').astype(str)
df

Unnamed: 0,Patient,1,2,3,4,5,8,9,10,11,12,14,16,17,19
0,Sex,female,male,female,male,male,male,female,male,male,male,male,male,female,male
1,Weight.age.measured,,+10.28SD,+3SD,,1.14SD,-2SD,,0.2SD,+1.79SD,4SD,–,+1.5SD,+0.96SD,
2,Height.age.measured,+0.5SD,+3.14SD,,+3SD,+0.25SD,+2SD,,+2.5SD,1.14SD,2.8SD,0.61SD,+2.5SD,+1.79SD,+0.53SD
3,Speech delay,+,+,,+,+,+,+,+,,+,+,+,+,–
4,Motor delay,+,+,+,+,–,–,+,+,–,,+,+,–,
5,Intellectual disability,,+,,+,+,+,+,,+,,+,,,
6,Macrocephaly,+,+,+,+,–,+,+,+,,+,–,+,–,+
7,ASD,–,+,+,–,+,–,+,–,+,–,+,–,+,+
8,Recurrent otitis media,+,,+,,,,,+,+,,,,,+
9,Seizure,,–,+,,–,,+,,,,,,-,


In [4]:
dft = df.transpose()
dft.columns = dft.iloc[0]
dft.drop(dft.index[0], inplace=True)
dft['patient_id'] = dft.index
dft.head()

Patient,Sex,Weight.age.measured,Height.age.measured,Speech delay,Motor delay,Intellectual disability,Macrocephaly,ASD,Recurrent otitis media,Seizure,...,Hypotonia,Accelerated osseous maturation,Anxiety,ADHD,Obsessive behavior,Aggressive behavior,Self-injury behavior,Gastrointestinal disturbance,Variant,patient_id
1,female,,+0.5SD,+,+,,+,–,+,,...,–,+,,,,+,+,,c.6775del,1
2,male,+10.28SD,+3.14SD,+,+,+,+,+,,–,...,,,,,+,+,,,c.6471T>A,2
3,female,+3SD,,,+,,+,+,+,+,...,+,,,,,,,+,c.6341del,3
4,male,,+3SD,+,+,+,+,–,,,...,+,+,,,,,,,c.5285_5286del,4
5,male,1.14SD,+0.25SD,+,–,+,–,+,,–,...,,,-,+,+,+,-,-,c.4715+1G>A,5


In [5]:
hpo_cr = parser.get_hpo_concept_recognizer()



In [6]:
items = {
    'Speech delay': ["Delayed speech and language development", "HP:0000750"], 
    'Motor delay': ['Motor delay', 'HP:0001270'],
    'Intellectual disability': ['Intellectual disability', 'HP:0001249'],
    'Macrocephaly': ['Macrocephaly', 'HP:0000256'],
    'ASD': ['Autism', 'HP:0000717'],
    'Recurrent otitis media': ['Recurrent otitis media','HP:0000403'],
    'Seizure': ['Seizure', 'HP:0001250'],
    'Facial deformity': ['Abnormal facial shape', 'HP:0001999'],
    'Hypotonia': ['Hypotonia', 'HP:0001252'],
    'Accelerated osseous maturation': ['Accelerated skeletal maturation','HP:0005616'],
    'Anxiety': ['Anxiety','HP:0000739'],
    'ADHD': ['Attention deficit hyperactivity disorder','HP:0007018'],
    'Obsessive behavior': ['Compulsive behaviors','HP:0000722'],
    'Aggressive behavior': ['Aggressive behavior','HP:0000718'],
    'Self-injury behavior': ['Self-injurious behavior','HP:0100716'],
}
item_column_mapper_d = hpo_cr.initialize_simple_column_maps(column_name_to_hpo_label_map=items, observed='+',
    excluded='-')
print(f"We created {len(item_column_mapper_d)} simple column mappers")

We created 15 simple column mappers


<h2>Transcript/Variant mapping</h2>

In [7]:
setd2_transcript = "NM_014159.7"
genome = 'hg38'
default_genotype = 'heterozygous'
varMapper = VariantColumnMapper(assembly=genome,column_name='Variant', 
                                transcript=setd2_transcript, default_genotype=default_genotype)

In [8]:
# Ages not available
sexMapper = SexColumnMapper(male_symbol='male', female_symbol='female', column_name='Sex')
#sexMapper.preview_column(dft['Sex'])

In [9]:
pmid = "PMID:33766796"  # Chen et al, 2021
encoder = CohortEncoder(df=dft, hpo_cr=hpo_cr, column_mapper_d=item_column_mapper_d, 
                        individual_column_name="patient_id", sexmapper=sexMapper,
                       variant_mapper=varMapper, metadata=metadata,
                       pmid=pmid)
encoder.set_disease(disease_id='OMIM:616831', label='Luscan-Lumish syndrome')

In [10]:
individuals = encoder.get_individuals()

https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.6775del/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.6471T>A/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.6341del/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.5285_5286del/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.4715+1G>A/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.4404dupA/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.2028del/NM_014159.7?content-type=application%2Fjson
https

In [11]:
# Preview
i1 = individuals[0]
phenopacket1 = i1.to_ga4gh_phenopacket(metadata=metadata.to_ga4gh())
json_string = MessageToJson(phenopacket1)
print(json_string)

{
  "id": "1",
  "subject": {
    "id": "1",
    "sex": "FEMALE"
  },
  "phenotypicFeatures": [
    {
      "type": {
        "id": "HP:0000750",
        "label": "Delayed speech and language development"
      },
      "onset": {
        "age": {
          "iso8601duration": "NOT_PROVIDED"
        }
      }
    },
    {
      "type": {
        "id": "HP:0001270",
        "label": "Motor delay"
      },
      "onset": {
        "age": {
          "iso8601duration": "NOT_PROVIDED"
        }
      }
    },
    {
      "type": {
        "id": "HP:0000256",
        "label": "Macrocephaly"
      },
      "onset": {
        "age": {
          "iso8601duration": "NOT_PROVIDED"
        }
      }
    },
    {
      "type": {
        "id": "HP:0000403",
        "label": "Recurrent otitis media"
      },
      "onset": {
        "age": {
          "iso8601duration": "NOT_PROVIDED"
        }
      }
    },
    {
      "type": {
        "id": "HP:0001999",
        "label": "Abnormal facial shape"
 

In [12]:
output_directory = "phenopackets"
encoder.output_phenopackets(outdir=output_directory)

https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.6775del/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.6471T>A/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.6341del/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.5285_5286del/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.4715+1G>A/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.4404dupA/NM_014159.7?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_014159.7%3Ac.2028del/NM_014159.7?content-type=application%2Fjson
https