In [1]:
import typing
import os

import hpotk
from phenopackets import Phenopacket

from genophenocorr.patient import PhenopacketPatientCreator
from genophenocorr.phenotype import PhenotypeCreator
from genophenocorr.protein import UniprotProteinMetadataService, ProteinAnnotationCache, ProtCachingFunctionalAnnotator
from genophenocorr.variant import VarCachingFunctionalAnnotator, VariantAnnotationCache, VepFunctionalAnnotator
from genophenocorr.cohort import PhenopacketCohortCreator, CohortAnalysis


In [2]:
# Notebook configuration: input locations and identifiers used by the cells below.
fpath_hpo = 'hpo_data/hp.json'            # HPO ontology file passed to the ontology loader
cache_dir = 'KBG/annotations'             # directory handed to the annotation caches
fpath_phenopackets = 'KBG/phenopackets'   # location passed to the cohort creator
tx_id = 'NM_013275.6'                     # transcript of interest for the analyses
# NOTE(review): Q15327 looks like the UniProt accession of ANKRD1, whereas the
# transcript above belongs to ANKRD11 (UniProt Q6UB99) - confirm before this
# value is used anywhere.
protein_id = 'Q15327'

# Create the cache directory together with any missing parent directories.
# `exist_ok=True` keeps the cell idempotent on re-runs and avoids the
# check-then-create race of `if not isdir: mkdir`; a plain `os.mkdir` would
# raise FileNotFoundError when the parent `KBG/` directory is absent.
os.makedirs(cache_dir, exist_ok=True)

In [3]:
# Load the HPO ontology from the file configured in `fpath_hpo`.
hpo: hpotk.ontology.Ontology = hpotk.ontology.load.obographs.load_ontology(fpath_hpo)

# Instantiate every validator against the loaded ontology, keeping the order
# in which they are handed to the validation runner.
validators = [
    make_validator(hpo)
    for make_validator in (
        hpotk.validate.AnnotationPropagationValidator,
        hpotk.validate.ObsoleteTermIdsValidator,
        hpotk.validate.PhenotypicAbnormalityValidator,
    )
]

# The phenotype creator receives the ontology plus a runner wrapping the validators.
phenotype_creator = PhenotypeCreator(
    hpo,
    hpotk.validate.ValidationRunner(validators),
)

In [4]:
# Protein metadata: a UniProt-backed metadata service (`pm`) and an annotation
# cache rooted at `cache_dir` (`pac`) are combined into the caching protein
# annotator `pfa`.
# NOTE(review): presumably `pfa` consults the cache before calling the
# service - confirm against the genophenocorr implementation.
pm = UniprotProteinMetadataService()
pac = ProteinAnnotationCache(cache_dir)
pfa = ProtCachingFunctionalAnnotator(pac, pm)

# Functional annotator: the VEP annotator is given the protein annotator, and
# is itself wrapped together with a variant cache. Note that the variant cache
# is rooted at the same `cache_dir` as the protein cache above.
vac = VariantAnnotationCache(cache_dir)
vep = VepFunctionalAnnotator(pfa)
vfa = VarCachingFunctionalAnnotator(vac, vep)


# Assemble the patient creator from the phenotype creator and the caching
# variant annotator; `pc` is consumed by the cohort creator in a later cell.
pc = PhenopacketPatientCreator(phenotype_creator, vfa)

In [5]:
# Wrap the patient creator in a cohort creator; `cc` builds the cohort in the next cell.
cc = PhenopacketCohortCreator(pc)

In [6]:
# Build the cohort from the location in `fpath_phenopackets`.
# The recorded run emitted warnings for patients that had no HPO term or no
# variant (see the output below).
patientCohort = cc.create_cohort(fpath_phenopackets)

Expected at least one HPO term per patient, but received none for patient VanDongen2019_P2
Expected at least one HPO term per patient, but received none for patient VanDongen2019_P12
Expected at least one HPO term per patient, but received none for patient Reuter2020
Expected at least one HPO term per patient, but received none for patient Novara, 2017_P10
Expected at least one variant per patient, but received none for patient Parenti2016_P1
Expected at least one HPO term per patient, but received none for patient VanDongen2019_P13
Expected at least one HPO term per patient, but received none for patient VanDongen2019_P8
Expected at least one HPO term per patient, but received none for patient VanDongen2019_P4
Expected at least one HPO term per patient, but received none for patient VanDongen2019_P5
Expected at least one HPO term per patient, but received none for patient KBG31B
Expected at least one HPO term per patient, but received none for patient VanDongen2019_P9
Expected at leas

In [7]:
# Show the phenotypes in the cohort; the recorded output is a list of
# (HPO term id, count) pairs ordered from the highest count to the lowest.
patientCohort.list_all_phenotypes()

[('HP:0006482', 224),
 ('HP:0011446', 220),
 ('HP:0001249', 194),
 ('HP:0001155', 189),
 ('HP:0012758', 176),
 ('HP:0004322', 150),
 ('HP:0010938', 134),
 ('HP:0000534', 126),
 ('HP:0000343', 121),
 ('HP:0000365', 97),
 ('HP:0000325', 83),
 ('HP:0000356', 77),
 ('HP:0007018', 61),
 ('HP:0000729', 56)]

In [8]:
# Show the variants in the cohort; the recorded output is a list of
# (variant key, count) pairs ordered from the highest count to the lowest.
patientCohort.list_all_variants()

[('16_89284634_GTGTTT/G', 34),
 ('16_89284129_CTTTTT/C', 10),
 ('16_89284140_TTTTC/T', 9),
 ('16_89285157_GTTTC/G', 8),
 ('16_89275181_-/G', 5),
 ('16_89279750_-/G', 5),
 ('16_89217282_deletion', 4),
 ('16_89182742_deletion', 4),
 ('16_89277486_deletion', 4),
 ('16_89284345_G/A', 3),
 ('16_89284363_CTTTG/C', 3),
 ('16_89274958_C/G', 3),
 ('16_89282136_C/T', 3),
 ('16_89284565_G/C', 3),
 ('16_89284358_GAT/G', 3),
 ('16_89283314_CCTTT/C', 3),
 ('16_89284524_duplication', 3),
 ('16_89282710_T/A', 3),
 ('16_88197356_deletion', 3),
 ('16_89284209_TTCTC/T', 2),
 ('16_89321706_deletion', 2),
 ('16_89282947_CTTTTT/C', 2),
 ('16_89280029_-/G', 2),
 ('16_89095277_deletion', 2),
 ('16_89283233_-/T', 2),
 ('16_89275128_G/A', 2),
 ('16_89282834_CTGTT/C', 2),
 ('16_89262070_deletion', 2),
 ('16_89283496_CG/C', 2),
 ('16_89285153_TTTTG/T', 2),
 ('16_89280752_G/T', 2),
 ('16_89228900_deletion', 2),
 ('16_89268636_C/A', 2),
 ('16_89282455_G/A', 2),
 ('16_89281054_C/A', 2),
 ('16_89282158_-/T', 2),
 ('1

In [9]:
# Show the identifiers of all patients in the cohort (the recorded output is a list of strings).
patientCohort.list_all_patients()

['Goldenberg2016_P13',
 'Low, 2016_P9 (1)',
 'VanDongen2019_P2',
 'Low, 2016_P29 (27)',
 'Ockeloen2015_P20',
 'Gnazzo, 2020_P29',
 'Gnazzo, 2020_P4',
 'Kutkowska-Kazmierczak2021_P22',
 'Low, 2016_P5 (6)',
 'VanDongen2019_P12',
 'Low, 2016_P2 (26)',
 'Walz2015_PC',
 'Ockeloen2015_P5',
 'Novara, 2017_P1',
 'Reuter2020',
 'KBG57',
 'Goldenberg2016_P29',
 'Scarano, 2013_P6',
 'Kutkowska-Kazmierczak2021_P18',
 'Novara, 2017_P10',
 'KBG41',
 'Gnazzo, 2020_P13',
 'KBG16',
 'Kutkowska-Kazmierczak2021_P3',
 'Khalifa, 2013_P1B',
 'KBG36',
 'KBG10B',
 'KBG20',
 'Gnazzo, 2020_P8',
 'Gnazzo, 2020_P25',
 'Goldenberg2016_P1',
 'Parenti2021_P13',
 'Goldenberg2016_P25',
 'Ockeloen2015_P9',
 'Sirmaci2011_P2/F1? (previously published Tekin, 2004)',
 'Goldenberg2016_P33',
 'Kutkowska-Kazmierczak2021_P14',
 'Parenti2016_P1',
 'Ockeloen2015_P16',
 'Ockeloen2015_P17',
 'Kutkowska-Kazmierczak2021_P15',
 'Goldenberg2016_P32',
 'Bucerzan2020',
 'Ockeloen2015_P8',
 'Goldenberg2016_P24',
 'Parenti2021_P12',
 'Mur

In [10]:
# Variant-effect counts for the transcript of interest only. The transcript is
# taken from the `tx_id` constant of the configuration cell instead of repeating
# the literal, so changing the transcript there updates this cell as well.
# The recorded output maps the transcript id to a Counter of variant effects.
patientCohort.list_data_by_tx(tx_id)

{'NM_013275.6': Counter({'stop_gained': 52,
          'coding_sequence_variant': 43,
          '5_prime_UTR_variant': 42,
          'intron_variant': 54,
          'feature_truncation': 51,
          'feature_elongation': 1,
          'frameshift_variant': 97,
          'stop_lost': 28,
          '3_prime_UTR_variant': 29,
          'missense_variant': 7,
          'inframe_deletion': 2,
          'splice_donor_variant': 2,
          'splice_acceptor_variant': 2,
          'transcript_ablation': 14,
          'transcript_amplification': 1,
          'splice_region_variant': 2,
          'downstream_gene_variant': 1})}

In [11]:
# Called without a transcript: the recorded output contains one Counter of
# variant effects per transcript id, covering several transcripts.
patientCohort.list_data_by_tx()

{'NM_001384941.1': Counter({'transcript_ablation': 8,
          'transcript_amplification': 1,
          'coding_sequence_variant': 1,
          '5_prime_UTR_variant': 1,
          'intron_variant': 1,
          'feature_truncation': 1}),
 'NM_001256183.2': Counter({'stop_gained': 52,
          'coding_sequence_variant': 43,
          '5_prime_UTR_variant': 42,
          'intron_variant': 54,
          'feature_truncation': 51,
          'feature_elongation': 1,
          'frameshift_variant': 97,
          'stop_lost': 28,
          '3_prime_UTR_variant': 29,
          'missense_variant': 7,
          'inframe_deletion': 2,
          'splice_donor_variant': 2,
          'splice_acceptor_variant': 2,
          'transcript_ablation': 14,
          'transcript_amplification': 1,
          'splice_region_variant': 2,
          'downstream_gene_variant': 1}),
 'NM_001173541.2': Counter({'transcript_ablation': 8,
          'transcript_amplification': 1,
          'coding_sequence_variant': 

In [12]:
# Show the protein metadata attached to the cohort; the recorded output is a
# set of ProteinMetadata entries (id, label, features) for many different proteins.
# NOTE(review): presumably the unrelated proteins come from variants spanning
# several genes - confirm.
patientCohort.all_proteins

{ProteinMetadata(id=NP_000092.2, label=Cytochrome b-245 light chain, features=(SimpleProteinFeature(type=FeatureType.REGION, info=FeatureInfo(name=Disordered, start=134, end=195)),)),
 ProteinMetadata(id=NP_000476.1, label=Adenine phosphoribosyltransferase, features=()),
 ProteinMetadata(id=NP_000503.1, label=N-acetylgalactosamine-6-sulfatase, features=(SimpleProteinFeature(type=FeatureType.REGION, info=FeatureInfo(name=Catalytic domain, start=27, end=379)),)),
 ProteinMetadata(id=NP_000968.2, label=60S ribosomal protein L13, features=()),
 ProteinMetadata(id=NP_001012777.1, label=Cytoplasmic tRNA 2-thiolation protein 2, features=(SimpleProteinFeature(type=FeatureType.REGION, info=FeatureInfo(name=Disordered, start=1, end=24)), SimpleProteinFeature(type=FeatureType.REGION, info=FeatureInfo(name=Disordered, start=188, end=217)))),
 ProteinMetadata(id=NP_001012780.1, label=Cytoplasmic tRNA 2-thiolation protein 2, features=(SimpleProteinFeature(type=FeatureType.REGION, info=FeatureInfo(na

In [13]:
# Set up the genotype-phenotype analysis for the transcript of interest. The
# transcript comes from the `tx_id` constant of the configuration cell rather
# than a repeated literal, keeping every analysis cell on the same transcript.
analysis = CohortAnalysis(patientCohort, tx_id, include_unmeasured=False)
# Display the HPO terms that will be tested.
# NOTE(review): `_testing_hpo_terms` is a private attribute (leading
# underscore) - switch to a public accessor if the library offers one.
analysis._testing_hpo_terms

{Phenotype(identifier=HP:0000325, observed=True),
 Phenotype(identifier=HP:0000343, observed=True),
 Phenotype(identifier=HP:0000356, observed=True),
 Phenotype(identifier=HP:0000365, observed=True),
 Phenotype(identifier=HP:0000534, observed=True),
 Phenotype(identifier=HP:0000729, observed=True),
 Phenotype(identifier=HP:0001155, observed=True),
 Phenotype(identifier=HP:0001249, observed=True),
 Phenotype(identifier=HP:0004322, observed=True),
 Phenotype(identifier=HP:0006482, observed=True),
 Phenotype(identifier=HP:0007018, observed=True),
 Phenotype(identifier=HP:0010938, observed=True),
 Phenotype(identifier=HP:0011446, observed=True),
 Phenotype(identifier=HP:0012758, observed=True)}

In [14]:
from genophenocorr.constants import variant_effects

In [15]:
# Compare phenotype frequencies between the groups with and without a
# frameshift variant; the recorded table is ordered by ascending p-value.
# NOTE(review): the recorded "With frameshift variant" counts (e.g. 220 for
# HP:0011446, 224 for HP:0006482) equal the cohort-wide totals printed by
# list_all_phenotypes() - confirm that the "With" group is really restricted
# to frameshift carriers.
analysis.compare_by_variant_type(variant_effects.FRAMESHIFT_VARIANT)

Unnamed: 0_level_0,With frameshift variant,With frameshift variant,Without frameshift variant,Without frameshift variant,Unnamed: 5_level_0
Unnamed: 0_level_1,Count,Percent,Count,Percent,p-value
HP:0011446 (Abnormality of higher mental function),220,88.00%,106,95.50%,0.032591
HP:0007018 (Attention deficit hyperactivity disorder),61,74.39%,26,66.67%,0.393836
HP:0000325 (Triangular face),83,64.84%,38,58.46%,0.432265
HP:0001249 (Intellectual disability),194,87.00%,94,89.52%,0.590097
HP:0001155 (Abnormality of the hand),189,69.49%,88,72.13%,0.634628
HP:0006482 (Abnormality of dental morphology),224,83.90%,99,81.82%,0.660253
HP:0012758 (Neurodevelopmental delay),176,95.65%,90,96.77%,0.755827
HP:0010938 (Abnormal external nose morphology),134,91.16%,63,92.65%,0.797387
HP:0000365 (Hearing impairment),97,78.23%,45,76.27%,0.849893
HP:0000534 (Abnormal eyebrow morphology),126,81.29%,56,80.00%,0.855482


In [16]:
# Compare phenotype frequencies between the groups with and without the given
# variant; the recorded table is ordered by ascending p-value.
# NOTE(review): list_all_variants() reports this variant 34 times, yet the
# recorded "With" count for HP:0011446 is 220 (the cohort-wide total) -
# confirm that the "With" group is really restricted to carriers.
analysis.compare_by_variant('16_89284634_GTGTTT/G')

Unnamed: 0_level_0,With 16_89284634_GTGTTT/G,With 16_89284634_GTGTTT/G,Without 16_89284634_GTGTTT/G,Without 16_89284634_GTGTTT/G,Unnamed: 5_level_0
Unnamed: 0_level_1,Count,Percent,Count,Percent,p-value
HP:0011446 (Abnormality of higher mental function),220,88.00%,201,90.13%,0.556255
HP:0010938 (Abnormal external nose morphology),134,91.16%,121,92.37%,0.828437
HP:0001155 (Abnormality of the hand),189,69.49%,167,68.44%,0.848908
HP:0007018 (Attention deficit hyperactivity disorder),61,74.39%,54,72.00%,0.85697
HP:0000534 (Abnormal eyebrow morphology),126,81.29%,116,82.27%,0.880816
HP:0000729 (Autistic behavior),56,58.33%,53,59.55%,0.882202
HP:0001249 (Intellectual disability),194,87.00%,179,87.75%,0.884489
HP:0004322 (Short stature),150,57.92%,136,58.37%,0.927343
HP:0000325 (Triangular face),83,64.84%,77,64.17%,1.0
HP:0000356 (Abnormality of the outer ear),77,79.38%,71,79.78%,1.0


In [17]:
# Second analysis on the same transcript, this time constructed with
# `include_large_SV=False`. The transcript is read from the `tx_id` constant of
# the configuration cell instead of a repeated literal.
# NOTE(review): presumably `include_large_SV=False` drops large structural
# variants from the comparison - confirm against the library documentation.
analysis2 = CohortAnalysis(patientCohort, tx_id, include_unmeasured=False, include_large_SV=False)

# Compare phenotype frequencies for variants inside versus outside exon 9;
# the recorded table is ordered by ascending p-value.
analysis2.compare_by_exon(9)

Unnamed: 0_level_0,Inside Exon 9,Inside Exon 9,Outside Exon 9,Outside Exon 9,Unnamed: 5_level_0
Unnamed: 0_level_1,Count,Percent,Count,Percent,p-value
HP:0004322 (Short stature),115,59.90%,8,36.36%,0.041417
HP:0001155 (Abnormality of the hand),155,72.09%,19,90.48%,0.073588
HP:0011446 (Abnormality of higher mental function),178,86.83%,23,100.00%,0.084948
HP:0007018 (Attention deficit hyperactivity disorder),51,77.27%,8,61.54%,0.296469
HP:0000365 (Hearing impairment),74,77.08%,9,69.23%,0.505159
HP:0000534 (Abnormal eyebrow morphology),104,81.89%,10,76.92%,0.708574
HP:0001249 (Intellectual disability),158,87.29%,21,91.30%,0.746167
HP:0000729 (Autistic behavior),41,58.57%,8,66.67%,0.753921
HP:0000325 (Triangular face),68,71.58%,11,73.33%,1.0
HP:0000356 (Abnormality of the outer ear),58,82.86%,9,90.00%,1.0


In [18]:
from genophenocorr.protein import FeatureType


In [19]:
# Compare phenotype frequencies for variants inside versus outside protein
# features of type REGION; the recorded table is ordered by ascending p-value.
# NOTE(review): the recorded "Inside REGION" counts (e.g. 194 for HP:0001249)
# equal the cohort-wide totals printed by list_all_phenotypes() - confirm that
# the "Inside" group is really filtered by feature location.
analysis.compare_by_protein_feature_type(FeatureType.REGION)

Unnamed: 0_level_0,Inside REGION,Inside REGION,Outside REGION,Outside REGION,Unnamed: 5_level_0
Unnamed: 0_level_1,Count,Percent,Count,Percent,p-value
HP:0001249 (Intellectual disability),194,87.00%,137,90.73%,0.322492
HP:0007018 (Attention deficit hyperactivity disorder),61,74.39%,38,69.09%,0.560963
HP:0001155 (Abnormality of the hand),189,69.49%,129,72.07%,0.598338
HP:0011446 (Abnormality of higher mental function),220,88.00%,151,89.88%,0.636595
HP:0000325 (Triangular face),83,64.84%,53,61.63%,0.665389
HP:0004322 (Short stature),150,57.92%,103,60.23%,0.688993
HP:0006482 (Abnormality of dental morphology),224,83.90%,152,85.39%,0.690877
HP:0000365 (Hearing impairment),97,78.23%,70,80.46%,0.733443
HP:0000343 (Long philtrum),121,79.61%,78,78.79%,0.874791
HP:0000356 (Abnormality of the outer ear),77,79.38%,54,79.41%,1.0
