In [1]:
import typing
import os

import hpotk
from phenopackets import Phenopacket

from genophenocorr.patient import PhenopacketPatientCreator
from genophenocorr.phenotype import PhenotypeCreator
from genophenocorr.protein import UniprotProteinMetadataService, ProteinAnnotationCache, ProtCachingFunctionalAnnotator
from genophenocorr.variant import VarCachingFunctionalAnnotator, VariantAnnotationCache, VepFunctionalAnnotator
from genophenocorr.cohort import PhenopacketCohortCreator


In [2]:
# --- Notebook configuration: input paths and identifiers of interest ---
# Local HPO file passed to the ontology loader below.
fpath_hpo = 'hpo_data/hp.json'
# Directory handed to both the variant and the protein annotation caches.
# NOTE(review): this directory sits inside `fpath_phenopackets`; confirm the
# cohort creator ignores it when reading phenopackets from that folder.
cache_dir = 'testFiles/annotationsSub'
# Location passed to `create_cohort` as the phenopacket source.
fpath_phenopackets = 'testFiles'
# Transcript of interest (same accession is queried via `list_vartypes_by_tx`).
tx_id = 'NM_013275.6'
# Presumably the UniProt accession of the matching protein - TODO confirm;
# it is not referenced by the cells visible in this notebook.
protein_id = 'Q15327'

# `makedirs(..., exist_ok=True)` also creates missing parent directories and
# does not fail when the directory already exists, so the cell can be re-run
# safely and works on a fresh checkout where `testFiles/` is absent.
os.makedirs(cache_dir, exist_ok=True)

In [3]:
# Load the HPO from the local file configured in `fpath_hpo`.
hpo: hpotk.ontology.MinimalOntology = hpotk.ontology.load.obographs.load_minimal_ontology(fpath_hpo)

# Instantiate each validator against the loaded ontology, in the same order
# the runner will receive them.
validators = [
    validator_cls(hpo)
    for validator_cls in (
        hpotk.validate.AnnotationPropagationValidator,
        hpotk.validate.ObsoleteTermIdsValidator,
        hpotk.validate.PhenotypicAbnormalityValidator,
    )
]

# The phenotype creator gets the ontology plus a runner wrapping the validators.
validation_runner = hpotk.validate.ValidationRunner(validators)
phenotype_creator = PhenotypeCreator(hpo, validation_runner)

In [4]:
# Variant functional annotation: a cache rooted at `cache_dir` combined with
# the VEP-based annotator.
# NOTE(review): presumably the caching annotator consults the cache before
# calling VEP - confirm against the genophenocorr implementation.
vac = VariantAnnotationCache(cache_dir)
vep = VepFunctionalAnnotator()
vfa = VarCachingFunctionalAnnotator(vac, vep)

# Protein metadata: same arrangement, with a UniProt-backed service behind a
# cache. Both caches are given the same `cache_dir`.
pm = UniprotProteinMetadataService()
pac = ProteinAnnotationCache(cache_dir)
pfa = ProtCachingFunctionalAnnotator(pac, pm)

# Assemble the patient creator from the phenotype creator and the two
# caching annotators built above.
pc = PhenopacketPatientCreator(phenotype_creator, vfa, pfa)

In [5]:
# Cohort creator built on top of the patient creator `pc`.
cc = PhenopacketCohortCreator(pc)

In [6]:
# Build the cohort from the phenopackets found under `fpath_phenopackets`.
# NOTE(review): likely the slow step (annotation lookups) - confirm and
# consider timing it with `%%time`.
patientCohort = cc.create_cohort(fpath_phenopackets)

In [7]:
# Phenotype summary: the output is a list of (HPO term id, count) pairs.
patientCohort.list_all_phenotypes()

[('HP:0006482', 9),
 ('HP:0001155', 8),
 ('HP:0004322', 8),
 ('HP:0001249', 7),
 ('HP:0011446', 7),
 ('HP:0000325', 7),
 ('HP:0000534', 7),
 ('HP:0000343', 7),
 ('HP:0010938', 7),
 ('HP:0000356', 7),
 ('HP:0000365', 5),
 ('HP:0012758', 3),
 ('HP:0000729', 2),
 ('HP:0007018', 1)]

In [8]:
# Protein summary as (protein id, count) pairs.
# NOTE(review): the argument 5 presumably caps the result at the five most
# frequent entries (the output has five rows) - confirm against the API.
patientCohort.list_all_proteins(5)

[('NP_037407.4', 18),
 ('NP_001243111.1', 9),
 ('NP_001243112.1', 9),
 ('NP_003110.1', 6),
 ('NP_000503.1', 5)]

In [9]:
# Variant summary: the output is a list of (variant key, count) pairs.
patientCohort.list_all_variants()

[('16_89284524_duplication', 3),
 ('16_89283314_CCTTT/C', 1),
 ('16_88788350_deletion', 1),
 ('16_89476288_duplication', 1),
 ('16_89284129_CTTTTT/C', 1),
 ('16_89282012_CGG/C', 1),
 ('16_89281397_G/C', 1)]

In [10]:
# Identifiers of all patients in the cohort.
patientCohort.list_all_patients()

['Cucco, 2020 (Patient B)',
 'Crippa2015_P1',
 'Crippa2015_P3',
 'Behnert, 2018',
 'Bucerzan2020',
 'Crippa2015_P2',
 'Bianchi, 2018',
 'DeBernardi2018',
 'Alves, 2019']

In [12]:
# Variant-effect counts for the transcript of interest. Reuse the `tx_id`
# constant from the configuration cell instead of repeating the accession
# literal, so the transcript is changed in exactly one place.
patientCohort.list_vartypes_by_tx(tx_id)

{'NM_013275.6': Counter({'frameshift_variant': 3,
          'stop_gained': 1,
          'coding_sequence_variant': 1,
          '5_prime_UTR_variant': 2,
          'intron_variant': 2,
          'feature_elongation': 1,
          'transcript_ablation': 1})}

In [13]:
# Called without a transcript id: the output is keyed by transcript accession,
# with one Counter of variant-effect types per transcript.
patientCohort.list_vartypes_by_tx()

{'NM_001256182.2': Counter({'frameshift_variant': 3,
          'stop_gained': 1,
          'coding_sequence_variant': 1,
          '5_prime_UTR_variant': 2,
          'intron_variant': 2,
          'feature_elongation': 1,
          'transcript_ablation': 1}),
 'NM_013275.6': Counter({'frameshift_variant': 3,
          'stop_gained': 1,
          'coding_sequence_variant': 1,
          '5_prime_UTR_variant': 2,
          'intron_variant': 2,
          'feature_elongation': 1,
          'transcript_ablation': 1}),
 'NM_003119.4': Counter({'coding_sequence_variant': 1,
          '5_prime_UTR_variant': 1,
          'intron_variant': 1,
          'feature_truncation': 1,
          'transcript_amplification': 1}),
 'NM_001363850.1': Counter({'coding_sequence_variant': 1,
          '5_prime_UTR_variant': 1,
          'intron_variant': 1,
          'feature_truncation': 1,
          'transcript_amplification': 1}),
 'NM_153636.3': Counter({'coding_sequence_variant': 1,
          '5_prime_UTR_