In [1]:
import phenopackets
import os
from os.path import isfile
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict

In [2]:
from phenopackets import Phenopacket
import varcode as vc
#import myvariant as mv
import numpy as np
import pyensembl
import glob
import pandas as pd
from collections import defaultdict
import scipy.stats as stats

In [3]:
retinoblastoma = '../phenopackets/retinoblastoma.json'
nemalineMyopathy = '../phenopackets/nemalineMyopathy.json'
if not isfile(retinoblastoma):
    raise FileNotFoundError("Could not find phenopacket")

In [4]:
import json

with open(nemalineMyopathy) as f:
    data = f.read()
jsondata = json.loads(data)

NMphenopacket = Parse(json.dumps(jsondata), Phenopacket())

In [5]:
with open(retinoblastoma) as f:
    data = f.read()
jsondata = json.loads(data)

RBphenopacket = Parse(json.dumps(jsondata), Phenopacket())

In [6]:
NMphenopacket.id

'arbitrary.id'

In [7]:
RBphenopacket

id: "arbitrary.id"
subject {
  id: "proband A"
  time_at_last_encounter {
    age {
      iso8601duration: "P6M"
    }
  }
  sex: FEMALE
  karyotypic_sex: XX
}
phenotypic_features {
  type {
    id: "HP:0030084"
    label: "Clinodactyly"
  }
  modifiers {
    id: "HP:0012834"
    label: "Right"
  }
  onset {
    age {
      iso8601duration: "P3M"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000555"
    label: "Leukocoria"
  }
  modifiers {
    id: "HP:0012835"
    label: "Left"
  }
  onset {
    age {
      iso8601duration: "P4M"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000486"
    label: "Strabismus"
  }
  modifiers {
    id: "HP:0012835"
    label: "Left"
  }
  onset {
    age {
      iso8601duration: "P5M15D"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000541"
    label: "Retinal detachment"
  }
  modifiers {
    id: "HP:0012835"
    label: "Left"
  }
  onset {
    age {
      iso8601duration: "P6M"
    }
  }
}
measurements {
  assay {
    id: "

In [8]:
class Patient:
    def __init__(self, phenopackJson):
        if not isfile(phenopackJson):
            raise FileNotFoundError("Could not find phenopacket")
            
        with open(phenopackJson) as f:
            data = f.read()
        jsondata = json.loads(data)
        phenopack = Parse(json.dumps(jsondata), Phenopacket())
        
        self._id = phenopack.id
        self._phenopack = phenopack
        self._phenotype = self.__get_hpids()
        self._diseases = Disease(phenopack.diseases[0])
        if len(phenopack.interpretations) != 0:
            self._genotype = self.__get_variants()
        else:
            #print('No interpretations found')
            self._genotype = None
                
        
    def __get_hpids(self):
        hp_ids = []
        for x in self._phenopack.phenotypic_features:
             if not x.excluded:
                hp_ids.append(Phenotype(x))
        return hp_ids
    
    def __get_variants(self):
        interp = self._phenopack.interpretations[0]
        genotypes = []
        for geno in interp.diagnosis.genomic_interpretations:
            genotypes.append(Genotype(geno))
        return genotypes
            
    
    @property
    def disease_id(self):
        return self._diseases.id
    
    @property
    def disease_label(self):
        return self._diseases.label
    
    @property
    def phenopacket(self):
        return self._phenopack
    
    @property
    def phenotype_ids(self):
        allPhenotypes = [phenotype.id for phenotype in self._phenotype]
        return allPhenotypes
    
    @property
    def phenotype_labels(self):
        allPhenotypes = [phenotype.label for phenotype in self._phenotype]
        return allPhenotypes
    
    @property
    def all_phenotypes(self):
        return self._phenotype
    
    @property
    def variant(self):
        return [g.variant for g in self._genotype]
    
    @property
    def var_effect(self):
        return [g.top_var_effect for g in self._genotype]
    
    @property
    def var_is_missense(self):
        return [g.is_missense for g in self._genotype]
    
    @property
    def var_is_nonsense(self):
        return [g.is_nonsense for g in self._genotype]
    
    def describe(self):
        stats = {
            "ID": self._phenopack.id,
            "Disease": self.disease_label,
            "Phenotypic Features": self.phenotype_labels,
            "Variant": self.variant,
            "Primary Effect of Variant": self.var_effect,
            "Is Missense Mutation?": self.var_is_missense,
            "Is Nonsense Mutation?": self.var_is_nonsense
        }
        return stats

In [9]:
class Genotype:
    def __init__(self, genInterp):
        contig = genInterp.variant_interpretation.variation_descriptor.vcf_record.chrom
        start = genInterp.variant_interpretation.variation_descriptor.vcf_record.pos
        ref = genInterp.variant_interpretation.variation_descriptor.vcf_record.ref
        alt = genInterp.variant_interpretation.variation_descriptor.vcf_record.alt
        self._myVar = vc.Variant(str(contig), start, ref, alt, ensembl = pyensembl.ensembl_grch37)
        self._variant = self._myVar.short_description
        self._topVar = self._myVar.effects().top_priority_effect()
        
    @property
    def variant(self):
        return self._variant
    
    @property
    def top_var_effect(self):
        return self._topVar.short_description
    
    @property
    def is_missense(self):
        if self._topVar.short_description.endswith("*") or not self._myVar.is_snv:
            return False
        else:
            return True
    
    @property
    def is_nonsense(self):
        if self._topVar.short_description.endswith("*") and self._myVar.is_snv:
            return True
        else:
            return False
        


In [10]:
class Disease:
    def __init__(self, phenopacket):
        
        self._diseaseID = phenopacket.term.id
        self._diseaseLabel = phenopacket.term.label
        
    @property
    def id(self):
        return self._diseaseID
    
    @property
    def label(self):
        return self._diseaseLabel

In [11]:
class Phenotype:
    def __init__(self, phenopacket):
        
        self._hpid = phenopacket.type.id
        self._label = phenopacket.type.label
        
    @property
    def id(self):
        return self._hpid
    
    @property
    def label(self):
        return self._label

In [12]:
TestPat = Patient('../phenopackets/cohort/Bee-2015-BBS2-II_2.json')
TestPat.describe()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rekerl/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rekerl/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rekerl/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


{'ID': 'PMID:26078953-Bee-2015-BBS2-II:2',
 'Disease': 'BARDET-BIEDL SYNDROME 2; BBS2',
 'Phenotypic Features': ['Renal insufficiency',
  'Rod-cone dystrophy',
  'Postaxial polydactyly',
  'Proteinuria',
  'Obesity',
  'Specific learning disability',
  'Postaxial hand polydactyly',
  'Dental crowding',
  'Diabetes mellitus'],
 'Variant': ['chr16 g.56530925G>A', 'chr16 g.56519631A>T'],
 'Primary Effect of Variant': ['p.R622*', 'p.Y644N'],
 'Is Missense Mutation?': [False, True],
 'Is Nonsense Mutation?': [True, False]}

In [16]:
allPatients = defaultdict(Patient)
allDiseases = []
allDiseaseNames = set()

for file in glob.glob('../phenopackets/cohort/*.json'):
    fileName = os.path.basename(file)
    current = Patient(file)
    
    if current.variant is not None and len(current.variant) != 0:
        allPatients[fileName] = current

for p in allPatients: 
    print(allPatients[p].describe())
    allDiseaseNames.add(allPatients[p]._diseases)


{'ID': 'PMID:27435956-Naz_Villalba-2016-NLRP3-proband', 'Disease': 'MUCKLE-WELLS SYNDROME; MWS', 'Phenotypic Features': ['Elevated C-reactive protein level', 'Increased intracranial pressure', 'Neutrophilia', 'Elevated erythrocyte sedimentation rate', 'Headache', 'Papilledema', 'Urticaria'], 'Variant': ['chr1 g.247587794C>T'], 'Primary Effect of Variant': ['p.T350M'], 'Is Missense Mutation?': [True], 'Is Nonsense Mutation?': [False]}
{'ID': 'PMID:27672653-Abdul_Wahab-2016-GCDH-Patient_5', 'Disease': 'GLUTARIC ACIDEMIA I', 'Phenotypic Features': ['Macrocephaly', 'Cerebral atrophy', 'Subdural hemorrhage', 'Glutaric aciduria', 'Dystonia'], 'Variant': ['chr19 g.13007113G>A'], 'Primary Effect of Variant': ['p.G244S'], 'Is Missense Mutation?': [True], 'Is Nonsense Mutation?': [False]}
{'ID': 'PMID:20149460-Papanastasiou-2010-STAT3-12_year_old_girl', 'Disease': 'HYPER-IgE RECURRENT INFECTION SYNDROME, AUTOSOMAL DOMINANT', 'Phenotypic Features': ['Eczematoid dermatitis', 'Cutaneous abscess', '

In [14]:
print(f"Number of patients {len(allPatients)}")

Number of patients 384


In [20]:
import scipy


def has_hpo(pat, hpo):
    for h in pat.phenotype_ids:
        if h == hpo:
            return True
    return False

for d in allDiseaseNames:
    #print(f"d: {d}")
    patients_with_d = [p for p in allPatients.values() if p.disease_id == d.id]
    n_dis = len(patients_with_d)
    if n_dis < 15: continue
    print(f"Patients with disease {d.label}: n={n_dis}")
    # fisher exact test is what we actually need
    all_hpo = set()
    total_pat = len(patients_with_d)
    for pat in patients_with_d:
        for hpo in pat.phenotype_ids:
            all_hpo.add(hpo)
    print(f"Total hpo terms: {len(all_hpo)}")
    for hpo_id in all_hpo:
        miss_with_hpo = len([ pat for pat in patients_with_d if has_hpo(pat, hpo_id) and pat.var_is_missense[0]])
        miss_without_hpo = len([ pat for pat in  patients_with_d if not has_hpo(pat, hpo_id) and pat.var_is_missense[0]])
        nons_with_hpo = len([ pat  for pat in patients_with_d if has_hpo(pat, hpo_id) and pat.var_is_nonsense[0]])
        nons_without_hpo = len([ pat for pat in patients_with_d if not has_hpo(pat, hpo_id) and pat.var_is_nonsense[0]])
        print(f"HPO {hpo_id}: miss_with_hpo {miss_with_hpo} miss_without_hpo {miss_without_hpo} nons_with_hpo {nons_with_hpo} nons_without_hpo {nons_without_hpo}")
        table = np.array([[miss_with_hpo, miss_without_hpo], [nons_with_hpo, nons_without_hpo]])
        oddsr, p =  scipy.stats.fisher_exact(table, alternative='two-sided')
        print(f"p: {p}")
        

Patients with disease Hypotonia, infantile, with psychomotor retardation and characteristic facies 3: n=19
Total hpo terms: 130
HPO HP:0001297: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 1 nons_without_hpo 6
p: 1.0
HPO HP:0001252: miss_with_hpo 3 miss_without_hpo 3 nons_with_hpo 0 nons_without_hpo 7
p: 0.06993006993006992
HPO HP:0000341: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 4 nons_without_hpo 3
p: 0.26573426573426573
HPO HP:0002119: miss_with_hpo 2 miss_without_hpo 4 nons_with_hpo 3 nons_without_hpo 4
p: 1.0
HPO HP:0002750: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 0 nons_without_hpo 7
p: 1.0
HPO HP:0000954: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 0 nons_without_hpo 7
p: 1.0
HPO HP:0001999: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 0 nons_without_hpo 7
p: 1.0
HPO HP:0000717: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 0 nons_without_hpo 7
p: 0.46153846153846156
HPO HP:0012697: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 4 nons_without_

p: 1.0
HPO HP:0000252: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 1 nons_without_hpo 6
p: 1.0
HPO HP:0007359: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 2 nons_without_hpo 5
p: 0.4615384615384615
HPO HP:0000280: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 4 nons_without_hpo 3
p: 0.26573426573426573
HPO HP:0100277: miss_with_hpo 0 miss_without_hpo 6 nons_with_hpo 1 nons_without_hpo 6
p: 1.0
HPO HP:0001837: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 0 nons_without_hpo 7
p: 0.46153846153846156
HPO HP:0000426: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 4 nons_without_hpo 3
p: 0.26573426573426573
HPO HP:0000463: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 3 nons_without_hpo 4
p: 0.5594405594405595
HPO HP:0011344: miss_with_hpo 3 miss_without_hpo 3 nons_with_hpo 3 nons_without_hpo 4
p: 1.0
HPO HP:0007302: miss_with_hpo 1 miss_without_hpo 5 nons_with_hpo 0 nons_without_hpo 7
p: 0.46153846153846156
HPO HP:0000486: miss_with_hpo 0 miss_without_hpo 6 nons_with_h

In [None]:
class Counts:
    def __init__(self, disease):
        self._disease = disease.term.id
        self._diseaseName = disease.term.label
        self._totals = pd.DataFrame({'key':["Missense","Nonsense"]}).set_index('key')
    
    
    def add_patient(self, patient):
        if not isinstance(patient, Patient):
            raise NotAPatientError("ERROR: Input must be of class Patient")
        hasDisease = False
        for d in patient.describe()["Disease"]:
            if d.term.id.upper() == self._disease:
                hasDisease = True
        if not hasDisease:
            #print("Patient " + patient._id +" does not have disease " + self._disease)
            return 
        
        miss = sum(patient.is_missense())
        nons = sum(patient.is_nonsense())
        
        if miss == 0 and nons == 0:
            #print("Patient " + patient._id + " has no missense or nonsense mutations")
            return 
        for p in patient.get_phenotypes:
            if p.id not in self._totals.columns:
                df1 = pd.DataFrame({'key':["Missense", "Nonsense"],
                    p.id: [miss, nons]})
                df1 = df1.set_index('key')
                self._totals = self._totals.join(df1, how='outer')
            else:
                self._totals.at["Missense", p.id] += miss
                self._totals.at["Nonsense", p.id] += nons
        return self._totals
    
    def run_chi2(self):
        """
        self._totals is a 2x2 array with [0,0] = category 1, has HPO, 
        [0,1] category 1 does not have hpo
        
        """
        if len(self._totals.columns) != 0:
            if self._totals.sum(axis=1).at['Nonsense'] !=0 and self._totals.sum(axis=1).at['Missense'] != 0:
                chi2, pval, dof, exp = stats.chi2_contingency(self._totals)
            else:
                #print("There are either no Nonsense or no Missense Mutations")
                return
        else:
            #print("Add patients to run a Chi2 Test")
            return
        print("The p-value is: " + str(pval) )
        if pval <= 0.05:
            print("We can conclude the Missense vs Nonsense mutations are NOT independent of the Phenotypes for disease " + self._disease)
            return 'False'
        else:
            print("We can conclude the Missense vs Nonsense mutations are independent of the Phenotypes for disease " + self._disease)
            return 'True'

