In [1]:
import phenopackets
import os
from os.path import isfile
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict

In [2]:
from phenopackets import Phenopacket
import varcode as vc
#import myvariant as mv
import numpy as np
import pyensembl
import glob
import pandas as pd
from collections import defaultdict
import scipy.stats as stats

In [3]:
retinoblastoma = '../phenopackets/retinoblastoma.json'
nemalineMyopathy = '../phenopackets/nemalineMyopathy.json'
if not isfile(retinoblastoma):
    raise FileNotFoundError("Could not find phenopacket")

In [4]:
import json

with open(nemalineMyopathy) as f:
    data = f.read()
jsondata = json.loads(data)

NMphenopacket = Parse(json.dumps(jsondata), Phenopacket())

In [5]:
with open(retinoblastoma) as f:
    data = f.read()
jsondata = json.loads(data)

RBphenopacket = Parse(json.dumps(jsondata), Phenopacket())

In [6]:
NMphenopacket.id

'arbitrary.id'

In [7]:
RBphenopacket

id: "arbitrary.id"
subject {
  id: "proband A"
  time_at_last_encounter {
    age {
      iso8601duration: "P6M"
    }
  }
  sex: FEMALE
  karyotypic_sex: XX
}
phenotypic_features {
  type {
    id: "HP:0030084"
    label: "Clinodactyly"
  }
  modifiers {
    id: "HP:0012834"
    label: "Right"
  }
  onset {
    age {
      iso8601duration: "P3M"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000555"
    label: "Leukocoria"
  }
  modifiers {
    id: "HP:0012835"
    label: "Left"
  }
  onset {
    age {
      iso8601duration: "P4M"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000486"
    label: "Strabismus"
  }
  modifiers {
    id: "HP:0012835"
    label: "Left"
  }
  onset {
    age {
      iso8601duration: "P5M15D"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000541"
    label: "Retinal detachment"
  }
  modifiers {
    id: "HP:0012835"
    label: "Left"
  }
  onset {
    age {
      iso8601duration: "P6M"
    }
  }
}
measurements {
  assay {
    id: "

In [8]:
class Genotype:
    def __init__(self, genInterp):
        contig = genInterp.variant_interpretation.variation_descriptor.vcf_record.chrom
        start = genInterp.variant_interpretation.variation_descriptor.vcf_record.pos
        ref = genInterp.variant_interpretation.variation_descriptor.vcf_record.ref
        alt = genInterp.variant_interpretation.variation_descriptor.vcf_record.alt
        self._myVar = vc.Variant(str(contig), start, ref, alt, ensembl = pyensembl.ensembl_grch37)
        self._variant = self._myVar.short_description
        self._topVar = self._myVar.effects().top_priority_effect()
        
    @property
    def variant(self):
        return self._variant
    
    @property
    def top_var_effect(self):
        return self._topVar.short_description
    
    @property
    def full_var(self):
        return self._myVar
    
    @property
    def is_missense(self):
        if self._topVar.short_description.endswith("*") or not self._myVar.is_snv:
            return False
        else:
            return True
    
    @property
    def is_nonsense(self):
        if self._topVar.short_description.endswith("*") and self._myVar.is_snv:
            return True
        else:
            return False
        
    @property 
    def is_deletion(self):
        return self._myVar.is_deletion
    
    @property
    def is_insertion(self):
        return self._myVar.is_insertion
    
    @property
    def is_transition(self):
        return self._myVar.is_transition
    
    @property
    def is_transversion(self):
        return self._myVar.is_transversion
    
    @property
    def is_indel(self):
        return self._myVar.is_indel
    
    @property
    def is_duplication(self):
        if 'dup' in self._myVar.short_description:
            return True
        else:
            return False
    
    
        


In [9]:
class Patient:
    def __init__(self, phenopackJson):
        if not isfile(phenopackJson):
            raise FileNotFoundError("Could not find phenopacket")
            
        with open(phenopackJson) as f:
            data = f.read()
        jsondata = json.loads(data)
        phenopack = Parse(json.dumps(jsondata), Phenopacket())
        
        self._id = phenopack.id
        self._phenopack = phenopack
        self._phenotype = self.__get_hpids()
        if len(phenopack.diseases) != 0:
            self._diseases = Disease(phenopack.diseases[0])
        else:
            self._diseases = None
        if len(phenopack.interpretations) != 0:
            self._genotype = self.__get_variants()
        else:
            #print('No interpretations found')
            self._genotype = None
                
        
    def __get_hpids(self):
        hp_ids = []
        for x in self._phenopack.phenotypic_features:
             if not x.excluded:
                hp_ids.append(Phenotype(x))
        return hp_ids
    
    def __get_variants(self):
        interp = self._phenopack.interpretations[0]
        genotypes = []
        for geno in interp.diagnosis.genomic_interpretations:
            genotypes.append(Genotype(geno))
        return genotypes
            
    
    @property
    def disease_id(self):
        if self._diseases is not None:
            return self._diseases.id
        else:
            return None
    
    @property
    def disease_label(self):
        if self._diseases is not None:
            return self._diseases.label
        else:
            return None
    
    @property
    def phenopacket(self):
        return self._phenopack
    
    @property
    def phenotype_ids(self):
        if self._phenotype is not None:
            return [phenotype.id for phenotype in self._phenotype]
        else:
            return None
    
    @property
    def phenotype_labels(self):
        if self._phenotype is not None:
            return [phenotype.label for phenotype in self._phenotype]
        else:
            return None
    
    @property
    def all_phenotypes(self):
        return self._phenotype
    
    @property
    def variant(self):
        if self._genotype is not None:
            return [g.variant for g in self._genotype]
        else:
            return None
    
    @property
    def var_effect(self):
        if self._genotype is not None:
            return [g.top_var_effect for g in self._genotype]
        else:
            return None
    
    @property
    def var_is_missense(self):
        if self._genotype is not None:
            return [g.is_missense for g in self._genotype]
        else:
            return None
    
    @property
    def var_is_nonsense(self):
        if self._genotype is not None:
            return [g.is_nonsense for g in self._genotype]
        else:
            return None
        
    @property
    def var_is_deletion(self):
        if self._genotype is not None:
            return [g.is_deletion for g in self._genotype]
        elif self._diseases is not None:
            return [self._diseases.is_deletion]
        else:
            return None
        
    @property
    def var_is_duplication(self):
        if self._genotype is not None:
            return [g.is_duplication for g in self._genotype]
        elif self._diseases is not None:
            return [self._diseases.is_duplication]
        else:
            return None
        
    
    def describe(self):
        stats = {
            "ID": self._phenopack.id,
            "Disease": self.disease_label,
            "Phenotypic Features": self.phenotype_labels,
            "Variant": self.variant,
            "Primary Effect of Variant": self.var_effect
        }
        return stats

In [10]:
class Disease:
    def __init__(self, phenopacket):
        self._diseaseID = phenopacket.term.id
        self._diseaseLabel = phenopacket.term.label
        
    @property
    def id(self):
        return self._diseaseID
    
    @property
    def label(self):
        return self._diseaseLabel
    
    @property
    def is_duplication(self):
        if 'duplication' in self._diseaseLabel:
            return True
        else:
            return False
    
    @property
    def is_deletion(self):
        if 'deletion' in self._diseaseLabel:
            return True
        else:
            return False
        

In [11]:
class Phenotype:
    def __init__(self, phenopacket):
        
        self._hpid = phenopacket.type.id
        self._label = phenopacket.type.label
        
    @property
    def id(self):
        return self._hpid
    
    @property
    def label(self):
        return self._label

In [12]:
TestPat = Patient('../phenopackets/cohort/Dauber-2014-SLC35C1-Proband_1.json')
eff = TestPat._genotype
eff = eff.pop()
stringvar = eff.full_var.short_description
print(stringvar)
parts = stringvar.split(' ')
chro = parts[0].replace('chr', '')


INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rekerl/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rekerl/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/rekerl/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


chr11 g.45827853_45827855delCTT


In [13]:
allPatients = defaultdict(Patient)
allDiseases = []
allDiseaseNames = set()

for file in glob.glob('../phenopackets/cohort/*.json'):
    fileName = os.path.basename(file)
    current = Patient(file)
    print(current.variant)
    
    
    #if type(current.variant) != "NoneType":
    #if current.variant is not None and len(current.variant) != 0:
    allPatients[fileName] = current

#for p in allPatients: 
    #print(allPatients[p]._diseases)
    #allDiseaseNames.add(allPatients[p]._diseases.label)


['chr1 g.247587794C>T']
['chr19 g.13007113G>A']
['chr17 g.40485715C>T']
['chr20 g.44522679T>A', 'chr20 g.44523343_44523344delTA']
['chr21 g.38862599C>T']
['chr1 g.1737951C>A']
['chr1 g.120458723G>A']
['chr19 g.7623928C>A']
['chr8 g.28574958C>T']
['chr17 g.26684713C>A']
['chr18 g.10797515G>A']
['chr2 g.189951460C>T']
['chr1 g.1167851A>G', 'chr1 g.1167659A>G']
['chr20 g.3214218C>G']
['chr16 g.67645347_67645350delAAAG']
['chr6 g.86224294_86224297delTCTC']
['chr6 g.158613022A>T', 'chr6 g.158591564T>A']
['chr22 g.20049061G>A']
['chr15 g.48776128C>T']
['chr19 g.41848135G>A']
['chr19 g.3586872G>A']
['chr4 g.107115874C>T']
['chr4 g.107169430_107169431insA']
['chr4 g.80905083_80905084delCA']
['chr7 g.44151537T>C']
['chr4 g.107092429T>C', 'chr4 g.107168421_107168424delTTCA']
['chr11 g.130029877G>A']
['chr19 g.7622129G>T', 'chr19 g.7623842G>C']
['chr5 g.151266323T>A']
['chr8 g.61728948C>T']
['chr14 g.29236938_29236939insG']
['chr1 g.8045005C>A']
['chr17 g.65340898T>C']
['chr11 g.88068220A>C']
['c

['chr1 g.224592155C>A']
['chr7 g.140501302T>C']
['chr18 g.67834203T>C']
['chr16 g.57693480T>C']
['chr7 g.92760808T>C']
['chr14 g.24551909_24551911delGAA']
['chr5 g.64082455G>A']
['chr15 g.62305256_62305257insTCTG', 'chr15 g.62174851C>A']
['chr1 g.227075798C>A']
['chr1 g.218609311G>A']
['chr7 g.92761698C>G']
['chr4 g.107156512T>A']
['chr1 g.8418291T>C']
['chr10 g.50680431ATCTT>TGCACACCA']
['chr14 g.29237342_29237343insC']
['chr9 g.94499750C>T']
['chr11 g.71148914C>T', 'chr11 g.71146524T>C']
['chr2 g.170343646T>C']
['chr2 g.44573407_44573407delT']
['chr22 g.18075487G>A']
['chr20 g.43251680C>T']
['chr12 g.48381473C>T']
['chr9 g.139265796C>A']
['chr5 g.127668649A>C']
['chr16 g.56530925G>A', 'chr16 g.56519631A>T']
['chr15 g.48808561T>A']
['chr5 g.483385C>T', 'chr5 g.477462_477462delG']
['chr19 g.7184444C>T']
['chr22 g.20049207G>C']
['chrX g.69255235_69255235delG']
['chr7 g.92760637A>G']
['chr2 g.48050286T>C']
['chr4 g.107156505_107156505delT']
['chr3 g.38639391C>T']
['chr11 g.118373871G>T']

In [14]:
print(f"Number of patients {len(allPatients)}")

Number of patients 384


In [18]:
test = allPatients.get
test.variant.variant

TypeError: pop expected at least 1 argument, got 0

In [None]:
import regex as re
import requests

from requests.adapters import HTTPAdapter, Retry

url = 'https://rest.uniprot.org/uniprotkb/A1A519.json'
all_files = requests.get(url).json()

all_files


In [None]:
all_files.get('features')

In [None]:
import scipy


def has_hpo(pat, hpo):
    for h in pat.phenotype_ids:
        if h == hpo:
            return True
    return False

for d in allDiseaseNames:
    #print(f"d: {d}")
    patients_with_d = [p for p in allPatients.values() if p.disease_id == d.id]
    n_dis = len(patients_with_d)
    if n_dis < 15: continue
    print(f"Patients with disease {d.label}: n={n_dis}")
    # fisher exact test is what we actually need
    all_hpo = set()
    total_pat = len(patients_with_d)
    for pat in patients_with_d:
        for hpo in pat.phenotype_ids:
            all_hpo.add(hpo)
    print(f"Total hpo terms: {len(all_hpo)}")
    #for hpo_id in all_hpo:
        #miss_with_hpo = len([ pat for pat in patients_with_d if has_hpo(pat, hpo_id) and pat.var_is_missense[0]])
        #miss_without_hpo = len([ pat for pat in  patients_with_d if not has_hpo(pat, hpo_id) and pat.var_is_missense[0]])
        #nons_with_hpo = len([ pat  for pat in patients_with_d if has_hpo(pat, hpo_id) and pat.var_is_nonsense[0]])
        #nons_without_hpo = len([ pat for pat in patients_with_d if not has_hpo(pat, hpo_id) and pat.var_is_nonsense[0]])
        #print(f"HPO {hpo_id}: miss_with_hpo {miss_with_hpo} miss_without_hpo {miss_without_hpo} nons_with_hpo {nons_with_hpo} nons_without_hpo {nons_without_hpo}")
        #table = np.array([[miss_with_hpo, miss_without_hpo], [nons_with_hpo, nons_without_hpo]])
        #oddsr, p =  scipy.stats.fisher_exact(table, alternative='two-sided') ##Add option for chi2
        #print(f"p: {p}")
        

In [None]:
class Counts:
    def __init__(self, disease):
        self._disease = disease.term.id
        self._diseaseName = disease.term.label
        self._totals = pd.DataFrame({'key':["Missense","Nonsense"]}).set_index('key')
    
    
    def add_patient(self, patient):
        if not isinstance(patient, Patient):
            raise NotAPatientError("ERROR: Input must be of class Patient")
        hasDisease = False
        for d in patient.describe()["Disease"]:
            if d.term.id.upper() == self._disease:
                hasDisease = True
        if not hasDisease:
            #print("Patient " + patient._id +" does not have disease " + self._disease)
            return 
        
        miss = sum(patient.is_missense())
        nons = sum(patient.is_nonsense())
        
        if miss == 0 and nons == 0:
            #print("Patient " + patient._id + " has no missense or nonsense mutations")
            return 
        for p in patient.get_phenotypes:
            if p.id not in self._totals.columns:
                df1 = pd.DataFrame({'key':["Missense", "Nonsense"],
                    p.id: [miss, nons]})
                df1 = df1.set_index('key')
                self._totals = self._totals.join(df1, how='outer')
            else:
                self._totals.at["Missense", p.id] += miss
                self._totals.at["Nonsense", p.id] += nons
        return self._totals
    
    def run_chi2(self):
        """
        self._totals is a 2x2 array with [0,0] = category 1, has HPO, 
        [0,1] category 1 does not have hpo
        
        """
        if len(self._totals.columns) != 0:
            if self._totals.sum(axis=1).at['Nonsense'] !=0 and self._totals.sum(axis=1).at['Missense'] != 0:
                chi2, pval, dof, exp = stats.chi2_contingency(self._totals)
            else:
                #print("There are either no Nonsense or no Missense Mutations")
                return
        else:
            #print("Add patients to run a Chi2 Test")
            return
        print("The p-value is: " + str(pval) )
        if pval <= 0.05:
            print("We can conclude the Missense vs Nonsense mutations are NOT independent of the Phenotypes for disease " + self._disease)
            return 'False'
        else:
            print("We can conclude the Missense vs Nonsense mutations are independent of the Phenotypes for disease " + self._disease)
            return 'True'

