<h1>landscape of STXBP1-related disorders </h1>
<p>Extract the clinical data from <a href="https://pubmed.ncbi.nlm.nih.gov/35190816/"target="__blank">Xian et al. (2022) Assessing the landscape of STXBP1-related disorders in 534 individuals. Brain.</a>.<p>

In [1]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
from csv import DictReader
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import re
from pyphetools.creation import *
# last tested with pyphetools version 0.2.23

In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199")
metadata.default_versions_with_hpo(version=hpo_version)

In [60]:
clinical_df = pd.read_table("input/brain-2021-00642-File011.tsv");
genotype_df = pd.read_table("input/brain-2021-00642-File011-genotype.tsv");

In [61]:
clinical_df.head()

Unnamed: 0,PatID,Source_Journal,Source_PMID*,Year,Sex,Phenotypic_group**,age_onset_m,age_offset_m,age_eval_y,Base_HPO***,HPO_term,Notes
0,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0003593,Infantile onset,
1,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0010818,Generalized tonic seizures,
2,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0002069,Generalized tonic-clonic seizures,
3,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0010851,EEG with burst suppression,
4,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0002521,Hypsarrhythmia,


In [62]:
patient_d = defaultdict(list)

In [63]:
with open("input/brain-2021-00642-File011.tsv") as f:
    reader = DictReader(f, delimiter="\t")
    for row in reader:
        prow = PatientRow(row=row)
        patient_d[prow.patID].append(prow)
print(f"We extracted data on {len(patient_d)} individuals")

We extracted data on 534 individuals


In [64]:
genotype_df.head()

Unnamed: 0,PatID,Chr,Start,End,Ref,Alt,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,...,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,bed,Unnamed: 134
0,STX_18469812_Subject_11,9.0,130422313.0,130422313.0,T,A,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130422313,T,A,.,PASS,.,Name=70.695764689,
1,STX_18469812_Subject_3,9.0,130444768.0,130444768.0,G,A,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130444768,G,A,.,PASS,.,Name=75.862552050,
2,STX_18469812_Subject_6,9.0,130425593.0,130425593.0,G,A,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130425593,G,A,.,PASS,.,Name=89.193399398,
3,STX_18469812_Subject_7,9.0,130439001.0,130439001.0,T,G,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130439001,T,G,.,PASS,.,Name=20.833326089,
4,STX_19557857_Patient_1,9.0,130416076.0,130416076.0,G,A,splicing,STXBP1,NM_001032221.3:exon3:c.169+1G>A:NM_003165.3:exon3:c.169+1G>A,.,...,.,9,130416076,G,A,.,PASS,.,.,


In [73]:
def extract_var_inf(aachange):
    """
    aachange: e.g., STXBP1:NM_001032221.3:exon6:c.T353G:p.L118R,STXBP1:NM_003165.3:exon6:c.T353G:p.L118R
    """
    fields = aachange.split(":")
    index = 0
    i = 0
    transcript = "?"
    for f in fields:
        if f == "NM_001032221.3" or f == "NM_003165.3":
            index = i
            transcript = f
            break
        i += 1
    if (i + 2) < len(fields):
        variant = fields[i+2]
    else:
        raise ValueError(f"Could not get variant because of fields: {fields} and i={i}")   
    return transcript, variant

def extract_splice_var(genedetail):
    """
    genedetail: e.g., NM_001032221.3:exon3:c.169+1G>A;NM_003165.3:exon3:c.169+1G>A
    """
    fields = genedetail.split(":")
    index = 0
    i = 0
    transcript = "?"
    for f in fields:
        if f == "NM_001032221.3" or f == "NM_003165.3":
            index = i
            transcript = f
            break
        i += 1
    if (i + 2) < len(fields):
        variant = fields[i+2]
    else:
        raise ValueError(f"Could not get variant because of fields: {fields} and i={i} from genedetail \"{genedetail}\"")   
    return transcript, variant


class GenotypeEntry:
    def __init__(self, row):
        self.patID = row["PatID"]
        self.chrom = row["Chr"]
        self.start = row["Start"]
        self.end = row["End"]
        self.ref = row["Ref"]
        self.alt = row["Alt"]
        self.transcript = "?"
        func = row["Func.refGeneWithVer"]
        self.category = func
        genenot = row["Gene.refGeneWithVer"]
        aachange = row["AAChange.refGeneWithVer"]
        if func == "exonic":
            transcript, variant = extract_var_inf(aachange)
            self.transcript = transcript
            regex_del = r"c.\d+_\d+del"
            regex_single_nt_del = r"(c.\d+del)[ACGT]"
            regex_dup = r"c.(\d+)dup([A-Z]+)"
            regex_sub = r"c.([A-Z]+)(\d+)([A-Z]+)"
            regex_ins = r"c.(\d+)_(\d+)ins([A-Z]+)"  # e.g., 1372_1373insGCCGGAGCAA
            regex_delins = r"(c.\d+_\d+delins[A-Z]+)"
            result = re.search(regex_sub, variant)
            result_dup = re.search(regex_dup, variant)
            result_single_nt_del = re.search(regex_single_nt_del, variant)
            result_ins = re.search(regex_ins, variant)
            result_delins = re.search(regex_delins, variant)
            if re.match(regex_del, variant):
                self.hgvs = variant
            elif result:
                ref = result.group(1)
                position = result.group(2)
                alt = result.group(3)
                hgvs = f"c.{position}{ref}>{alt}"
                self.hgvs = hgvs
            elif result_dup:
                position=result_dup.group(1)
                hgvs = f"c.{position}dup"
                self.hgvs = hgvs
            elif result_single_nt_del:
                self.hgvs = result_single_nt_del.group(1)
            elif result_delins:
                self.hgvs = result_delins.group(1)
            elif result_ins:
                pos1 = result_ins.group(1)
                pos2 = result_ins.group(2)
                seq = result_ins.group(3)
                self.hgvs = f"c.{pos1}_{pos2}ins{seq}"
            else:
                raise ValueError(f"Could not parse variant {variant}")
        elif func == 'splicing':
            geneDetail = row["GeneDetail.refGeneWithVer"]
            transcript, variant = extract_splice_var(geneDetail)
            self.transcript = transcript
            self.hgvs = variant
        elif func == "NA":
            pass
        elif func == "intronic":
            pass  
        else:
            print(f"{self.patID}---function {func}")
            raise ValueError(f"Could not parse variant  for func {func}\n{row}")
                    

In [74]:
na_genotype = 0
intronic_genotype = 0
genotype_d = defaultdict(GenotypeEntry)
with open("input/brain-2021-00642-File011-genotype.tsv") as f:
    reader = DictReader(f, delimiter="\t")
    for row in reader:
        ge = GenotypeEntry(row=row)
        if ge.category == "intronic":
            intronic_genotype = intronic_genotype + 1
        elif ge.category == "NA":
            na_genotype = na_genotype +1
        else:
            patient_id = ge.patID
            genotype_d[patient_id] = ge
print(f"We got {len(genotype_d)} usable genotypes")
print(f"We got {na_genotype} NAs, and {intronic_genotype} intronic genotypes - both were skipped")
        

We got 463 usable genotypes
We got 46 NAs, and 25 intronic genotypes - both were skipped


<h2>HpoTerm</h2>
<p>We create classes called HpoTerm and PatientRow to help process the data from the input file. Note that
the data for one patient is distributed across multiple rows of the input Excel file.</p>

In [75]:
class PatientRow:
    def __init__(self, row):
        self.patID = row["PatID"]
        self.sex = row["Sex"]
        self.phenogroup = row["Phenotypic_group**"]
        
def row_to_hpo(row):
    """Transform a row of the dataframe to an HPO term
    """
    try: 
        age_onset_m = int(row["age_onset_m"])
        onset = f"P{age_onset_m}M"
    except:
        onset = None
    try:
        age_offset_m = int(row["age_offset_m"])
        offset = f"P{age_offset_m}M"
    except:
        offset = None
    hpo_id = row["Base_HPO***"]
    hpo_label = row["HPO_term"]
    excluded = False
    # excluded terms are coded with NP:0001234 instead of HP:0001234
    if hpo_id.startswith("NP"):
        excluded = True
    hpo_id = "H" + hpo_id[1:]
    return HpTerm(hpo_id=hpo_id, label=hpo_label, onset=onset, resolution=offset) 

In [76]:
patient_d = defaultdict(list)
patient_demographic_d = defaultdict(PatientRow)
for _, row in clinical_df.iterrows():
    patID = row["PatID"]
    if patID not in genotype_d:
        continue
    if patID not in patient_demographic_d:
        patient_demographic_d[patID] = PatientRow(row=row)
    hpo = row_to_hpo(row=row)
    patient_d[patID].append(hpo)
print(f"We got {len(patient_d)} patients and {len(patient_demographic_d)} demographics")

We got 463 patients and 463 demographics


<H2>Putting it all together</h2>

In [77]:
disease_label = "Developmental and epileptic encephalopathy 4"
disease_id = "OMIM:612164"

In [78]:
#  def __init__(self, individual_id, hpo_terms, sex, age=Constants.NOT_PROVIDED, variant_list=None, disease_id=None, disease_label=None):
individual_list = []
for pat_id, patRow in patient_demographic_d.items():
    hpo_list = patient_d.get(pat_id)
    if hpo_list is None:
        print(f"Could not find hpo list for {pat_id}")
        continue
    if len(hpo_list) == 0:
        print(f"warning, empty HPO list for {pat_id}")
    sex = patRow.sex
    gtype = genotype_d.get(pat_id)
    if gtype is None:
        print(f"Could not find genotype for {pat_id} (should never happen)")
        continue
    #(self, assembly, vcf_d, symbol=None, hgnc=None, hgvs=None, transcript=None, g_hgvs=None) -> None:
    
    genome = 'hg38'
    transcript='NM_015133.4'
    varMapper = VariantColumnMapper(assembly=genome,
                                column_name='Transcript\nNM_015133.4\nc.', 
                                transcript=transcript, 
                                default_genotype='heterozygous')

    
    

In [79]:
validator = VariantValidator(genome_build='hg38')

In [80]:
validated_var_d = defaultdict()
c = 0
for patid, gtype in genotype_d.items():
    #print(f"{patid} - {gtype.hgvs}")
    if gtype.transcript == 'NM_001032221.3':
        transcript = 'NM_001032221.6'
    elif gtype.transcript == 'NM_003165.3':
        transcript = 'NM_003165.6'
    else:
        raise ValueError(f"Unexpected transcript: {gtype.transcript}")
    total_hgvs = f"{transcript}:{gtype.hgvs}"
    if total_hgvs in validated_var_d:
        pass
    else:
        print(total_hgvs)
        v = validator.encode_hgvs(hgvs=gtype.hgvs, custom_transcript=transcript)
        print(v)
        validated_var_d[total_hgvs] = v

NM_001032221.6:c.251T>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.251T>A/NM_001032221.6?content-type=application%2Fjson
chr9:127660034T>A
NM_001032221.6:c.1631G>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.1631G>A/NM_001032221.6?content-type=application%2Fjson
chr9:127682489G>A
NM_001032221.6:c.539G>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.539G>A/NM_001032221.6?content-type=application%2Fjson
chr9:127663314G>A
NM_001032221.6:c.1328T>G
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.1328T>G/NM_001032221.6?content-type=application%2Fjson
chr9:127676722T>G
NM_001032221.6:c.169+1G>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.169+1G>A/NM_001032221.6?content-type=application%2Fjson
chr9:127653797G>A
NM_001032221.6:c.1162C>T
https://rest.variantvalidat

chr9:127663218AC>A
NM_001032221.6:c.416C>T
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.416C>T/NM_001032221.6?content-type=application%2Fjson
chr9:127661192C>T
NM_001032221.6:c.901C>T
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.901C>T/NM_001032221.6?content-type=application%2Fjson
chr9:127668186C>T
NM_001032221.6:c.148dup
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.148dup/NM_001032221.6?content-type=application%2Fjson
chr9:127653774C>CA
NM_001032221.6:c.704G>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.704G>A/NM_001032221.6?content-type=application%2Fjson
chr9:127666206G>A
NM_001032221.6:c.437_438del
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032221.6%3Ac.437_438del/NM_001032221.6?content-type=application%2Fjson
chr9:127663211TCC>T
NM_001032221.6:c.778G>T
https:

ValueError: Expecting to get a gene_variant from Variant Validator but got warning