<h1>KBG Syndrome</h1>
<p>Data from <a href="https://pubmed.ncbi.nlm.nih.gov/36446582/" target="__blank">Martinez-Cayuelas E, et al. Clinical description, molecular delineation and genotype-phenotype correlation in 340 patients with KBG syndrome: addition of 67 new patients. J Med Genet. 2022 Nov 29:jmedgenet-2022-108632. PMID: 36446582.</a>.</p>

In [21]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import numpy as np
from google.protobuf.json_format import MessageToDict, MessageToJson
import os
import sys
import re
from pyliftover import LiftOver
import requests

sys.path.append('../')
from pyphetools.creation import *

In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
metadata = MetaData(created_by="ORCID:0000-0003-2598-6622")
metadata.default_versions_with_hpo(version=hpo_version)

In [3]:
df = pd.read_excel('input/Martinez-KBG-SupplTable-340.xlsx', skiprows=1, dtype=np.str_)

In [4]:
df.head()

Unnamed: 0,Patient origin (1=our cohort; 2=literature),ID,Gender (1=male; 2=female),Intrauterin Growth Restriction (Yes/No),Length at birth (raw data),Length at birth ≤p3 (Yes/No),Weigth at birth (raw data),Weigth at birth ≤p3 (Yes/No),Pregnancy and perinatal complications,Abnormal MRI (Yes/No),...,Other dysmorphic features (Type),Suspicion of a specific genetic syndrome,ANKRD11 variant (NM_013275;arr[GRCh37]),Variant type (CNV/SNV),ANKRD11 exons involved,Number of ANKRD11 exons involved,Genes (OMIM) involved (for CNVs),CNV size,Variant origin (inheritance),ID.1
0,1,KBG1,1,Yes,p<10,,p<10,,,No,...,No,Coffin Siris syndrome,16q24.3(89336307_89354085)x1,CNV,8 to 13,6,ANKRD11,17kb,de novo,12O_KBG1
1,1,KBG2,1,Yes,p<3,Yes,p10,No,,No,...,Perthes,Cornelia de Lange syndrome,16q24.3(89256478_89506223)x1,CNV,2 to 13,12,"CDH15, ANKRD11",250kb,,12O_KBG2
2,1,KBG3,2,No,p40,No,p20,No,,Yes,...,Thin nasal tip,No,c.2398_2401del;p.Glu800Asnfs*62,SNV,9,1,,,,12O_KBG3
3,1,KBG4,2,,p<10,,p10,No,Yes,No,...,No,No,c.7083del;p.Thr2362Profs*39,SNV,9,1,,,,12O_KBG4
4,1,KBG5,1,,,,,,,Yes,...,No,No,c.1903_1907del;p.Lys635GInfs*26,SNV,9,1,,,,12O_KBG5


In [5]:
CNV = '16q24.3(89336307_89354085)x1'

In [6]:
VD = php.VariationDescriptor()

In [7]:
GD = php.GeneDescriptor()

In [8]:
GD.value_id = 'HGNC:21316'
GD.symbol = 'ANKRD11'

In [9]:
json_string = MessageToJson(GD)
print(json_string)

{
  "valueId": "HGNC:21316",
  "symbol": "ANKRD11"
}


In [10]:
term = php.OntologyClass()

In [11]:
term.label = 'chromosomal_deletion'

In [12]:
term.id = 'SO:1000029'

In [13]:
VD.structural_type.CopyFrom(term)

In [14]:
items = {
 'Intrauterin Growth Restriction (Yes/No)': ['Intrauterine growth retardation', 'HP:0001511'],
 'Length at birth ≤p3 (Yes/No)': ['Birth length less than 3rd percentile', 'HP:0003561'],
 'Weigth at birth ≤p3 (Yes/No)': ['Small for gestational age', 'HP:0001518'],
 'Pregnancy and perinatal complications': ['Abnormal delivery','HP:0001787'],
 'Vascular abnormalities (Yes/No)': ['Abnormality of the vasculature', 'HP:0002597'],
 'White matter abnormalities (Yes/No)': ['Abnormal cerebral white matter morphology', 'HP:0002500'],
 'Delayed bone age (Yes/No)': ['Delayed skeletal maturation', 'HP:0002750'],
 'Brain malformation (Yes/No)': ['Abnormality of brain morphology','HP:0012443'],
 'Altered EEG  (Yes/No)': ['EEG abnormality', 'HP:0002353'],
 'ASD (Yes/No)': ['Autistic behavior','HP:0000729'],
 'ADHD (Yes/No)': ['Attention deficit hyperactivity disorder','HP:0007018'],
 'Sleep problems (Yes/No)': ['Sleep disturbance', 'HP:0002360'],
 'Cardiopathy (Yes/No)': ['Abnormal heart morphology','HP:0001627'],
 'Crytorchidism (Yes/No/NA in males; NA in females)': ['Cryptorchidism', 'HP:0000028'],
 'Fontanele (wide/delayed closure) (Yes/No)': ['Abnormality of fontanelles', 'HP:0011328'],
 'Prognatism (Yes/No)': ['Mandibular prognathia','HP:0000303'],
 'Macrostomy (Yes/No)': ['Wide mouth','HP:0000154'],
 "Cupid's bow (Yes/No)": ["Exaggerated cupid's bow",'HP:0002263']
}

In [15]:
col_not_found = []
for col in df.columns:
    hpo_term = hpo_cr.parse_cell(col)
    if len(hpo_term) > 0:
        hpo_term = hpo_term[0]
        items[col] = [hpo_term.label, hpo_term.id]
    else:
        if items.get(col) is not None:
            continue
        else:
            col_not_found.append(col)
print(col_not_found)

['Patient origin (1=our cohort; 2=literature)', 'ID', 'Gender (1=male; 2=female)', 'Length at birth (raw data)', 'Weigth at birth (raw data)', 'Abnormal MRI (Yes/No)', 'MRI findings', 'Age at independent walking (months)', 'Age at first words (months)', 'Cardiopathy (Type)', 'Postnatal stature percentile/SDS (latest measurement)', 'Cephalic perimeter (percentile)', 'Lips shape (thin/thick, lower/upper)', 'Philtrum shape (Type)', 'Strabism (Yes/No)', 'Palpebral fissures shape (Type)', 'Other dysmorphic features (Type)', 'Suspicion of a specific genetic syndrome', 'ANKRD11 variant (NM_013275;arr[GRCh37])', 'Variant type (CNV/SNV)', 'ANKRD11 exons involved', 'Number of ANKRD11 exons involved', 'Genes (OMIM) involved (for CNVs)', 'CNV size', 'Variant origin (inheritance)', 'ID.1']


In [16]:
column_mapper_d = defaultdict(ColumnMapper)

item_column_mapper_d = hpo_cr.initialize_simple_column_maps(column_name_to_hpo_label_map=items, observed='Yes', excluded='No', non_measured='NA')
print(f"We created {len(item_column_mapper_d)} simple column mappers")

for k, v in item_column_mapper_d.items():
    column_mapper_d[k] = v



We created 58 simple column mappers


In [17]:
MRI_custom_map = {
    'mild posterior leukoencephalopathy':'Posterior leukoencephalopathy',
    'Parieto-occipital leukoencephalopathy': 'Leukoencephalopathy',
    'Cardiac septal thickening': 'Ventricular septal hypertrophy',
    'parietal hemorrhagic lesion': 'Abnormal parietal bone morphology',
    'Megacisterna magna': 'Enlarged cisterna magna',
    'MegaCisterna magna': 'Enlarged cisterna magna',
    'megaCisterna magna': 'Enlarged cisterna magna',
    'Cerebellar angioma': 'Cerebral venous angioma',
    'hypoplasia of cerebellar verm': "Cerebellar vermis hypoplasia",
    'hippocampal malrotation': 'Hippocampal malrotation',
    'megacisterna magna': 'Enlarged cisterna magna',
    'vermis hypoplasia': 'Cerebellar vermis hypoplasia'
    ## Could be Continuted
}

Cardiopathy_CM = {
    'Septal thickening': 'Interlobular septal thickening',
    'Atriovetricular canal': 'Atrioventricular canal defect',
    'cleft mitral': 'Cleft anterior mitral valve leaflet',
    'Congenital heart disease': 'Abnormal heart morphology',
    '2nd degree block': 'Heart block'
}

Cardiopathy_Map = CustomColumnMapper(concept_recognizer=hpo_cr, custom_map_d=Cardiopathy_CM)

Custom_Mapper = CustomColumnMapper(concept_recognizer=hpo_cr, custom_map_d=Cardiopathy_CM)
Custom_Mapper.preview_column(df['Cardiopathy (Type)'])

# 'MRI findings', 
# 'Cardiopathy (Type)', 
# 'Postnatal stature percentile/SDS (latest measurement)', 
# 'Cephalic perimeter (percentile)', 
# 'Lips shape (thin/thick, lower/upper)', 
# 'Philtrum shape (Type)', 
# 'Strabism (Yes/No)', 
# 'Palpebral fissures shape (Type)', 
# 'Other dysmorphic features (Type)', 
# 'Suspicion of a specific genetic syndrome', 
# 'ANKRD11 variant (NM_013275;arr[GRCh37])', 
# 'Variant type (CNV/SNV)', 
# 'ANKRD11 exons involved', 
# 'Number of ANKRD11 exons involved', 
# 'Genes (OMIM) involved (for CNVs)', 
# 'CNV size', 
# 'Variant origin (inheritance)'


Unnamed: 0,column,terms
0,Patent foramen ovale in the first month,Patent foramen ovale (HP:0001655)
1,No,
2,Patent foramen ovale. Mild pulmonary stenosis,Pulmonic stenosis (HP:0001642); Patent foramen ovale (HP:0001655)
3,,
4,Spontaneous closure of ventricular septal defect,Ventricular septal defect (HP:0001629)
5,"Bicuspid aortic valve, moderate valvular pulmonary stenosis",Pulmonic stenosis (HP:0001642); Bicuspid aortic valve (HP:0001647)
6,Septal thickening,Interlobular septal thickening (HP:0030879)
7,Atriovetricular canal,Atrioventricular canal defect (HP:0006695)
8,"Valvular pulmonary stenosis, solved",Pulmonic stenosis (HP:0001642)
9,CIA OP con cleft mitral,Cleft anterior mitral valve leaflet (HP:0011569)


In [18]:
other_d = {'Perthes':'Avascular necrosis of the capital femoral epiphysis',
           '3 café-au-lait spots < 5 mm':'Few cafe-au-lait spots',
           'Anteverted nose tip': 'Anteverted nares',
           'short forehead': 'Small forehead', 
           'Long and bushy eyelashes': 'Long eyelashes',
           'long bushy eyelashes': 'Long eyelashes',
           'left preauricular appendix':'Preauricular skin tag',
           'Severe shortening of bilateral 4th metatarsal of the foot':'Short fourth metatarsal',
           'Fetal pads':'Prominent fingertip pads',
           'Helix root preauricular fistula':'Preauricular pit',
           'Epicantus':'Epicanthus',
           'tricomegalia': 'Long eyelashes',
           'blue esclera': 'Blue sclerae',
           'broad, flattened thorax':'Shield chest',
           'Externally rotated ears': 'Posteriorly rotated ears',
           'Rotated ears': 'Posteriorly rotated ears',
           'Flat left pinna': 'Abnormal pinna morphology',
           'Frontal upsweep': 'Frontal upsweep of hair',
           'Dysplastic left ear': 'Abnormal pinna morphology', 
           'Pits on lobes': 'Abnormal earlobe morphology',
           'Indented earlobes': 'Abnormal earlobe morphology',
           'Fleshy ear lobules': 'Large earlobe',
           'Transverse ear crease': 'Linear earlobe crease'
          }


otherMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=other_d)
otherMapper.preview_column(df['Other dysmorphic features (Type)']).replace('n/a', np.nan).dropna()


Unnamed: 0,terms
1,HP:0005743 (Avascular necrosis of the capital femoral epiphysis/observed)
2,HP:0011832 (Narrow nasal tip/observed)
8,HP:0007429 (Few cafe-au-lait spots/observed)
11,HP:0000463 (Anteverted nares/observed)
20,HP:0000350 (Small forehead/observed); HP:0000527 (Long eyelashes/observed)
21,HP:0000527 (Long eyelashes/observed); HP:0000384 (Preauricular skin tag/observed); HP:0002000 (Short columella/observed)
23,HP:0004689 (Short fourth metatarsal/observed)
38,HP:0000274 (Small face/observed)
46,HP:0001212 (Prominent fingertip pads/observed)
51,HP:0002558 (Supernumerary nipple/observed)


In [19]:
lift = LiftOver('hg19', 'hg38')

In [22]:
count = 0

URL = "https://rest.variantvalidator.org/VariantValidator/variantvalidator/%s/%s%%3A%s/%s?content-type=application%%2Fjson"

tempArray = []
for val in df['ANKRD11 variant (NM_013275;arr[GRCh37])']: 

    want = re.findall(r'(.*);|\((.*)\)', str(val))
    #print(want)
    if len(want) == 0:
        tempArray.append(None)
        count = count +1
        continue
    want = want[0]
    if len(want[0]) == 0:
        subWant = want[1]
        if '_' in subWant:
            coor1, coor2 = subWant.split('_')
        elif '-' in subWant:
            coor1, coor2 = subWant.split('-')
        #print(coor1)
        #print(coor2)
        a1, new1, b1, c1 = lift.convert_coordinate('chr16', int(coor1))[0]
        a2, new2, b2, c2 = lift.convert_coordinate('chr16', int(coor2))[0]
        #print(new1)
        #print(new2)
        FullURL = URL % ('hg38','NC_000016.10','g.' + str(new1) + '_' + str(new2) + 'del', 'refseq_select')
        print(FullURL)
        response = requests.get(FullURL)
        try:
            response = response.json()
        except:
            tempArray.append(None)
            count = count +1
            print (count)
            print(f"{subWant} did not work")
            continue
        var_key = [k for k in response.keys() if k not in {'flag', 'metadata'}][0]
        more_dict = response.get(var_key)
        if more_dict.get('hgvs_transcript_variant') == "":
            finalWant = more_dict.get('primary_assembly_loci').get('hg38').get('hgvs_genomic_description')
        else:
            finalWant = more_dict.get('hgvs_transcript_variant')
    else:
        finalWant = want[0]
    print(finalWant)
    tempArray.append(finalWant)
    count = count +1
    print(count)
df['BetterVariants'] = tempArray
#df.at[0, 'BetterVariants'] = 'g.89336307_89354085del'
print(df['BetterVariants'])

https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NC_000016.10%3Ag.89269899_89287677del/refseq_select?content-type=application%2Fjson
NM_013275.6:c.744+852_7806+919del
1
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NC_000016.10%3Ag.89190070_89439815del/refseq_select?content-type=application%2Fjson
NC_000016.10:g.89190070_89439815del
2
c.2398_2401del
3
c.7083del
4
c.1903_1907del
5
c.7407C>G
6
c.6691dup
7
c.3590_3594del
8
c.3590_3594del
9
c.6792dup
10
c.6792dup
11
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NC_000016.10%3Ag.88788349_89454555del/refseq_select?content-type=application%2Fjson


KeyboardInterrupt: 

In [None]:
print(df['BetterVariants'])

In [None]:
import requests
from pyliftover import LiftOver
URL = "https://rest.variantvalidator.org/VariantValidator/variantvalidator/%s/%s%%3A%s/%s?content-type=application%%2Fjson"


print(FullURL)
requests.get(FullURL)

#varVal = VariantValidator('hg19', 'NC_000016.9')
#varVal.encode_hgvs('g.(?_89334866)_(89383447_?)del')

In [None]:
genome = 'hg38'
default_genotype = 'heterozygous'
transcript = 'NM_013275.6'
varMapper = VariantColumnMapper(assembly=genome,column_name='BetterVariants', 
                                transcript=transcript, default_genotype=default_genotype)

In [None]:
sexMapper = SexColumnMapper(male_symbol='1', female_symbol='2', unknown_symbol='NA', column_name='Gender (1=male; 2=female)')
ageMapper = AgeColumnMapper.by_year('Age at independent walking (months)')
sexMapper.preview_column(df['Gender (1=male; 2=female)'])

In [None]:
individual_colname = 'ID'
pmid = 'PMID:36446582'

In [None]:
encoder = CohortEncoder(df=df, hpo_cr=hpo_cr, column_mapper_d=column_mapper_d, 
                          individual_column_name=individual_colname,
                            sexmapper=sexMapper,
                        agemapper=ageMapper,
                            variant_mapper=varMapper,
                        metadata=metadata,
                       pmid=pmid)

In [None]:
encoder.set_disease(disease_id='OMIM:148050', label='KBG syndrome')

In [None]:
encoder.preview_dataframe()

In [None]:
encoder.output_phenopackets(outdir='phenopackets')