# Yamaguchi T et al. (2023) COL3A1

Data from [Yamaguchi T et al. (2023) Comprehensive genetic screening for vascular Ehlers-Danlos syndrome through an amplification-based next-generation sequencing system. Am J Med Genet A. 2023 Jan;191(1):37-51.](https://pubmed.ncbi.nlm.nih.gov/36189931/)

In [9]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import os
import sys
import numpy as np
from IPython.display import HTML, display
from pyphetools.creation import *
from pyphetools.visualization import *
import pyphetools
print(f"pyphetools version {pyphetools.__version__}")

pyphetools version 0.8.2


In [2]:
# Publication identifier and title; both are handed to MetaData below.
PMID = "PMID:36189931"
title = "Comprehensive genetic screening for vascular Ehlers-Danlos syndrome through an amplification-based next-generation sequencing system"

# One HpoParser instance supplies both the concept recognizer and the HPO version string.
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()

# Metadata object for this publication, stamped with the HPO version obtained above.
metadata = MetaData(
    created_by="ORCID:0000-0002-5648-2155",
    pmid=PMID,
    pubmed_title=title,
)
metadata.default_versions_with_hpo(version=hpo_version)

In [5]:
# Load the cohort table from the Excel file; the path is relative to the
# notebook's working directory, so the notebook must be run from its own folder.
df = pd.read_excel('input/Yamaguchi_2022_PMID_36189931.xlsx')

In [6]:
# Preview the first rows to check the column names and the symbols used in the
# phenotype cells before configuring the column mappers.
df.head()

Unnamed: 0,Patient,Variants (NM_000090.3),Suspected disease,Age at genetic diagnosis (years),Sex,Aortic dissection,Aortic rupture,Arterial dissection,Arterial rupture,Uterine rupture,...,Spontaneous pneumothorax,Acrogeria,Talipes equinovarus,Congenital hip dislocation,Hypermobility of small joints,Tendon and muscle rupture,Gingival recession/fragility,Keratoconus,Early onset varicose veins,Family history
0,1,c.547G>A:p.Gly183Ser,vEDS,59,F,−,−,−,+,,...,−,−,−,−,−,−,−,,−,−
1,2,c.547G>A:p.Gly183Ser,vEDS,29,F,−,−,+,−,−,...,−,−,−,−,−,−,−,,−,"Mother, died, carotid-cavernous sinus fistula"
2,3,c.556G>A:p.Gly186Ser,vEDS,d.15,M,+,+,−,−,,...,−,−,−,−,−,−,−,,−,"Mother, died at 30s, subarachnoid hemorrhage"
3,4,c.565G>C:p.Gly189Arg,vEDS,45,F,−,−,+,−,−,...,−,−,−,−,−,−,−,,−,−
4,5,c.583G>A:p.Gly195Arg,vEDS,17,F,−,−,−,−,,...,−,−,−,−,+,−,−,,−,"Mother, died at 32 years, aortic dissection; grandfather, died at 30s, aortic dissection"


In [11]:
# Try to map each column header of the table to an HPO term automatically.
#
# The table marks an absent feature with U+2212 MINUS SIGN ("−"), as the
# df.head() preview shows, not with the ASCII hyphen-minus "-" (U+002D).
# The two characters look alike but never compare equal, so the excluded
# symbol is written as an explicit escape to keep the difference visible.
hpo_cr = parser.get_hpo_concept_recognizer()
generator = SimpleColumnMapperGenerator(df=df, observed='+', excluded='\u2212', hpo_cr=hpo_cr)
# Result is keyed by column name; further mappers are added to it in later cells.
column_mapper_d = generator.try_mapping_columns()

In [13]:
# Render the generator's HTML summary of which columns were and were not mapped.
display(HTML(generator.to_html()))

Result,Columns
Mapped,Aortic dissection; Aortic rupture; Arterial dissection; Arterial rupture; Uterine rupture; Easy bruising; Spontaneous pneumothorax; Talipes equinovarus; Congenital hip dislocation; Keratoconus
Unmapped,"Patient; Variants (NM_000090.3); Suspected disease; Age at genetic diagnosis (years); Sex; Sigmoid colon perforation; Carotid-cavernous sinus fistula; Thin, translucent skin; Characteristic facial features; Acrogeria; Hypermobility of small joints; Tendon and muscle rupture; Gingival recession/fragility; Early onset varicose veins; Family history"


In [19]:
# Columns that are intentionally left without a mapper:
#   "Acrogeria"                        -- needs an HPO term
#   "Carotid-cavernous sinus fistula"  -- needs an HPO term
#   "Characteristic facial features"   -- too general to code
# TODO(review): "Sigmoid colon perforation" is also reported as unmapped by the
# generator and is not handled here -- confirm whether it should be coded.

# Manual mappings for columns the automatic step could not resolve:
# column header -> [HPO label, HPO id]
feature_d = {
    "Early onset varicose veins": ["Varicose veins", "HP:0002619"],
    "Gingival recession/fragility": ["Gingival fragility", "HP:0034518"],
    "Tendon and muscle rupture": ["Tendon rupture", "HP:0100550"],
    "Hypermobility of small joints": ["Finger joint hypermobility", "HP:0006094"],
    "Thin, translucent skin": ["Dermal translucency", "HP:0010648"],
}

# The table marks an absent feature with U+2212 MINUS SIGN ("−"), not the ASCII
# hyphen-minus "-"; the escape makes the otherwise invisible difference explicit.
for column_name, (hpo_label, hpo_id) in feature_d.items():
    column_mapper_d[column_name] = SimpleColumnMapper(
        hpo_id=hpo_id,
        hpo_label=hpo_label,
        observed="+",
        excluded="\u2212",
    )

In [22]:
# Keep only the text before the first ":" (the cDNA part of "c.…:p.…");
# entries that contain no ":" are carried over unchanged.
df["hgvs"] = df["Variants (NM_000090.3)"].map(lambda entry: entry.partition(":")[0])

In [28]:
# Gene identifier and transcript used when encoding the COL3A1 variants.
col3a1_id = "HGNC:2201"
col3a1_transcript = "NM_000090.3"
vvalidator = VariantValidator(genome_build="hg38", transcript=col3a1_transcript)

# Build a lookup from each distinct string in df["hgvs"] to its encoded variant.
# NOTE(review): for "c.[1546G>T;1556G>T]" the printed result shows a single
# genomic change (chr2:188995728G>T) -- confirm the second substitution is not lost.
variant_d = {}
for hgvs_expr in df["hgvs"].unique():
    if hgvs_expr != "ex. 24–33 deletion":
        # Regular HGVS expression: encode it through the validator.
        variant_d[hgvs_expr] = vvalidator.encode_hgvs(hgvs_expr)
        continue
    # The multi-exon deletion is not an HGVS expression; encode it as a structural variant.
    variant_d[hgvs_expr] = StructuralVariant.chromosomal_deletion(
        cell_contents=hgvs_expr,
        gene_symbol="COL3A1",
        gene_id=col3a1_id,
    )

https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.547G>A/NM_000090.3?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.556G>A/NM_000090.3?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.565G>C/NM_000090.3?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.583G>A/NM_000090.3?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.598C>T/NM_000090.3?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.659_664del/NM_000090.3?content-type=application%2Fjson
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_000090.3%3Ac.665G>A/NM_000090.3?content-type=application%2Fjson
https://rest.var

In [26]:
# Show every input string next to the variant object it was encoded to.
for hgvs_expr, encoded in variant_d.items():
    print(hgvs_expr, encoded)

c.547G>A chr2:188988099G>A
c.556G>A chr2:188988108G>A
c.565G>C chr2:188988117G>C
c.583G>A chr2:188988590G>A
c.598C>T chr2:188988605C>T
c.659_664del chr2:188989416TGCTATA>T
c.665G>A chr2:188989424G>A
c.724C>T chr2:188990129C>T
c.754G>A chr2:188990316G>A
c.755G>A chr2:188990317G>A
c.763G>T chr2:188990325G>T
c.848T>A chr2:188991053T>A
c.897+2T>G chr2:188991533T>G
c.897+2T>A chr2:188991533T>A
c.951+5G>C chr2:188991727G>C
c.1194+1G>A chr2:188994083G>A
c.1330G>A chr2:188994577G>A
c.1346G>T chr2:188994593G>T
c.[1546G>T;1556G>T] chr2:188995728G>T
c.1662+1G>A chr2:188996179G>A
c.1862G>A chr2:188997382G>A
c.1977+5G>C chr2:188998324G>C
c.2134_2160del chr2:188999469GGGTGCTGCTGGTCCTCCTGGGCCACCT>G
c.2283+5G>T chr2:188999900G>T
c.2356G>A chr2:189001554G>A
c.2357G>A chr2:189001555G>A
c.2518G>A chr2:189003027G>A
c.2815G>A chr2:189004135G>A
c.2869G>A chr2:189004302G>A
c.2870G>T chr2:189004303G>T
c.3256G>C chr2:189007500G>C
c.3338G>A chr2:189007582G>A
c.3525+1G>A chr2:189008143G>A


In [27]:
# Display the derived "hgvs" column to check that every row yielded a usable variant string.
df["hgvs"]

0                c.547G>A
1                c.547G>A
2                c.556G>A
3                c.565G>C
4                c.583G>A
5                c.598C>T
6            c.659_664del
7                c.665G>A
8                c.724C>T
9                c.754G>A
10               c.755G>A
11               c.763G>T
12               c.848T>A
13             c.897+2T>G
14             c.897+2T>A
15             c.951+5G>C
16            c.1194+1G>A
17              c.1330G>A
18              c.1346G>T
19    c.[1546G>T;1556G>T]
20            c.1662+1G>A
21              c.1862G>A
22            c.1977+5G>C
23         c.2134_2160del
24            c.2283+5G>T
25              c.2356G>A
26              c.2357G>A
27              c.2518G>A
28              c.2815G>A
29              c.2869G>A
30              c.2870G>T
31              c.3256G>C
32              c.3338G>A
33            c.3525+1G>A
34     ex. 24–33 deletion
Name: hgvs, dtype: object

In [29]:
# List the distinct values of the "Suspected disease" column.
# The original note also named the columns "Age at genetic diagnosis (years)"
# and "Sex" -- presumably the next columns to process; confirm.
df["Suspected disease"].unique()

array(['vEDS', 'LDS/FTAAD', 'FTAAD'], dtype=object)

In [30]:
disease_d = {}
disease_d['vEDS'] = Disease(disease_id="OMIM:130050", disease_label="Ehlers-Danlos syndrome, vascular type")