<H1>Sulfite Oxidase Deficiency SUOX</H1>
<P>Data from <a href="https://pubmed.ncbi.nlm.nih.gov/36303223/" target="__blank">Li JT, et al. (2022) Mutation analysis of SUOX in isolated sulfite oxidase deficiency with ectopia lentis as the presenting feature: insights into genotype-phenotype correlation. Orphanet J Rare Dis.17(1):392. PMID:36303223</a>.</P>
<P>We transferred information from Additional Files 5, 6, and 7 to two Excel files to parse the data.</P>

In [1]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
import math
from csv import DictReader
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import re
import pyphetools
from pyphetools.creation import *
from pyphetools.visualization import PhenopacketTable
print(f"pyphetools version {pyphetools.__version__}")

pyphetools version 0.7.4


In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
PMID = "PMID:36303223"
title = "Mutation analysis of SUOX in isolated sulfite oxidase deficiency with ectopia lentis as the presenting feature: insights into genotype-phenotype correlation"
metadata = MetaData(created_by="ORCID:0000-0003-2598-6622", pmid=PMID, pubmed_title=title)
metadata.default_versions_with_hpo(version=hpo_version)

<H2>SUOX variants</H2>
<P>The file Li-SUOX-Variants.xlsx has one variant per line, assigned to the patient ID.</P>
<p>Note that one of the reported variants is erroneous according to Variant Validator:</p>
<pre>NM_001032386.2:c.1355C>A: Variant reference (C) does not agree with reference sequence (G)</pre>
<p>This is how the variant was reported in the original publication. We changed the C to a G and obtain the
same amino acid change as reported in the original publication: <tt>NP_001027558.1:p.(G452D)</tt>.</p>

In [24]:
variant_df = pd.read_excel('input/Li-SUOX-Variants.xlsx', na_values=['n.a.'])
variant_df.head()

Unnamed: 0,Proband ID,ID,Nucleotide,Amino acid,Exon,Domain,Status
0,1,M1,c.433delC,p.Q145Sfs*16,EX6,Cyt-b5 domain,Homo
1,2,M2,c.650G>A,p.R217Q,EX6,Moco domain,Homo
2,3,M3,c.794C>A,p.A265D,EX6,Moco domain,Com het
3,3,M4,c.1280C>A,p.S427*,EX6,Homodimerization domain,
4,4,M5,c.733_736delCTTT,p.L245Pfs*27,EX6,Moco domain,Homo


In [34]:
hg38 = "hg38"
SUOX_transcript = 'NM_001032386.2'
vvalidator = VariantValidator(genome_build=hg38, transcript=SUOX_transcript)
patient_id_to_variant_list_d = defaultdict(list)
all_variants = set()
for _, row in variant_df.iterrows():
    proband = row['Proband ID']
    var = row['Nucleotide']
    if var == "c.1355C>A":
        var = "c.1355G>A" # repair error (see above)
    patient_id_to_variant_list_d[proband].append(var)
    all_variants.add(var)
variant_d = {}
for v in all_variants:
    print(f"variant validator for {v}")
    var = vvalidator.encode_hgvs(v)
    variant_d[v] = var

variant validator for c.1280C>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032386.2%3Ac.1280C>A/NM_001032386.2?content-type=application%2Fjson
variant validator for c.1201A>G
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032386.2%3Ac.1201A>G/NM_001032386.2?content-type=application%2Fjson
variant validator for c.649C>G
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032386.2%3Ac.649C>G/NM_001032386.2?content-type=application%2Fjson
variant validator for c.1136A>G
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032386.2%3Ac.1136A>G/NM_001032386.2?content-type=application%2Fjson
variant validator for c.1348T>C
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM_001032386.2%3Ac.1348T>C/NM_001032386.2?content-type=application%2Fjson
variant validator for c.1355G>A
https://rest.variantvalidator.org/VariantValidator/variantvalidator/hg38/NM

<H1>Clinical data</H1>

In [35]:
df_clinical = pd.read_excel("input/Li-Suox-Clinical.xlsx")
df_clinical.head()

Unnamed: 0,Proband ID,PMID,Ethnicity,Gender,Parental consanguity,Age at onset (months),Variants,Amino acid,status,Typical type/Mild type,...,Homocys (umol/L) NR: 5-15,Cys (umol/L) NR: 20-70,UA (umol/L) NR: 210-430,Sulfite (mg/L) NR: 0,Thiosulfate NR: 0,Urine SSC (umol/mmolCr) NR: 0.1-10,Urine Taurine (mmol/molCr) NR: 12-150,Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr,Urine HypoXA NR: 0-0.18mmol/L or <8umol/mmolCr or <0.5HypoXA/Cr,Urine UA NR: 0.44-4.50mmol/L or 50-980umol/mmolCr
0,1,9050047,EUR,M,No,0,c.(433delC); (433delC),p.(Q145Sfs*16); (Q145Sfs*16),Homo,T,...,n.a.,2,n.a.,20-25,n.a.,320umol/L,95,n.a.,n.a.,n.a.
1,2,9600976,EUR,F,Yes,5,c.(650G>A); (650G>A),p.(R217Q); (R217Q),Homo,T,...,n.a.,n.a.,n.a.,0.108-0.211,0.297-1.632mmol/L,240umol/L,n.a.,0.04mmol/L,0.05mmol/L,0.14mmol/L
2,3,10519592,NAM,M,No,0,c.(794C>A); (1280C>A),p.(A265D); (S427*),Com het,T,...,n.a.,n.a.,normal,80-100,n.a.,690umol/L,n.a.,normal,normal,normal
3,4,12112661,n.a.,n.a.,Yes,n.a.,c.(733_736delCTTT); (733_736delCTTT),p.(L245Pfs*27); (L245Pfs*27),Homo,T,...,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.
4,5,12112661,n.a.,n.a.,Yes,n.a.,c.(284_285insC); (1126C>T),p.(E97*); (R376C),Com het,T,...,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.


In [36]:
df_clinical.columns

Index(['Proband ID', 'PMID', 'Ethnicity', 'Gender', 'Parental consanguity',
       'Age at onset (months)', 'Variants', 'Amino acid', 'status',
       'Typical type/Mild type', 'Death', 'Age of death (months)',
       'Prodromal infection', 'Developmental delay', 'Regression', 'Seizure',
       'Extrapyramidal symptoms', 'Hypertonia', 'Hypotonia', 'Microcephaly',
       'Ectopia lentis', 'Age of diagnosis of ophthalmic manifestations',
       'Proband ID.1', 'Resource (PMID)', 'SSC (umol/L) NR: 0',
       'Taurine (umol/L) NR: 15-145', 'Homocys (umol/L) NR: 5-15',
       'Cys (umol/L) NR: 20-70', 'UA (umol/L) NR: 210-430',
       'Sulfite (mg/L) NR: 0', 'Thiosulfate NR: 0 ',
       'Urine SSC (umol/mmolCr) NR: 0.1-10',
       'Urine Taurine (mmol/molCr) NR: 12-150',
       'Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr',
       'Urine HypoXA NR: 0-0.18mmol/L or <8umol/mmolCr or <0.5HypoXA/Cr',
       'Urine UA NR: 0.44-4.50mmol/L or 50-980umol/mmolCr'],
      dtype='object'

In [37]:
column_mapper_d = {}

items = {
    'Neurological manifestations_Developmental delay': ['Neurodevelopmental delay', 'HP:0012758'],
    'Neurological manifestations_Regression': ['Cognitive regression', 'HP:0034332'],
    'Neurological manifestations_Seizure': ['Seizure', 'HP:0001250'],
    'Neurological manifestations_Extrapyramidal symptoms': ['Abnormality of extrapyramidal motor function', 'HP:0002071'],
    'Neurological manifestations_Hypertonia':['Hypertonia','HP:0001276'],
    'Neurological manifestations_Hypotonia': ['Hypotonia','HP:0001252'],
    'Neurological manifestations_Microcephaly':['Microcephaly', 'HP:0000252'],
    'Opthalmic manifestations_Ectopia lentis':['Ectopia lentis', 'HP:0001083'],
}

item_column_mapper_d = hpo_cr.initialize_simple_column_maps(column_name_to_hpo_label_map=items, 
                                                            observed='+',
                                                            excluded='-')
print(f"We created {len(item_column_mapper_d)} simple column mappers")
# Transfor to column_mapper_d
for k, v in item_column_mapper_d.items():
    column_mapper_d[k] = v

We created 8 simple column mappers


<H2>Threshold mappers</H2>
<p>The data contain information about biochemical abnormalities framed as tests with reference ranges and values. We can capture this using threshold mappers</p>

<h3>SSC (umol/L) NR: 0</h3>
<p>SSC refers to S-sulfocysteine; The normal range is absent (not more than zero). The corresponding
HPO term is Elevated circulating S-sulfocysteine concentration HP:0034745.</p>

In [39]:
df_clinical['SSC (umol/L) NR: 0'].unique()

array(['n.a.', 28, 14], dtype=object)

In [43]:
sscMapper = ThresholdedColumnMapper(hpo_id="HP:0034745", 
                                    hpo_label="Elevated circulating S-sulfocysteine concentration",
                                    threshold=0,
                                   call_if_above=True)
sscMapper.preview_column(df_clinical['SSC (umol/L) NR: 0'])
column_mapper_d['SSC (umol/L) NR: 0'] = sscMapper

In [44]:
df_clinical['Taurine (umol/L) NR: 15-145'].unique()

array([197, 'n.a.', 46], dtype=object)

In [46]:
# Hypertaurinemia HP:0500181
taurineMapper = ThresholdedColumnMapper(hpo_id="HP:0500181",
                                        hpo_label="Hypertaurinemia",
                                        threshold=145,
                                        call_if_above=True)
taurineMapper.preview_column(df_clinical['Taurine (umol/L) NR: 15-145'])
column_mapper_d['Taurine (umol/L) NR: 15-145'] = taurineMapper

In [48]:
# 'Homocys (umol/L) NR: 5-15' -- Hyperhomocystinemia HP:0002160
homocyteineMapper = ThresholdedColumnMapper(hpo_id="HP:0002160",
                                            hpo_label="Hyperhomocystinemia",
                                            threshold=15,
                                            call_if_above=True)
homocyteineMapper.preview_column(df_clinical['Homocys (umol/L) NR: 5-15'])
column_mapper_d['Homocys (umol/L) NR: 5-15'] = homocyteineMapper

In [51]:
# 'Cys (umol/L) NR: 20-70' -- 
# Note this manifests as low circulate Cystine (not Cysteine)
# Hypocystinemia HP:0500152

cystineMapper = ThresholdedColumnMapper(hpo_id="HP:0500152",
                                        hpo_label="Hypocystinemia",
                                        threshold=20,
                                        call_if_above=False)
cystineMapper.preview_column(df_clinical['Cys (umol/L) NR: 20-70'])
column_mapper_d['Cys (umol/L) NR: 20-70'] = cystineMapper

In [55]:
# 'UA (umol/L) NR: 210-430'  -- Hypouricemia HP:0003537
uricAcidMapper = ThresholdedColumnMapper(hpo_id="HP:0003537",
                                         hpo_label="Hypouricemia",
                                         threshold=210,
                                         call_if_above=False)
uricAcidMapper.preview_column(df_clinical['UA (umol/L) NR: 210-430'])
column_mapper_d['UA (umol/L) NR: 210-430'] = uricAcidMapper

In [57]:
# 'Sulfite (mg/L) NR: 0' 
# df_clinical['Sulfite (mg/L) NR: 0']
# requires new HPO term

In [58]:
# 'Thiosulfate NR: 0 ' -- requires new HPO term

In [62]:
# 'Urine SSC (umol/mmolCr) NR: 0.1-10' -- Sulfocysteinuria HP:0032350
urineSscMapper = ThresholdedColumnMapper(hpo_id="HP:0032350",
                                         hpo_label="Sulfocysteinuria",
                                        threshold=10,
                                        call_if_above=True)
urineSscMapper.preview_column(df_clinical['Urine SSC (umol/mmolCr) NR: 0.1-10'])
column_mapper_d['Urine SSC (umol/mmolCr) NR: 0.1-10'] = urineSscMapper

In [64]:
# 'Urine Taurine (mmol/molCr) NR: 12-150'  -- Increased urinary taurine HP:0003166
urineTaurineMapper = ThresholdedColumnMapper(hpo_id="HP:0003166",
                                            hpo_label="Increased urinary taurine",
                                            threshold=150,
                                            call_if_above=True)
urineTaurineMapper.preview_column(df_clinical['Urine Taurine (mmol/molCr) NR: 12-150'])
column_mapper_d['Urine Taurine (mmol/molCr) NR: 12-150'] = urineTaurineMapper

In [75]:
# 'Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr'
# Here we need to use an OptionColumnMapper because three different measurement ranges are used
df_clinical['Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr'].unique()
urine_xa_d = { '11.7umol/mmolCr':"Xanthinuria",
       '1.7umol/mmolCr':"Xanthinuria" }
urine_not_xa_d = {'0.04mmol/L': "Xanthinuria", 
                  "1.6umol/mmolCr": "Xanthinuria", 
                  "0.0214XA/Cr": "Xanthinuria",
                 "normal": "Xanthinuria"}
urineXAmapper = OptionColumnMapper(concept_recognizer=hpo_cr, 
                                   option_d=urine_xa_d, 
                                   excluded_d=urine_not_xa_d)
urineXAmapper.preview_column(df_clinical['Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr'])

#df_clinical['Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr'].unique()
# TODO CHECK ME -- thereshould be saome observed terms.

Unnamed: 0,terms
0,
1,HP:0010934 (Xanthinuria/excluded)
2,HP:0010934 (Xanthinuria/excluded)
3,
4,
5,
6,
7,
8,
9,


In [78]:
# 'Urine HypoXA NR: 0-0.18mmol/L or <8umol/mmolCr or <0.5HypoXA/Cr',
# Increased urinary hypoxanthine level HP:0011814
urine_hxa_d = {
    '8umol/mmolCr': "Increased urinary hypoxanthine level",
}
urine_hxa_excluded_d = {
    'normal': "Increased urinary hypoxanthine level",
    '0.05mmol/L': "Increased urinary hypoxanthine level",
    '0.0264HypoXA/Cr': "Increased urinary hypoxanthine level",
}
urineHXAmapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=urine_hxa_d, excluded_d=urine_hxa_excluded_d)
urineHXAmapper.preview_column(df_clinical['Urine HypoXA NR: 0-0.18mmol/L or <8umol/mmolCr or <0.5HypoXA/Cr'])

# TODO CHECK

Unnamed: 0,terms
0,
1,HP:0011814 (Increased urinary hypoxanthine level/excluded)
2,HP:0011814 (Increased urinary hypoxanthine level/excluded)
3,
4,
5,
6,
7,
8,
9,


In [80]:
# 'Urine UA NR: 0.44-4.50mmol/L or 50-980umol/mmolCr'
# Hyperuricosuria HP:0003149
# Decreased urinary urate HP:0011935
# Abnormality of urinary uric acid level HP:0012610
df_clinical['Urine UA NR: 0.44-4.50mmol/L or 50-980umol/mmolCr'].unique()

array(['n.a.', '0.14mmol/L', 'normal', '385umol/mmolCr', '21umol/mmolCr',
       '430umol/mmolCr'], dtype=object)

In [81]:
urine_ua_d = {'0.14mmol/L': "Decreased urinary urate",
             '21umol/mmolCr': "Decreased urinary urate",}
urine_ua_excluded_d = {'normal' : "Abnormality of urinary uric acid level",
                      '385umol/mmolCr': "Abnormality of urinary uric acid level",
                      '430umol/mmolCr': "Abnormality of urinary uric acid level",}
urineUaMapper = OptionColumnMapper(concept_recognizer=hpo_cr,
                                  option_d=urine_ua_d,
                                  excluded_d=urine_ua_excluded_d)
urineUaMapper.preview_column(df_clinical['Urine UA NR: 0.44-4.50mmol/L or 50-980umol/mmolCr'])

Unnamed: 0,terms
0,
1,
2,HP:0012610 (Abnormality of urinary uric acid level/excluded)
3,
4,
5,
6,
7,
8,
9,


<H2>Putting it all together</H2>
<p>First, let's create a new individual id column that shows the PMID</p>

In [90]:
df_clinical.apply(lambda row: f"individual_{row.loc['Proband ID']}")

KeyError: 'Proband ID'

In [104]:
def get_individual_id(arr):
    iid = arr.iloc[0]
    pmid = arr.iloc[1]
    if pmid == "our patient":
        return "individual_35_PMID_36303223" # from current manuscript
    else:
        return f"individual_{iid}_PMID_{pmid}"
df_clinical['individual_id'] = df_clinical[['Proband ID', 'Resource (PMID)']].apply(lambda x: get_individual_id(x), axis=1)

Unnamed: 0,Proband ID,PMID,Ethnicity,Gender,Parental consanguity,Age at onset (months),Variants,Amino acid,status,Typical type/Mild type,...,Cys (umol/L) NR: 20-70,UA (umol/L) NR: 210-430,Sulfite (mg/L) NR: 0,Thiosulfate NR: 0,Urine SSC (umol/mmolCr) NR: 0.1-10,Urine Taurine (mmol/molCr) NR: 12-150,Urine XA NR: 0-0.46mmol/L or <40umol/mmolCr or <0.29XA/Cr,Urine HypoXA NR: 0-0.18mmol/L or <8umol/mmolCr or <0.5HypoXA/Cr,Urine UA NR: 0.44-4.50mmol/L or 50-980umol/mmolCr,individual_id
0,1,9050047,EUR,M,No,0,c.(433delC); (433delC),p.(Q145Sfs*16); (Q145Sfs*16),Homo,T,...,2,n.a.,20-25,n.a.,320umol/L,95,n.a.,n.a.,n.a.,individual_1_PMID_9050047
1,2,9600976,EUR,F,Yes,5,c.(650G>A); (650G>A),p.(R217Q); (R217Q),Homo,T,...,n.a.,n.a.,0.108-0.211,0.297-1.632mmol/L,240umol/L,n.a.,0.04mmol/L,0.05mmol/L,0.14mmol/L,individual_2_PMID_9600976
2,3,10519592,NAM,M,No,0,c.(794C>A); (1280C>A),p.(A265D); (S427*),Com het,T,...,n.a.,normal,80-100,n.a.,690umol/L,n.a.,normal,normal,normal,individual_3_PMID_10519592
3,4,12112661,n.a.,n.a.,Yes,n.a.,c.(733_736delCTTT); (733_736delCTTT),p.(L245Pfs*27); (L245Pfs*27),Homo,T,...,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,individual_4_PMID_12112661
4,5,12112661,n.a.,n.a.,Yes,n.a.,c.(284_285insC); (1126C>T),p.(E97*); (R376C),Com het,T,...,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,n.a.,individual_5_PMID_12112661


In [106]:
ageMapper = AgeColumnMapper('AgeEncodingType.YEAR_AND_MONTH', 'NewAgeCol')
ageMapper.preview_column(df['NewAgeCol'])

sexMapper = SexColumnMapper(male_symbol='M', female_symbol='F', column_name='Gender', unknown_symbol='')
sexMapper.preview_column(df['Gender'])

individual_column_name = 'individual_id'


encoder = CohortEncoder(df=df, 
                        hpo_cr=hpo_cr, 
                        column_mapper_d=column_mapper_d, 
                        individual_column_name=individual_column_name,
                        agemapper=ageMapper, 
                        sexmapper=sexMapper,
                        metadata=metadata,
                        pmid=PMID)

encoder.set_disease(disease_id='OMIM:272300', label='Sulfite oxidase deficiency')



KeyError: 'NewAgeCol'

In [108]:
df_clinical['Age at onset (months)']
ageMapper = AgeColumnMapper.

0        0
1        5
2        0
3     n.a.
4     n.a.
5     n.a.
6     n.a.
7     n.a.
8     n.a.
9     n.a.
10    n.a.
11    n.a.
12    n.a.
13       5
14       0
15     0.8
16     0.1
17      12
18      12
19       0
20     0.5
21     1.3
22     0.7
23       0
24      12
25      16
26      14
27       0
28     0.2
29       9
30      23
31     0.5
32       0
33     0.1
34    n.a.
Name: Age at onset (months), dtype: object