In [1]:
## ClinVar, PharmGKB, ChEBI

## Load FA genes

In [32]:
# Imported here because this is the first cell that needs pandas; on a fresh
# kernel ("Restart & Run All") no earlier cell has imported it yet.
import pandas as pd

# Tab-separated FA gene list, read without a header row.
url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_4_all_genes.txt"
# `DataFrame.from_csv` was deprecated and later removed from pandas; `read_csv`
# is its replacement. With header=None the columns are labelled 0, 1, ..., so
# label 1 is still the second column of the file, exactly what the old call
# selected (from_csv only moved column 0 into the index).
fa_genes = list(pd.read_csv(url, sep="\t", header=None)[1])
print(fa_genes)

['FANCA', 'FANCB', 'FANCC', 'FANCE', 'FANCF', 'FANCG', 'FANCL', 'FANCM', 'FANCD2', 'FANCI', 'UBE2T', 'BRCA2', 'BRIP1', 'PALB2', 'RAD51C', 'SLX4', 'ERCC4', 'RAD51', 'BRCA1', 'MAD2L2', 'XRCC2', 'RFWD3', 'FAAP100', 'FAAP24', 'FAAP20', 'CENPS', 'CENPX']


## Use ChEBI to get a list of all antineoplastic agents
### https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A35610

In [35]:
import requests
from tqdm import tqdm
from itertools import chain

In [3]:
# Fetch the OLS graph JSON for CHEBI_35610. The term IRI inside the URL path is
# percent-encoded twice (%253A, %252F), exactly as in the original request.
response = requests.get(
    "https://www.ebi.ac.uk/ols/api/ontologies/chebi/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FCHEBI_35610/graph",
    timeout=60,  # do not hang the notebook forever if the service is unresponsive
)
# Fail loudly on an HTTP error instead of trying to parse an error page as the graph.
response.raise_for_status()
d = response.json()

In [4]:
# Collect the source IRI of every edge labelled "has role" in the graph response.
# NOTE(review): the edge target is not checked -- confirm that every "has role"
# edge in this graph points at CHEBI_35610.
chebi_drugs = list(
    edge['source']
    for edge in d['edges']
    if edge['label'] == "has role"
)
print(chebi_drugs[:3])
print(len(chebi_drugs))

['http://purl.obolibrary.org/obo/CHEBI_73716', 'http://purl.obolibrary.org/obo/CHEBI_34154', 'http://purl.obolibrary.org/obo/CHEBI_73512']
198


In [5]:
def get_chebi_synonyms(iri, timeout=30):
    """Return the lowercased label and synonyms of a ChEBI term.

    The term is looked up through the EBI OLS ``terms`` endpoint and the first
    term in the response is used.

    Parameters
    ----------
    iri : str
        Full IRI of the term, e.g. ``http://purl.obolibrary.org/obo/CHEBI_73716``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    set of str
        The term label plus every string synonym, all lowercased. Only the
        label is returned when the term has no synonyms.

    Raises
    ------
    requests.HTTPError
        If OLS answers with an HTTP error status.
    KeyError, IndexError
        If the response does not contain a term with a label.
    """
    url = "https://www.ebi.ac.uk/ols/api/ontologies/chebi/terms"
    # Let requests build the query string so the IRI is percent-encoded
    # correctly (an IRI containing '#' or '&' would otherwise be truncated).
    response = requests.get(url, params={"iri": iri}, timeout=timeout)
    response.raise_for_status()
    term = response.json()['_embedded']['terms'][0]

    # 'synonyms' may be missing or null; handle that explicitly instead of
    # catching every exception, which would also hide unrelated failures.
    synonyms = term.get('synonyms') or []
    names = {term['label']}
    names.update(s for s in synonyms if isinstance(s, str))
    return {name.lower() for name in names}

In [6]:
# Map every ChEBI IRI to the name set returned by get_chebi_synonyms
# (one call per IRI; tqdm renders the progress bar).
chebi_names = dict(
    (iri, get_chebi_synonyms(iri)) for iri in tqdm(chebi_drugs)
)

100%|██████████| 198/198 [03:02<00:00,  1.19it/s]


In [43]:
# Pool the name sets of all drugs into a single de-duplicated set.
chemo_drugs = set(chain.from_iterable(chebi_names.values()))
print(list(chemo_drugs)[:10])
print(len(chemo_drugs))

['9h-purine-2,6-diamine', 'pierreione b', '(sp-4-2)-diamminedichloroplatinum', '(3r,4e,15z,26e,28r)-triaconta-4,15,26-triene-1,29-diyne-3,28-diol', '(1s,3s)-3,5,12-trihydroxy-3-(hydroxyacetyl)-10-methoxy-6,11-dioxo-1,2,3,4,6,11-hexahydrotetracen-1-yl 3-amino-2,3,6-trideoxy-alpha-l-arabino-hexopyranoside', '8-[(2-chloro-3,4,5-trimethoxyphenyl)methyl]-2-fluoro-9-(pent-4-yn-1-yl)-9h-purin-6-amine', 'daidzein', 'ttt-3002', '(4r,7r,10s,13s,15e,17r,19s)-7-[(2,6-dibromo-1h-indol-3-yl)methyl]-4-(4-hydroxyphenyl)-8,10,13,15,17,19-hexamethyl-1-oxa-5,8,11-triazacyclononadec-15-ene-2,6,9,12-tetrone', 'azelaic acid']
392


## PharmGKB
Contains clinical variant data.  
The full information (details, references) cannot be downloaded without a licensing agreement.  

https://www.pharmgkb.org/downloads 
https://s3.pgkb.org/data/clinicalVariants.zip

In [44]:
import pandas as pd
# Read the tab-separated PharmGKB clinical-variant table from the working
# directory and add a `url` column: the PharmGKB rsid URL prefix followed by
# the row's `variant` value.
ph = (
    pd.read_csv("clinicalVariants.tsv", sep="\t")
    .assign(
        url=lambda frame: frame["variant"].map(
            lambda rsid: "https://www.pharmgkb.org/rsid/" + rsid
        )
    )
)
ph.head()

Unnamed: 0,variant,gene,type,level of evidence,chemicals,diseases,url
0,rs121908755,CFTR,Efficacy,1A,ivacaftor,Cystic Fibrosis,https://www.pharmgkb.org/rsid/rs121908755
1,rs80282562,CFTR,Efficacy,1A,ivacaftor,Cystic Fibrosis,https://www.pharmgkb.org/rsid/rs80282562
2,rs121908757,CFTR,Efficacy,1A,ivacaftor,Cystic Fibrosis,https://www.pharmgkb.org/rsid/rs121908757
3,rs121909005,CFTR,Efficacy,1A,ivacaftor,Cystic Fibrosis,https://www.pharmgkb.org/rsid/rs121909005
4,rs121909013,CFTR,Efficacy,1A,ivacaftor,Cystic Fibrosis,https://www.pharmgkb.org/rsid/rs121909013


In [45]:
# Keep only the PharmGKB rows whose gene is one of the FA genes.
ph.loc[ph["gene"].isin(fa_genes)]

Unnamed: 0,variant,gene,type,level of evidence,chemicals,diseases,url
1348,rs1799801,ERCC4,Efficacy,3,Platinum compounds,"Carcinoma, Non-Small-Cell Lung",https://www.pharmgkb.org/rsid/rs1799801


In [46]:
# https://www.pharmgkb.org/variant/PA166155016/clinicalAnnotation/1444666608

## Clinvar

In [49]:
# The ClinVar download is XML, which I don't want to parse. Someone has already converted it to TSV here:
# https://github.com/macarthur-lab/clinvar/blob/master/output/b38/single/clinvar_alleles.single.b38.tsv.gz

In [190]:
# Display settings for the wide ClinVar table: no limit on the number of
# columns shown, and up to 999 characters per cell before truncation.
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', 999)

In [191]:
# Load the pre-parsed, tab-separated ClinVar allele table from the working
# directory. low_memory=False turns off pandas' chunked parsing, which can
# otherwise infer mixed dtypes within a column.
clinvar = pd.read_csv(
    "clinvar_alleles.single.b38.tsv",
    delimiter='\t',
    low_memory=False,
)
print(clinvar.shape[0])

293882


In [192]:
# Example lookup: select one variant by its HGVS cDNA name and show its
# clinical_significance value (original note: the trait is not structured at all).
clinvar.loc[clinvar["hgvs_c"] == "NM_004628.4:c.2815C>A", "clinical_significance"]

55373    drug response
Name: clinical_significance, dtype: object

In [193]:
# Display the first row to see what a single record of the table looks like.
clinvar.head(1)

Unnamed: 0,chrom,pos,ref,alt,start,stop,strand,variation_type,variation_id,rcv,scv,allele_id,symbol,hgvs_c,hgvs_p,molecular_consequence,clinical_significance,clinical_significance_ordered,pathogenic,likely_pathogenic,uncertain_significance,likely_benign,benign,review_status,review_status_ordered,last_evaluated,all_submitters,submitters_ordered,all_traits,all_pmids,inheritance_modes,age_of_onset,prevalence,disease_mechanism,origin,xrefs,dates_ordered,gold_stars,conflicted
0,1,1014143,C,T,1014143,1014143,+,Variant,183381,RCV000162196,SCV000212156,181485,ISG15,NM_005101.3:c.163C>T,NP_005092.1:p.Gln55Ter,NM_005101.3:c.163C>T:nonsense,Pathogenic,pathogenic,1,0,0,0,0,no assertion criteria provided,no assertion criteria provided,"Jan 01, 2015",OMIM,OMIM,Immunodeficiency 38 with basal ganglia calcification;IMMUNODEFICIENCY 38 WITH BASAL GANGLIA CALCIFICATION,25307056,,Childhood,<1 / 1 000 000,,germline,MedGen:C4015293;OMIM:616126;Orphanet:319563,2015-01-01,0,0


In [194]:
# Display the column labels of the table.
clinvar.columns

Index(['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand',
       'variation_type', 'variation_id', 'rcv', 'scv', 'allele_id', 'symbol',
       'hgvs_c', 'hgvs_p', 'molecular_consequence', 'clinical_significance',
       'clinical_significance_ordered', 'pathogenic', 'likely_pathogenic',
       'uncertain_significance', 'likely_benign', 'benign', 'review_status',
       'review_status_ordered', 'last_evaluated', 'all_submitters',
       'submitters_ordered', 'all_traits', 'all_pmids', 'inheritance_modes',
       'age_of_onset', 'prevalence', 'disease_mechanism', 'origin', 'xrefs',
       'dates_ordered', 'gold_stars', 'conflicted'],
      dtype='object')

In [197]:
# Keep only the rows whose clinical_significance mentions "drug"
# (case-insensitive); rows with a missing value are dropped.
# NOTE: this rebinds `clinvar`, so the unfiltered table has to be re-read
# from disk to get it back.
mentions_drug = (
    clinvar["clinical_significance"]
    .str.lower()
    .str.contains("drug", regex=False, na=False)
)
clinvar = clinvar[mentions_drug]
print(len(clinvar))
clinvar.head(2)

332


Unnamed: 0,chrom,pos,ref,alt,start,stop,strand,variation_type,variation_id,rcv,scv,allele_id,symbol,hgvs_c,hgvs_p,molecular_consequence,clinical_significance,clinical_significance_ordered,pathogenic,likely_pathogenic,uncertain_significance,likely_benign,benign,review_status,review_status_ordered,last_evaluated,all_submitters,submitters_ordered,all_traits,all_pmids,inheritance_modes,age_of_onset,prevalence,disease_mechanism,origin,xrefs,dates_ordered,gold_stars,conflicted
1772,1,11794419,T,G,11794419,11794419,-,Variant,3521,RCV000003698;RCV000003699;RCV000144922;RCV000153515;RCV000211350;RCV000350590;RCV000430863,SCV000023861;SCV000023862;SCV000187679;SCV000203039;SCV000268235;SCV000347797;SCV000519507,18560,MTHFR,NM_005957.4:c.1286A>C,NP_005948.3:p.Glu429Ala,NM_005957.4:c.1286A>C:missense variant,drug response,benign;risk factor;uncertain significance;other;drug response;likely benign,0,0,1,1,2,reviewed by expert panel,"no assertion criteria provided;criteria provided, single submitter;reviewed by expert panel","Jun 14, 2016","OMIM;Department of Pharmacy and Biotechnology,University of Bologna;EGL Genetic Diagnostics,Eurofins Clinical Diagnostics;PharmGKB;Illumina Clinical Services Laboratory,Illumina;GeneDx","OMIM;Department of Pharmacy and Biotechnology,University of Bologna;EGL Genetic Diagnostics,Eurofins Clinical Diagnostics;PharmGKB;Illumina Clinical Services Laboratory,Illumina;GeneDx","MTHFR deficiency, thermolabile type;MTHFR THERMOLABILE POLYMORPHISM;Schizophrenia, susceptibility to;Gastrointestinal stromal tumor;Gastrointestinal Stromal Tumors;not provided;fluorouracil, leucovorin, and oxaliplatin response - Efficacy;Colorectal Neoplasms;Neural tube defects, folate-sensitive;Neural Tube Defects, Folate-Sensitive;not specified",10677336;10958762;11590551;11742092;11752418;11938441;12560871;15103709;15951337;16244782;17898028;18583979;23288205;25227144;9545395;22685257;23852704;25394175;20078613;20385995;22992668,Autosomal dominant inheritance;Autosomal unknown,Adolescent;Infancy,1-5 / 10 000,,germline,Genetic Alliance:MTHFR+deficiency%2C+thermolabile+type/8909;Genetic Testing Registry (GTR):GTR000174335;Genetic Testing Registry (GTR):GTR000263075;Genetic Testing Registry (GTR):GTR000326434;Genetic Testing Registry (GTR):GTR000327733;Genetic Testing Registry (GTR):GTR000500311;Genetic Testing Registry (GTR):GTR000500678;Genetic Testing Registry (GTR):GTR000501123;Genetic Testing Registry (GTR):GTR000509249;Genetic Testing 
Registry (GTR):GTR000511144;Genetic Testing Registry (GTR):GTR000512222;Genetic Testing Registry (GTR):GTR000520021;Genetic Testing Registry (GTR):GTR000520078;Genetic Testing Registry (GTR):GTR000521336;Genetic Testing Registry (GTR):GTR000522333;Genetic Testing Registry (GTR):GTR000530207;Genetic Testing Registry (GTR):GTR000531268;Genetic Testing Registry (GTR):GTR000552460;Genetic Testing Registry (GTR):GTR000552466;MedGen:C1856059;Medical Genetics Summaries:NBK66131;OMIM:607093.0003;OMIM:607093.0004;Genetic Alliance:Gastrointestinal+Stromal+Tumors/2997;MeS...,2008-07-01;0000-00-00;2015-07-23;2016-06-14;2016-04-25,3,0
1805,1,11796321,G,A,11796321,11796321,-,Variant,3520,RCV000003697;RCV000144921;RCV000153516;RCV000211133;RCV000211336;RCV000259890;RCV000417131;RCV000427078;RCV000428048,SCV000023860;SCV000106043;SCV000187678;SCV000203040;SCV000268236;SCV000268238;SCV000347807;SCV000494694;SCV000505736;SCV000519504,18559,MTHFR,NM_005957.4:c.665C>T,NP_005948.3:p.Ala222Val,NM_005957.4:c.665C>T:missense variant,drug response,benign;pathogenic;uncertain significance;other;drug response;likely benign;not provided,1,0,1,1,2,reviewed by expert panel,"no assertion criteria provided;criteria provided, single submitter;reviewed by expert panel;no assertion provided","Feb 28, 2017","OMIM;FirmaLab;Department of Pharmacy and Biotechnology,University of Bologna;EGL Genetic Diagnostics,Eurofins Clinical Diagnostics;PharmGKB;Illumina Clinical Services Laboratory,Illumina;Database of Curated Mutations (DoCM);GeneDx","OMIM;FirmaLab;Department of Pharmacy and Biotechnology,University of Bologna;EGL Genetic Diagnostics,Eurofins Clinical Diagnostics;PharmGKB;Illumina Clinical Services Laboratory,Illumina;Database of Curated Mutations (DoCM);GeneDx","MTHFR deficiency, thermolabile type;MTHFR THERMOLABILE POLYMORPHISM;Gastrointestinal stromal tumor;Gastrointestinal Stromal Tumors;not provided;cyclophosphamide response - Toxicity/ADR;carboplatin response - Efficacy;Carcinoma, Non-Small-Cell Lung;Neural tube defects, folate-sensitive;Neural Tube Defects, Folate-Sensitive;methotrexate response - Dosage, Efficacy, Toxicity/ADR;Neoplasms;Neoplasm of stomach;not 
specified",10196703;10323741;10440833;10732818;10869114;10930360;11121176;11140843;11781870;11807890;11863127;11888585;11929966;12080391;12095808;12154064;12165282;12196644;12221667;12356947;12383688;12384649;12387655;12400059;12406076;12428084;12529699;12560871;12796225;15054400;15103709;15154859;15173232;1522835;15534175;15565101;15704130;15729744;15806605;15808177;16172608;16365871;16402130;16470725;16712703;16800002;17284634;17350979;17436239;17543893;17726486;17898028;18583979;20154341;21042222;23288205;25227144;7564788;7647779;7741859;8542260;8554053;8554066;8616944;8771990;8826441;8837319;8892013;8903338;8981967;8994411;9133512;9192280;9244205;9341863;9372726;9453374;9545406;9737770;9789068;9798595;9843036;9863598;22685257;23852704;25394175;15051775;19159907;20638924;22992668;19307503;21605004;11418485;12453860;12915598;14647408;15781665;16013960;16019535;16462575;16463153;16501586;16870553;17180579;17323057;17488658;17512587;18458567;18987660;19648163;21644011;21747412;22143415;22838...,Autosomal dominant inheritance;Autosomal unknown,Adolescent;Infancy,"1-5 / 10 000;Gastric cancer is the 4th most frequently diagnosed cancer and the 2nd leading cause of death from cancer, with an estimated 990000 new cases and 738000 deaths registered.",gain of function,germline;somatic,Genetic Alliance:MTHFR+deficiency%2C+thermolabile+type/8909;Genetic Testing Registry (GTR):GTR000174335;Genetic Testing Registry (GTR):GTR000263075;Genetic Testing Registry (GTR):GTR000326434;Genetic Testing Registry (GTR):GTR000327733;Genetic Testing Registry (GTR):GTR000500311;Genetic Testing Registry (GTR):GTR000500678;Genetic Testing Registry (GTR):GTR000501123;Genetic Testing Registry (GTR):GTR000509249;Genetic Testing Registry (GTR):GTR000511144;Genetic Testing Registry (GTR):GTR000512222;Genetic Testing Registry (GTR):GTR000520021;Genetic Testing Registry (GTR):GTR000520078;Genetic Testing Registry (GTR):GTR000521336;Genetic Testing Registry (GTR):GTR000522333;Genetic Testing Registry 
(GTR):GTR000530207;Genetic Testing Registry (GTR):GTR000531268;Genetic Testing Registry (GTR):GTR000552460;Genetic Testing Registry (GTR):GTR000552466;MedGen:C1856059;Medical Genetics Summaries:NBK66131;OMIM:607093.0003;OMIM:607093.0004;Genetic Alliance:Gastrointestinal+Stromal+Tumors/2997;MeS...,2017-02-28;0000-00-00;2015-05-12;2016-06-14;2016-03-10;2016-04-25,3,0


In [198]:
# Narrow `clinvar` to the rows whose gene symbol is one of the FA genes.
# NOTE: this rebinds `clinvar` again; the previous contents are no longer reachable.
in_fa_gene = clinvar["symbol"].isin(fa_genes)
clinvar = clinvar.loc[in_fa_gene]
print(len(clinvar))
clinvar.head(1)

0


Unnamed: 0,chrom,pos,ref,alt,start,stop,strand,variation_type,variation_id,rcv,scv,allele_id,symbol,hgvs_c,hgvs_p,molecular_consequence,clinical_significance,clinical_significance_ordered,pathogenic,likely_pathogenic,uncertain_significance,likely_benign,benign,review_status,review_status_ordered,last_evaluated,all_submitters,submitters_ordered,all_traits,all_pmids,inheritance_modes,age_of_onset,prevalence,disease_mechanism,origin,xrefs,dates_ordered,gold_stars,conflicted
