In [63]:
import requests
import pandas as pd
# Display settings: show every column, and allow up to 999 characters per cell
# before pandas truncates the text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 999)

In [2]:
# Load the FA gene set from GitHub. The file is tab-separated and is read with
# explicit column names: an NCBI gene CURIE and a gene symbol per row.
url = (
    "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/"
    "master/FA_gene_sets/FA_4_all_genes.txt"
)
fa_genes = pd.read_csv(url, names=['ncbi', 'hgnc'], delimiter='\t')
fa_genes.head()

Unnamed: 0,ncbi,hgnc
0,NCBIGene:2175,FANCA
1,NCBIGene:2187,FANCB
2,NCBIGene:2176,FANCC
3,NCBIGene:2178,FANCE
4,NCBIGene:2188,FANCF


In [21]:
# Ask CIViC for all FA genes in a single request (comma-separated gene symbols),
# then count the variants returned for each gene.
symbols = ",".join(fa_genes.hgnc)
url = "https://civic.genome.wustl.edu/api/genes/{}?identifier_type=entrez_symbol".format(symbols)
d = requests.get(url).json()
gene_count = dict((gene['name'], len(gene['variants'])) for gene in d)
print(sum(gene_count.values()))  # total number of variants across all genes
gene_count

33


{'BRCA1': 18,
 'BRCA2': 11,
 'BRIP1': 0,
 'CENPS': 0,
 'CENPX': 0,
 'ERCC4': 0,
 'FAAP100': 0,
 'FAAP20': 0,
 'FAAP24': 0,
 'FANCA': 1,
 'FANCB': 0,
 'FANCC': 1,
 'FANCD2': 0,
 'FANCE': 0,
 'FANCF': 0,
 'FANCG': 0,
 'FANCI': 0,
 'FANCL': 0,
 'FANCM': 0,
 'MAD2L2': 0,
 'PALB2': 2,
 'RAD51': 0,
 'RAD51C': 0,
 'RFWD3': 0,
 'SLX4': 0,
 'UBE2T': 0,
 'XRCC2': 0}

In [4]:
## Query Wikidata for the details (note: Wikidata only contains the "high quality" results)

In [5]:
def query_and_format_result(query):
    """Run a SPARQL query against the Wikidata endpoint and tabulate the result.

    Parameters
    ----------
    query : str
        SPARQL query text, sent as the ``query`` URL parameter.

    Returns
    -------
    pandas.DataFrame
        One row per result binding and one column per bound variable, holding
        each binding's ``value`` field. A variable that is absent from a row's
        binding (e.g. one coming from an OPTIONAL clause) is a missing value.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    """
    params = {'query': query, 'format': 'json'}
    response = requests.get('https://query.wikidata.org/sparql', params=params)
    # Fail loudly on HTTP errors instead of with an opaque JSON-decode error or
    # KeyError raised while parsing the error body.
    response.raise_for_status()
    bindings = response.json()['results']['bindings']
    return pd.DataFrame([{k: v['value'] for k, v in item.items()} for item in bindings])

In [7]:
# Gene symbols, each wrapped in double quotes and separated by single spaces.
all_genes_quotes = '"{}"'.format('" "'.join(fa_genes.hgnc))

In [43]:
# For variants of the listed genes, fetch the drugs and diseases for which the
# variant is recorded as a positive or negative therapeutic predictor.
# The **hgnc** placeholder is replaced with the quoted gene symbols below.
query_template = """select ?geneLabel ?variantLabel ?variant ?drugLabel ?cid ?civic ?diseaseLabel where {
  values ?hgnc {**hgnc**}
  ?gene wdt:P353 ?hgnc .  # get gene item from hgnc id
  ?variant wdt:P3433 ?gene . # get variants on the gene
  ?variant wdt:P3329 ?civic . # get civic id
  ?variant p:P3354|p:P3355 ?s . # variant is a "positive therapeutic predictor" or "negative therapeutic predictor"
  ?s ps:P3354|ps:P3355 ?drug .  # get the drug associated
  OPTIONAL {?drug wdt:P662 ?cid} # get the pubchem ID for the drug
  ?s pq:P2175 ?disease .  # get the disease associated
  SERVICE wikibase:label {  bd:serviceParam wikibase:language "en" }
}
"""
query = query_template.replace("**hgnc**", all_genes_quotes)
print(query)

select ?geneLabel ?variantLabel ?variant ?drugLabel ?cid ?civic ?diseaseLabel where {
  values ?hgnc {"FANCA" "FANCB" "FANCC" "FANCE" "FANCF" "FANCG" "FANCL" "FANCM" "FANCD2" "FANCI" "UBE2T" "BRCA2" "BRIP1" "PALB2" "RAD51C" "SLX4" "ERCC4" "RAD51" "BRCA1" "MAD2L2" "XRCC2" "RFWD3" "FAAP100" "FAAP24" "FAAP20" "CENPS" "CENPX"}
  ?gene wdt:P353 ?hgnc .  # get gene item from hgnc id
  ?variant wdt:P3433 ?gene . # get variants on the gene
  ?variant wdt:P3329 ?civic . # get civic id
  ?variant p:P3354|p:P3355 ?s . # variant is a "positive therapeutic predictor" or "negative therapeutic predictor"
  ?s ps:P3354|ps:P3355 ?drug .  # get the drug associated
  OPTIONAL {?drug wdt:P662 ?cid} # get the pubchem ID for the drug
  ?s pq:P2175 ?disease .  # get the disease associated
  SERVICE wikibase:label {  bd:serviceParam wikibase:language "en" }
}



In [44]:
# Run the SPARQL query and display the resulting table.
wikidata_predictors = query_and_format_result(query)
wikidata_predictors

Unnamed: 0,cid,civic,diseaseLabel,drugLabel,geneLabel,variant,variantLabel
0,5702198.0,534,pancreatic cancer,cisplatin,FANCC,http://www.wikidata.org/entity/Q28445146,FANCC LOSS-OF-FUNCTION
1,60750.0,534,pancreatic cancer,gemcitabine,FANCC,http://www.wikidata.org/entity/Q28445146,FANCC LOSS-OF-FUNCTION
2,2708.0,534,pancreatic cancer,chlorambucil,FANCC,http://www.wikidata.org/entity/Q28445146,FANCC LOSS-OF-FUNCTION
3,73346703.0,397,mesothelioma,Vinorelbine,BRCA1,http://www.wikidata.org/entity/Q28445085,BRCA1 EXPRESSION
4,460612.0,534,pancreatic cancer,melphalan,FANCC,http://www.wikidata.org/entity/Q28445146,FANCC LOSS-OF-FUNCTION
5,23725625.0,532,prostate cancer,olaparib,PALB2,http://www.wikidata.org/entity/Q28371488,PALB2 BIALLELIC INACTIVATION
6,23725625.0,131,Her2-receptor negative breast cancer,olaparib,BRCA1,http://www.wikidata.org/entity/Q28444937,BRCA1 LOSS-OF-FUNCTION
7,23725625.0,131,prostate cancer,olaparib,BRCA1,http://www.wikidata.org/entity/Q28444937,BRCA1 LOSS-OF-FUNCTION
8,23725625.0,131,ovarian cancer,olaparib,BRCA1,http://www.wikidata.org/entity/Q28444937,BRCA1 LOSS-OF-FUNCTION
9,23725625.0,132,ovarian cancer,olaparib,BRCA2,http://www.wikidata.org/entity/Q28444938,BRCA2 LOSS-OF-FUNCTION


In [10]:
## Get all results from CIViC directly

In [18]:
# Collect the distinct variant IDs found under any of the FA genes.
from itertools import chain
variant_ids = set(chain.from_iterable(
    (variant['id'] for variant in gene['variants']) for gene in d
))
# should be 33 of them
len(variant_ids)

33

In [26]:
# Fetch the full CIViC record of every variant, keyed by variant ID.
url = "https://civicdb.org/api/variants/{}"
variant_evs = dict()
for variant_id in variant_ids:
    # Use a dedicated name for the response: `d` already holds the gene-level
    # CIViC result that `variant_ids` is derived from, and rebinding it here
    # makes that earlier cell fail when it is re-run.
    response = requests.get(url.format(variant_id))
    response.raise_for_status()  # surface HTTP errors instead of storing an error body
    variant_evs[variant_id] = response.json()

In [57]:
# Flatten the "Predictive" evidence items of every variant into flat records,
# one dict per evidence item, annotated with the variant's gene and a CIViC link.
all_r = []
for variant_id, evs in variant_evs.items():
    predictive_items = [
        item for item in evs['evidence_items']
        if item['evidence_type'] == 'Predictive'
    ]
    for item in predictive_items:
        drug_names = [drug['name'] for drug in item['drugs']]
        variant_url = "https://civic.genome.wustl.edu/links/variants/" + str(item['variant_id'])
        all_r.append({
            'disease': item['disease']['display_name'],
            'drugs': ";".join(drug_names),  # several drugs collapse into one ';'-separated cell
            'evidence_direction': item['evidence_direction'],
            'evidence_type': item['evidence_type'],
            'id': item['id'],
            'name': item['name'],
            'variant_id': item['variant_id'],
            'evidence_level': item['evidence_level'],
            'rating': item['rating'],
            'status': item['status'],
            # gene identifiers come from the enclosing variant record
            'entrez_id': evs['entrez_id'],
            'entrez_name': evs['entrez_name'],
            'url': variant_url,
        })

In [64]:
# Tabulate the evidence records and show them ordered by evidence level.
df = pd.DataFrame(all_r)
df.sort_values(by="evidence_level")

Unnamed: 0,disease,drugs,entrez_id,entrez_name,evidence_direction,evidence_level,evidence_type,id,name,rating,status,url,variant_id
49,Cancer,Olaparib,675,BRCA2,Supports,A,Predictive,1371,EID1371,4.0,accepted,https://civic.genome.wustl.edu/links/variants/186,186
26,Cancer,Olaparib,672,BRCA1,Supports,A,Predictive,1370,EID1370,4.0,accepted,https://civic.genome.wustl.edu/links/variants/185,185
59,Pancreatic Cancer,Veliparib;Gemcitabine;Cisplatin,675,BRCA2,Supports,B,Predictive,5933,EID5933,3.0,submitted,https://civic.genome.wustl.edu/links/variants/186,186
28,Ovarian Carcinoma,PLATINUM,672,BRCA1,Supports,B,Predictive,1531,EID1531,3.0,accepted,https://civic.genome.wustl.edu/links/variants/185,185
29,Ovarian Cancer,Olaparib;Cediranib,672,BRCA1,Supports,B,Predictive,1677,EID1677,3.0,accepted,https://civic.genome.wustl.edu/links/variants/185,185
30,Triple-receptor Negative Breast Cancer,Cisplatin;Carboplatin,672,BRCA1,Supports,B,Predictive,1684,EID1684,3.0,accepted,https://civic.genome.wustl.edu/links/variants/185,185
32,Triple-receptor Negative Breast Cancer,Olaparib,672,BRCA1,Supports,B,Predictive,1775,EID1775,3.0,accepted,https://civic.genome.wustl.edu/links/variants/185,185
33,Ovarian Cancer,Rucaparib,672,BRCA1,Supports,B,Predictive,1897,EID1897,4.0,accepted,https://civic.genome.wustl.edu/links/variants/185,185
34,Breast Cancer,Olaparib,672,BRCA1,Supports,B,Predictive,5830,EID5830,4.0,submitted,https://civic.genome.wustl.edu/links/variants/185,185
35,Pancreatic Cancer,Olaparib,672,BRCA1,Supports,B,Predictive,5914,EID5914,4.0,submitted,https://civic.genome.wustl.edu/links/variants/185,185


In [65]:
# Number of predictive evidence records per gene.
df["entrez_name"].value_counts()

BRCA1    33
BRCA2    27
PALB2     2
FANCC     1
FANCA     1
Name: entrez_name, dtype: int64

In [66]:
# Set the notebook's .container element to 100% width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated (IPython >= 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [67]:
# Show only the evidence records for PALB2, FANCC and FANCA.
selected_genes = df["entrez_name"].isin(["PALB2", "FANCC", "FANCA"])
df[selected_genes]

Unnamed: 0,disease,drugs,entrez_id,entrez_name,evidence_direction,evidence_level,evidence_type,id,name,rating,status,url,variant_id
13,Pancreatic Cancer,Gemcitabine;Melphalan;Chlorambucil;Cisplatin;Mitomycin C,2176,FANCC,Supports,D,Predictive,1307,EID1307,3.0,accepted,https://civic.genome.wustl.edu/links/variants/534,534
47,Prostate Cancer,Olaparib,79728,PALB2,Supports,C,Predictive,1963,EID1963,3.0,accepted,https://civic.genome.wustl.edu/links/variants/532,532
48,Pancreatic Cancer,Mitomycin C,79728,PALB2,Supports,C,Predictive,1305,EID1305,4.0,accepted,https://civic.genome.wustl.edu/links/variants/532,532
63,Prostate Cancer,Cisplatin,2175,FANCA,Supports,C,Predictive,5813,EID5813,4.0,submitted,https://civic.genome.wustl.edu/links/variants/2165,2165
