In [1]:
import requests
import pandas as pd

In [2]:
# Load the Fanconi anemia core-complex gene set: a headerless, tab-separated file
# whose second column (label 1) is read as the list of gene symbols.
url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_1_core_complex.txt"
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 reproduces its default of using the first column as the index.
core_genes = list(pd.read_csv(url, sep="\t", header=None, index_col=0)[1])
# Space-separated, double-quoted symbols: the form substituted into the SPARQL VALUES clause.
core_genes_quotes = '"' + '" "'.join(core_genes) + '"'
print(core_genes_quotes)
print(",".join(core_genes))

"FANCA" "FANCB" "FANCC" "FANCE" "FANCF" "FANCG" "FANCL" "FANCM" "FANCD2" "FANCI" "UBE2T"
FANCA,FANCB,FANCC,FANCE,FANCF,FANCG,FANCL,FANCM,FANCD2,FANCI,UBE2T


In [3]:
# CIViC lookup: request all core-complex genes in one call (comma-separated symbols,
# identifier_type=entrez_symbol) and show the variants recorded for each gene.
url = "https://civic.genome.wustl.edu/api/genes/{}?identifier_type=entrez_symbol".format(",".join(core_genes))
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError further down.
response.raise_for_status()
d = response.json()
[{x['name']:x['variants']} for x in d]

[{'FANCA': []},
 {'FANCC': [{'evidence_items': {'accepted_count': 1,
     'rejected_count': 0,
     'submitted_count': 0},
    'id': 534,
    'name': 'LOSS-OF-FUNCTION'}]},
 {'FANCD2': []},
 {'FANCE': []},
 {'FANCB': []},
 {'FANCF': []},
 {'FANCG': []},
 {'UBE2T': []},
 {'FANCL': []},
 {'FANCI': []},
 {'FANCM': []}]

In [None]:
## Only ONE variant (FANCC LOSS-OF-FUNCTION) is recorded in CIViC for these genes.
# (This serves as a sanity check on Wikidata, because the query below returns no results.)

In [7]:
# SPARQL template sent to the Wikidata query service.
# The literal marker **hgnc** inside the VALUES clause is a placeholder: later cells
# replace it (str.replace) with a space-separated list of double-quoted gene symbols.
# Each result row binds a gene matched by ?hgnc, a ?variant pointing at that gene,
# a P3355 statement on the variant whose value is ?drug, that statement's reference
# URL (?ref), the drug's ?cid, and a ?disease qualifier that must reach wd:Q12078
# through zero or more wdt:P279 hops.
# NOTE(review): property/item meanings (presumably P353 = HGNC gene symbol,
# P662 = PubChem CID, P279 = subclass of, Q12078 = cancer) are inferred from the
# variable names and the results shown — confirm against Wikidata.
query = """select ?geneLabel ?variantLabel ?variant ?drugLabel ?cid ?diseaseLabel ?ref where {
  values ?hgnc {**hgnc**}
  ?gene wdt:P353 ?hgnc .
  ?variant wdt:P3433 ?gene .
  ?variant p:P3355 ?s .
  ?s ps:P3355 ?drug .
  ?s prov:wasDerivedFrom/pr:P854 ?ref .
  ?drug wdt:P662 ?cid .
  ?s pq:P2175 ?disease .
  ?disease wdt:P279* wd:Q12078 .
  SERVICE wikibase:label {  bd:serviceParam wikibase:language "en" }
}
"""

In [8]:
# Fill the **hgnc** placeholder with the quoted core-complex symbols and run the
# query against the Wikidata SPARQL endpoint, asking for JSON results.
params = { 'query': query.replace("**hgnc**", core_genes_quotes), 'format': 'json'}
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get('https://query.wikidata.org/sparql', params=params, timeout=60)
# Surface HTTP errors (e.g. rate limiting) directly rather than as a JSON/KeyError.
response.raise_for_status()
results = response.json()['results']['bindings']
# Flatten each binding to {variable: value}, dropping the type/lang metadata.
[{k:v['value'] for k,v in item.items()} for item in results]

[]

In [9]:
### include effector proteins as well
# Append the effector-protein gene set to the core-complex genes and re-run the
# same SPARQL template over the combined list.
url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_2_effector_proteins.txt"
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 reproduces its default of using the first column as the index.
effector_genes = core_genes + list(pd.read_csv(url, sep="\t", header=None, index_col=0)[1])
effector_genes_quotes = '"' + '" "'.join(effector_genes) + '"'
params = { 'query': query.replace("**hgnc**", effector_genes_quotes), 'format': 'json'}
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get('https://query.wikidata.org/sparql', params=params, timeout=60)
# Surface HTTP errors (e.g. rate limiting) directly rather than as a JSON/KeyError.
response.raise_for_status()
results = response.json()['results']['bindings']
# Flatten each binding to {variable: value}, dropping the type/lang metadata.
[{k:v['value'] for k,v in item.items()} for item in results]

[{'cid': '23725625',
  'diseaseLabel': 'breast cancer',
  'drugLabel': 'olaparib',
  'geneLabel': 'BRCA1',
  'ref': 'https://civic.genome.wustl.edu/links/evidence/1775',
  'variant': 'http://www.wikidata.org/entity/Q28444960',
  'variantLabel': 'BRCA1 MUTATION'},
 {'cid': '23725625',
  'diseaseLabel': 'breast cancer',
  'drugLabel': 'olaparib',
  'geneLabel': 'BRCA2',
  'ref': 'https://civic.genome.wustl.edu/links/evidence/1776',
  'variant': 'http://www.wikidata.org/entity/Q28444961',
  'variantLabel': 'BRCA2 MUTATION'}]