In [1]:
import requests
import pandas as pd
from pprint import pprint

In [18]:
# get gene sets from github
base_url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/"
FA_all_genes = "master/FA_gene_sets/FA_4_all_genes.txt"
fa_genes = pd.read_csv(base_url + FA_all_genes, sep='\t')

In [19]:
fa_genes

Unnamed: 0,NCBIGene:2175,FANCA
0,NCBIGene:2187,FANCB
1,NCBIGene:2176,FANCC
2,NCBIGene:2178,FANCE
3,NCBIGene:2188,FANCF
4,NCBIGene:2189,FANCG
5,NCBIGene:55120,FANCL
6,NCBIGene:57697,FANCM
7,NCBIGene:2177,FANCD2
8,NCBIGene:55215,FANCI
9,NCBIGene:29089,UBE2T


In [8]:
#  return gene disease association data from biolink
def query_biolink_gene_disease(gene_curie):
    bl_url = 'https://api.monarchinitiative.org/api/bioentity/gene/{}/diseases/'
    params = {
        'fetch_objects': True,   
    }
    r = requests.get(url=bl_url.format(gene_curie), params=params)
    return r.json()

In [9]:
term_map_reversed = {
    "GENO:0000840": "pathogenic",
    "GENO:0000841": "likely pathogenic",
    "GENO:0000843": "benign",
    "GENO:0000844": "likely benign",
    "GENO:0000845": "uncertain significance"    
}

In [22]:
"""
Look for pathogenic variants and the disease 
they are implicated with via biolink
"""
result_set = []
for index, row in fa_genes.iterrows():
    bl_dat = query_biolink_gene_disease(row[0])
    for assoc in bl_dat['associations']:
        edges = assoc['evidence_graph']['edges']
        nodes = assoc['evidence_graph']['nodes']
        node_map = dict()
        for node in nodes:
            node_map[node['id']] = node['lbl']
        for edge in edges:
            if edge['pred'] in term_map_reversed.keys():
                pd_row = [row[1], row[0], node_map[edge['sub']], 
                       edge['sub'], term_map_reversed[edge['pred']], 
                       edge['pred'], node_map[edge['obj']], edge['obj']]
                result_set.append(pd_row)
    column_names = ['gene_name', 'gene_curie', 'variant_name', 'variant_curie', 
                    'relation_label', 'relation_curie', 'disease_label', 'disease_curie']
result_frame = pd.DataFrame(data=result_set, columns=column_names)
result_frame

Unnamed: 0,gene_name,gene_curie,variant_name,variant_curie,relation_label,relation_curie,disease_label,disease_curie
0,FANCB,NCBIGene:2187,"FANCB, 1-BP DEL, 1650T",ClinVarVariant:10868,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
1,FANCB,NCBIGene:2187,"FANCB, 1-BP INS, 1838T",ClinVarVariant:10866,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
2,FANCB,NCBIGene:2187,"FANCB, LEU717TER",ClinVarVariant:37043,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
3,FANCB,NCBIGene:2187,"FANCB, 3314-BP DEL",ClinVarVariant:10867,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
4,FANCB,NCBIGene:2187,"FANCB, IVS7DS, G-A, +5",ClinVarVariant:10870,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
5,FANCB,NCBIGene:2187,"FANCB, 2-BP DEL, 1857AG",ClinVarVariant:37044,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
6,FANCB,NCBIGene:2187,"FANCB, 1-BP INS, 811T",ClinVarVariant:10869,pathogenic,GENO:0000840,"Fanconi Anemia, Complementation Group B",OMIM:300514
7,FANCC,NCBIGene:2176,NM_000136.2(FANCC):c.595dupC (p.Leu199Profs),ClinVarVariant:221622,likely pathogenic,GENO:0000841,Carcinoma of colon,OMIM:114500
8,FANCC,NCBIGene:2176,NM_000136.2(FANCC):c.67delG (p.Asp23Ilefs),ClinVarVariant:12049,pathogenic,GENO:0000840,"Fanconi anemia, complementation group C",OMIM:227645
9,FANCC,NCBIGene:2176,NM_000136.2(FANCC):c.1599G>A (p.Trp533Ter),ClinVarVariant:370395,likely pathogenic,GENO:0000841,"Fanconi anemia, complementation group C",OMIM:227645


In [12]:
from SPARQLWrapper import SPARQLWrapper, JSON

def execute_query(query):
    endpoint = SPARQLWrapper('https://query.wikidata.org/sparql')
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    return endpoint.query().convert()

def var_query(entrez):
    """
    query wikidata by entrez id for variant that is 'positive diagnostic predictor of a disease'
    """    
    query = """
    SELECT distinct ?gene ?geneLabel ?variant ?variantLabel ?disease ?diseaseLabel
     WHERE {
      ?gene wdt:P351 '%s'. 
      OPTIONAL {?variant wdt:P3433 ?gene.}        # variant of gene
      OPTIONAL {?variant wdt:P3433 ?gene;
                         wdt:P3356 ?disease.}    # variant is a positive diagnostic predictor of disease
       SERVICE wikibase:label {
            bd:serviceParam wikibase:language "en" .
      }
    }
    """ % (entrez)
    r = execute_query(query)
    return r['results']['bindings']

def keycheck(ckey, cdict):
    if ckey in cdict.keys():
        return cdict[ckey]['value']
    else:
        return None

In [23]:
"""
Look for variants of FA genes in wikidata that are 'positive diagnostic predictors' for a disease
"""
wd_columns = ['gene', 'geneLabel', 'variant', 'variantLabel', 'disease', "diseaseLabel"]
wd_result_frame = pd.DataFrame(columns=wd_columns)

    
for index, row in fa_genes.iterrows():
    entrez_id = row[0].split(":")[-1]
    wd_hits = var_query(entrez_id)
    for hit in wd_hits:
        result = dict()
        result['gene'] = keycheck('gene', hit)
        result['geneLabel'] = keycheck('geneLabel', hit)
        result['variant'] = keycheck('variant', hit)
        result['variantLabel'] = keycheck('variantLabel', hit)
        result['disease'] = keycheck('disease', hit)
        result['diseaseLabel'] = keycheck('diseaseLabel', hit)
        wd_result_frame = wd_result_frame.append(result, ignore_index=True)
wd_result_frame
                

Unnamed: 0,gene,geneLabel,variant,variantLabel,disease,diseaseLabel
0,http://www.wikidata.org/entity/Q17927471,FANCB,,,,
1,http://www.wikidata.org/entity/Q18250517,FANCC,http://www.wikidata.org/entity/Q28445146,FANCC LOSS-OF-FUNCTION,,
2,http://www.wikidata.org/entity/Q17927077,FANCE,,,,
3,http://www.wikidata.org/entity/Q17927502,FANCF,,,,
4,http://www.wikidata.org/entity/Q17927524,FANCG,,,,
5,http://www.wikidata.org/entity/Q18041564,FANCL,,,,
6,http://www.wikidata.org/entity/Q18044458,FANCM,,,,
7,http://www.wikidata.org/entity/Q17927069,FANCD2,,,,
8,http://www.wikidata.org/entity/Q18041663,FANCI,,,,
9,http://www.wikidata.org/entity/Q18039587,UBE2T,,,,
