## Given a compound retrieve a tissue expression profile

* August 20, 2019
* Workshop - FederatedPhacts 
* Paul Groth, Egon Willighagen, Alasdair Gray, Nick Juty, Alan Gateau, Chris Evelo


* What we're doing:
   * experiment with using grlc encapsulated sparql queries in combination with normal sparql queries
   * merging wikidata, chembl, nextprot data
   * mapping done both externally and internally
   * fun with using wikidata to do autosuggest search

In [190]:
from IPython.display import display
from SPARQLWrapper import SPARQLWrapper, JSON
import wikipedia
import wikidata
import requests
import pandas as pd
from ipywidgets import interact, widgets
from IPython.display import display
import itertools


### Wikidata search functions

In [155]:
def wikidata_search(search_str):
    wd_search_param = {"action":"wbsearchentities", "format": "json", "limit":"1","language":"en","search":search_str}
    wd_search_url = "https://www.wikidata.org/w/api.php"
    r = requests.get(wd_search_url, params = wd_search_param)
    return r.json()
    

In [156]:

search_results = widgets.Label(value="")

text = widgets.Text(
    value='last',
    placeholder='Search',
    description='String:',
    disabled=False,
    continuous_update=True
)


def callback(update):
    search_str = update["new"]
    displaystring = ""
    r = wikidata_search(search_str)
    if "search" in r:
        for i in r["search"]:
            displaystring = displaystring + i["id"] + " | " + i["match"]["text"]
            if "description" in i:
                displaystring = displaystring + " | " + i["description"]
    search_results.value = displaystring
    

def finalize(input):
    finalresult = search_results.value
    

text.observe(callback, 'value')
text.on_submit(finalize)


___Run the next two cells to see the search box and the search results. What you see in the search results will be input to the subsequent cells___

In [157]:
display(text)

Text(value='last', description='String:', placeholder='Search')

In [158]:
display(search_results)

Label(value='')

In [159]:
wd_query_id = search_results.value.split(" | ")[0]

In [160]:
wd_query_id

'Q177094'

### Map a Wikidata chemical id to CHEBL


In [161]:
def wikidataid_to_chembl_id(wikidataid):
    wdtochem_params = {"wdID" : wikidataid}
    headers = {"accept" : "application/json"}
    wdtochem_param_url = "http://grlc.io/api/openphacts/FederatedPhacts/wikidata-chemical-id-mapping"
    r = requests.get(wdtochem_param_url, params = wdtochem_params, headers=headers)
    return r.json()


In [162]:
chemblid = wikidataid_to_chembl_id("http://www.wikidata.org/entity/Q421136")["results"]["bindings"][0]["chemblIRI"]["value"]

In [163]:
chemblid

'http://rdf.ebi.ac.uk/resource/chembl/molecule/CHEMBL1336'

### Get a set of uniprot protein targets related to the particular chemical through an assay

In [164]:
targetquery = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX dbpedia2: <http://dbpedia.org/property/>
PREFIX dbpedia: <http://dbpedia.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

PREFIX cco: <http://rdf.ebi.ac.uk/terms/chembl#>
PREFIX chembl_molecule: <http://rdf.ebi.ac.uk/resource/chembl/molecule/>
SELECT ?assay ?target ?uniprot
WHERE {{
    ?activity a cco:Activity ;
    cco:hasMolecule <{chemical}> ;
    cco:hasAssay ?assay .
    ?assay cco:hasTarget ?target .
    ?target cco:hasTargetComponent ?targetcmpt .
    ?targetcmpt cco:targetCmptXref ?uniprot .
    ?uniprot a cco:UniprotRef .
}}
""".format(chemical=chemblid)

In [174]:
sparql_chembl = SPARQLWrapper("https://www.ebi.ac.uk/rdf/services/sparql")
sparql_chembl.setQuery(targetquery)
sparql_chembl.setReturnFormat(JSON)
results = sparql_chembl.query().convert()

target_list = []
for r in results['results']['bindings']:
    target_list.append(r["uniprot"]['value'])

In [175]:
target_list = set(target_list)

In [176]:
display(target_list)

{'http://purl.uniprot.org/uniprot/O00141',
 'http://purl.uniprot.org/uniprot/O00238',
 'http://purl.uniprot.org/uniprot/O00255',
 'http://purl.uniprot.org/uniprot/O00311',
 'http://purl.uniprot.org/uniprot/O00329',
 'http://purl.uniprot.org/uniprot/O00418',
 'http://purl.uniprot.org/uniprot/O00444',
 'http://purl.uniprot.org/uniprot/O00506',
 'http://purl.uniprot.org/uniprot/O00750',
 'http://purl.uniprot.org/uniprot/O14578',
 'http://purl.uniprot.org/uniprot/O14730',
 'http://purl.uniprot.org/uniprot/O14733',
 'http://purl.uniprot.org/uniprot/O14757',
 'http://purl.uniprot.org/uniprot/O14920',
 'http://purl.uniprot.org/uniprot/O14936',
 'http://purl.uniprot.org/uniprot/O14965',
 'http://purl.uniprot.org/uniprot/O14976',
 'http://purl.uniprot.org/uniprot/O15075',
 'http://purl.uniprot.org/uniprot/O15111',
 'http://purl.uniprot.org/uniprot/O15146',
 'http://purl.uniprot.org/uniprot/O15197',
 'http://purl.uniprot.org/uniprot/O15264',
 'http://purl.uniprot.org/uniprot/O15530',
 'http://pu

### Map the Uniprot URIs to Nextprot URIs

In [177]:
def uniprot_to_nexprotid(uniprot_uri):
    uniprotid = uniprot_uri.split("http://purl.uniprot.org/uniprot/")[1]
    nxp = "http://nextprot.org/rdf/entry/NX_" + uniprotid
    return nxp

In [178]:
nxp_target_list = [uniprot_to_nexprotid(x) for x in target_list]

In [179]:
len(nxp_target_list)

437

### Find the tissues where each protein is highly expressed

In [180]:
def nextprot_highly_expressed(nextprot_uri):
    
    payload = {'entry': nextprot_uri, 'endpoint': 'https://sparql.nextprot.org/'}
    headers = {'Accept': 'application/sparql-results+json'}
    r = requests.get('http://grlc.io/api/openphacts/FederatedPhacts/nextprot-highlyExpressed', headers=headers, params=payload)
    results = r.json()
    tissues_expressed = []
    for r in results["results"]['bindings']:
        tissues_expressed.append((nextprot_uri, r['tissue']['value'], r['tisslab']['value']))
    print(nextprot_uri + " Result Count: " + str(len(tissues_expressed)))
    return tissues_expressed

In [181]:
display(nxp_target_list)

['http://nextprot.org/rdf/entry/NX_P15735',
 'http://nextprot.org/rdf/entry/NX_O75747',
 'http://nextprot.org/rdf/entry/NX_P06240',
 'http://nextprot.org/rdf/entry/NX_O00444',
 'http://nextprot.org/rdf/entry/NX_Q06418',
 'http://nextprot.org/rdf/entry/NX_P52333',
 'http://nextprot.org/rdf/entry/NX_P62344',
 'http://nextprot.org/rdf/entry/NX_P35916',
 'http://nextprot.org/rdf/entry/NX_P00533',
 'http://nextprot.org/rdf/entry/NX_O60563',
 'http://nextprot.org/rdf/entry/NX_P22607',
 'http://nextprot.org/rdf/entry/NX_P21802',
 'http://nextprot.org/rdf/entry/NX_P54760',
 'http://nextprot.org/rdf/entry/NX_P06213',
 'http://nextprot.org/rdf/entry/NX_Q13546',
 'http://nextprot.org/rdf/entry/NX_Q9NYY3',
 'http://nextprot.org/rdf/entry/NX_Q00526',
 'http://nextprot.org/rdf/entry/NX_P48730',
 'http://nextprot.org/rdf/entry/NX_P54753',
 'http://nextprot.org/rdf/entry/NX_Q16288',
 'http://nextprot.org/rdf/entry/NX_O15146',
 'http://nextprot.org/rdf/entry/NX_O14965',
 'http://nextprot.org/rdf/entry/

In [182]:
tissue_results = [nextprot_highly_expressed(x) for x in nxp_target_list]
 

http://nextprot.org/rdf/entry/NX_P15735 Result Count: 1
http://nextprot.org/rdf/entry/NX_O75747 Result Count: 0
http://nextprot.org/rdf/entry/NX_P06240 Result Count: 0
http://nextprot.org/rdf/entry/NX_O00444 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q06418 Result Count: 2
http://nextprot.org/rdf/entry/NX_P52333 Result Count: 0
http://nextprot.org/rdf/entry/NX_P62344 Result Count: 0
http://nextprot.org/rdf/entry/NX_P35916 Result Count: 1
http://nextprot.org/rdf/entry/NX_P00533 Result Count: 3
http://nextprot.org/rdf/entry/NX_O60563 Result Count: 4
http://nextprot.org/rdf/entry/NX_P22607 Result Count: 2
http://nextprot.org/rdf/entry/NX_P21802 Result Count: 8
http://nextprot.org/rdf/entry/NX_P54760 Result Count: 2
http://nextprot.org/rdf/entry/NX_P06213 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q13546 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q9NYY3 Result Count: 8
http://nextprot.org/rdf/entry/NX_Q00526 Result Count: 1
http://nextprot.org/rdf/entry/NX_P48730 Result C

http://nextprot.org/rdf/entry/NX_Q15746 Result Count: 26
http://nextprot.org/rdf/entry/NX_Q15208 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q96PY6 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q8WXR4 Result Count: 0
http://nextprot.org/rdf/entry/NX_P78362 Result Count: 3
http://nextprot.org/rdf/entry/NX_P45984 Result Count: 6
http://nextprot.org/rdf/entry/NX_O60331 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q13555 Result Count: 3
http://nextprot.org/rdf/entry/NX_Q9NY57 Result Count: 0
http://nextprot.org/rdf/entry/NX_O14976 Result Count: 2
http://nextprot.org/rdf/entry/NX_Q14004 Result Count: 2
http://nextprot.org/rdf/entry/NX_Q9NRM7 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q99755 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q07002 Result Count: 2
http://nextprot.org/rdf/entry/NX_Q9Y243 Result Count: 2
http://nextprot.org/rdf/entry/NX_Q14164 Result Count: 4
http://nextprot.org/rdf/entry/NX_Q86V86 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q9P0L2 Result 

http://nextprot.org/rdf/entry/NX_Q02779 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q9UM73 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q9H093 Result Count: 2
http://nextprot.org/rdf/entry/NX_Q9UK32 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q6DT37 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q9UL54 Result Count: 0
http://nextprot.org/rdf/entry/NX_P12931 Result Count: 1
http://nextprot.org/rdf/entry/NX_Q15835 Result Count: 0
http://nextprot.org/rdf/entry/NX_O60285 Result Count: 0
http://nextprot.org/rdf/entry/NX_P53671 Result Count: 5
http://nextprot.org/rdf/entry/NX_P49841 Result Count: 2
http://nextprot.org/rdf/entry/NX_Q92772 Result Count: 0
http://nextprot.org/rdf/entry/NX_Q9UF33 Result Count: 0
http://nextprot.org/rdf/entry/NX_P21860 Result Count: 14
http://nextprot.org/rdf/entry/NX_Q9JI10 Result Count: 0
http://nextprot.org/rdf/entry/NX_O75676 Result Count: 5
http://nextprot.org/rdf/entry/NX_Q9UIK4 Result Count: 0
http://nextprot.org/rdf/entry/NX_P53778 Result 

In [188]:
tissue_results[:5]

[[('http://nextprot.org/rdf/entry/NX_P15735',
   'http://nextprot.org/rdf/terminology/TS-1030',
   'Testis')],
 [],
 [],
 [],
 [('http://nextprot.org/rdf/entry/NX_Q06418',
   'http://nextprot.org/rdf/terminology/TS-0091',
   'Cerebral cortex'),
  ('http://nextprot.org/rdf/entry/NX_Q06418',
   'http://nextprot.org/rdf/terminology/TS-0730',
   'Ovary')]]

In [197]:
final_tissue_df = pd.DataFrame(list(itertools.chain.from_iterable(tissue_results)), columns=["proteinID", "tissue_id", "tissue_name"])

In [198]:
display(final_tissue_df)

Unnamed: 0,proteinID,tissue_id,tissue_name
0,http://nextprot.org/rdf/entry/NX_P15735,http://nextprot.org/rdf/terminology/TS-1030,Testis
1,http://nextprot.org/rdf/entry/NX_Q06418,http://nextprot.org/rdf/terminology/TS-0091,Cerebral cortex
2,http://nextprot.org/rdf/entry/NX_Q06418,http://nextprot.org/rdf/terminology/TS-0730,Ovary
3,http://nextprot.org/rdf/entry/NX_P35916,http://nextprot.org/rdf/terminology/TS-1262,Renal tubule
4,http://nextprot.org/rdf/entry/NX_P00533,http://nextprot.org/rdf/terminology/TS-1070,Trophoblast
5,http://nextprot.org/rdf/entry/NX_P00533,http://nextprot.org/rdf/terminology/TS-0799,Placenta
6,http://nextprot.org/rdf/entry/NX_P00533,http://nextprot.org/rdf/terminology/TS-0934,Skin
7,http://nextprot.org/rdf/entry/NX_O60563,http://nextprot.org/rdf/terminology/TS-1266,Endometrium stroma
8,http://nextprot.org/rdf/entry/NX_O60563,http://nextprot.org/rdf/terminology/TS-1070,Trophoblast
9,http://nextprot.org/rdf/entry/NX_O60563,http://nextprot.org/rdf/terminology/TS-1088,Urinary bladder urothelium
