## Let me compare MW, logP and PSA for known oxidoreductase inhibitors

* August 20, 2019
* Workshop - FederatedPhacts 
* Paul Groth, Egon Willighagen, Alasdair Gray, Nick Juty, Alan Gateau, Chris Evelo


* What we're doing:
   * extending on the results from Tissue Expression Profiling For Compound [Notebook](https://github.com/openphacts/FederatedPhacts/blob/master/notebooks/TissueExpressionProfilingForCompound.ipynb)
   * building on the federated [query](https://github.com/openphacts/FederatedPhacts/blob/master/oxidoreductase_inhibitors.rq) that extracts the data from Wikidata and the EBI RDF Platform
   * use the modular grlc approach to retrieve the same result
   
This query was one of the top 3 prioritised business questions in the [Open PHACTS project](http://www.openphacts.org).

In [1]:
from IPython.display import display
from SPARQLWrapper import SPARQLWrapper, JSON
import requests
import pandas as pd
from ipywidgets import interact, widgets
from IPython.display import display
import itertools
import time

### Get a set of wikidata molecules that have the role of being an oxidoreductase inhibitor

In [2]:
oxidoreductaseInhibitorMoleculeQuery = """
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd:  <http://www.wikidata.org/entity/>

SELECT DISTINCT ?molecule ?moleculeLabel WHERE {
    ?molecule wdt:P31/wdt:P279* wd:Q66587127 .
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} LIMIT 10
"""

In [3]:
sparql_wikidata = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql_wikidata.setQuery(oxidoreductaseInhibitorMoleculeQuery)
sparql_wikidata.setReturnFormat(JSON)
results = sparql_wikidata.query().convert()
display(results)

{'head': {'vars': ['molecule', 'moleculeLabel']},
 'results': {'bindings': [{'molecule': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q393336'},
    'moleculeLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'kaempferol'}},
   {'molecule': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q421162'},
    'moleculeLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'bifonazole'}},
   {'molecule': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q951449'},
    'moleculeLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'myricetin'}},
   {'molecule': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q4637100'},
    'moleculeLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': '4-androstene-3,6,17-trione'}},
   {'molecule': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q4817643'},
    'moleculeLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'atromentin

In [4]:
molecule_list = []
for r in results['results']['bindings']:
    molecule_list.append(r["molecule"]['value'])
molecule_list[:5]

['http://www.wikidata.org/entity/Q393336',
 'http://www.wikidata.org/entity/Q421162',
 'http://www.wikidata.org/entity/Q951449',
 'http://www.wikidata.org/entity/Q4637100',
 'http://www.wikidata.org/entity/Q4817643']

### Find the MW, logP and PSA for each of the oxidoreductase

In [5]:
wikidata_uri = "http://www.wikidata.org/entity/Q4817643"
payload = {'molecule': wikidata_uri}
headers = {'accept': 'application/json'}
r = requests.get('http://grlc.io/api/openphacts/FederatedPhacts/wikidata-molecule-information', headers=headers, params=payload)
results = r.json()
cols = results['head']['vars']
out = []
for r in results["results"]['bindings']:
    item = []
    for c in cols:
        item.append(r.get(c, {}).get('value'))
    out.append(item)
display(out)

[['atromentin',
  'C₁₈H₁₂O₆',
  'C1=CC(=CC=C1C2=C(C(=O)C(=C(C2=O)O)C3=CC=C(C=C3)O)O)O',
  'FKQQKMGWCJGUCS-UHFFFAOYSA-N',
  '1S/C18H12O6/c19-11-5-1-9(2-6-11)13-15(21)17(23)14(18(24)16(13)22)10-3-7-12(20)8-4-10/h1-8,19-21,24H',
  None,
  '324.063388']]

In [6]:
def wikidata_chemical_information(wikidata_uri):  
    payload = {'molecule': wikidata_uri}
    headers = {'accept': 'application/json'}
    r = requests.get('http://grlc.io/api/openphacts/FederatedPhacts/wikidata-molecule-information', headers=headers, params=payload)
    results = r.text
    print(results)
    # Need to pause 60s to not overload the wikidata service
    # https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits
    time.sleep(60)
#     cols = results['head']['vars']
#     out = []
#     for r in results["results"]['bindings']:
#         item = []
#         for c in cols:
#             item.append(r.get(c, {}).get('value'))
#         out.append(item)
#     return pd.DataFrame(out, columns=cols)

In [7]:
molecule_results = [wikidata_chemical_information(x) for x in molecule_list]

{
  "head" : {
    "vars" : [ "moleculeLabel", "formula", "smiles", "inchikey", "inchi", "logp", "molweight" ]
  },
  "results" : {
    "bindings" : [ {
      "formula" : {
        "type" : "literal",
        "value" : "C₁₅H₁₀O₆"
      },
      "moleculeLabel" : {
        "xml:lang" : "en",
        "type" : "literal",
        "value" : "kaempferol"
      },
      "smiles" : {
        "type" : "literal",
        "value" : "C1=CC(=CC=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O"
      },
      "inchikey" : {
        "type" : "literal",
        "value" : "IYRMWMYZSQPJKC-UHFFFAOYSA-N"
      },
      "inchi" : {
        "type" : "literal",
        "value" : "1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H"
      },
      "molweight" : {
        "datatype" : "http://www.w3.org/2001/XMLSchema#decimal",
        "type" : "literal",
        "value" : "286.048"
      }
    }, {
      "formula" : {
        "type" : "literal",
        "value" : "C₁₅H₁₀O₆"
      },
 

{
  "head" : {
    "vars" : [ "moleculeLabel", "formula", "smiles", "inchikey", "inchi", "logp", "molweight" ]
  },
  "results" : {
    "bindings" : [ {
      "formula" : {
        "type" : "literal",
        "value" : "C₁₅H₁₀O₇"
      },
      "moleculeLabel" : {
        "xml:lang" : "en",
        "type" : "literal",
        "value" : "quercetin"
      },
      "smiles" : {
        "type" : "literal",
        "value" : "C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O"
      },
      "inchikey" : {
        "type" : "literal",
        "value" : "REFJWTPEDVJJIY-UHFFFAOYSA-N"
      },
      "inchi" : {
        "type" : "literal",
        "value" : "1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H"
      },
      "molweight" : {
        "datatype" : "http://www.w3.org/2001/XMLSchema#decimal",
        "type" : "literal",
        "value" : "302.043"
      }
    } ]
  }
}
{
  "head" : {
    "vars" : [ "moleculeLabel", "formula", "smiles", "inchikey"

In [None]:
molecule_results

In [None]:
final_tissue_df = pd.DataFrame(list(itertools.chain.from_iterable(tissue_results)), columns=["proteinID", "tissue_id", "tissue_name"])

In [None]:
display(final_tissue_df)

### Wikidata search functions

In [None]:
def wikidata_search(search_str):
    wd_search_param = {"action":"wbsearchentities", "format": "json", "limit":"1","language":"en","search":search_str}
    wd_search_url = "https://www.wikidata.org/w/api.php"
    r = requests.get(wd_search_url, params = wd_search_param)
    return r.json()
    

In [None]:

search_results = widgets.Label(value="")

text = widgets.Text(
    value='last',
    placeholder='Search',
    description='String:',
    disabled=False,
    continuous_update=True
)


def callback(update):
    search_str = update["new"]
    displaystring = ""
    r = wikidata_search(search_str)
    if "search" in r:
        for i in r["search"]:
            displaystring = displaystring + i["id"] + " | " + i["match"]["text"]
            if "description" in i:
                displaystring = displaystring + " | " + i["description"]
    search_results.value = displaystring
    

def finalize(input):
    finalresult = search_results.value
    

text.observe(callback, 'value')
text.on_submit(finalize)


___Run the next two cells to see the search box and the search results. What you see in the search results will be input to the subsequent cells___

In [None]:
display(text)

In [None]:
display(search_results)

In [None]:
wd_query_id = search_results.value.split(" | ")[0]

In [None]:
wd_query_id