# Extract PTSD-related Dense Dynamic Data Cloud using Blackboard

## John C. Earls (ISB)
## Chunhua Weng, Chi Yuan(Columbia)
## Mike Yu (UCSD)
## Mark Williams, Rajarshi Guha (NCATS)
## I know this is not everybody, but it is a good start

**APIs used**:
* http://isbtranslatorapi.adversary.us/
* http://www.ndexbio.org/
* http://biothings.io/

**Preconditions**:
* The problem of understanding PTSD has been posed to the blackboard.
* The orange team has identified drugs commonly prescribed to PTSD patients using OHDSI 
* The blue team recognizes this beacon as a request to contextualize adenosine using Dense Dynamic Data Clouds

**Blackboard steps performed**:
* Drugs are mapped to their targest
* Query HPWP API for target specific subnetworks
* Write drug target specific subnetworks back to blackboard

**NOTE** : This is real data, these are real results.  Someone could reasonably follow up on this.

In [1]:
import urllib2
import json
import requests
import logging
import pandas
from collections import Counter
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s %(levelname)s %(message)s',)

#Some helper query functions to API
base_url = 'http://isbtranslatorapi.adversary.us'
def query_isb(endpoint, data={}, base_url=base_url):
    req = requests.post('%s/%s' % (base_url,endpoint), data=data)
    return req.json()

def get_analytes(kwargs):
    kw_local = kwargs.copy()
    frm = 0
    size = 1000
    meta = []
    kw_local['from'] = frm
    kw_local['size'] = size
    res = query_isb('/v1/analyte', data=kw_local)
    meta += res
    # Note: this is relying on the pagination, it would be smarter to just partition
    # the *sig_ids* set which would greatly speed up the query
    while len(res) > 0:
        kw_local['from'] += size
        logging.debug("Saving records from %i to %i" %(frm, frm+size))
        res = query_isb('/v1/analyte', data=kw_local)
        meta+=res
    return meta

def get_correlations(kwargs):
    kw_local = kwargs.copy()
    sigs = []
    frm = 0
    size = 10000
    meta = []
    kw_local['from'] = frm
    kw_local['size'] = size
    res = query_isb('v1/correlation', data=kw_local)
    correlations = res[:]
    while len(res) > 0:
        logging.debug("Saving records from %i to %i" %(frm, frm+size))
        kw_local['from'] += size
        frm = kw_local['from']
        res = query_isb('v1/correlation', data=kw_local)
        correlations += res
    return correlations

### The data service identifies a set of metabolites in the DDDC which contain adenosine

In [2]:
drugs = pandas.read_csv('patientdrugs.txt', sep='\t', header=None)
drugs.columns=['OHDSI_id', "drug_name", "count"]
drugs = drugs.drop(0)
drugs

Unnamed: 0,OHDSI_id,drug_name,count
1,2213440,"Influenza virus vaccine, trivalent, split viru...",2006
2,19025280,Oxygen 99 % Gas for Inhalation,1984
3,19019418,Omeprazole 20 MG Delayed Release Oral Capsule,1560
4,19077577,Gemfibrozil 600 MG Oral Tablet,1260
5,19019116,Lovastatin 20 MG Oral Tablet,1176
6,1539407,Simvastatin 40 MG Oral Tablet,1087
7,40080069,Sodium Chloride Injectable Solution,1022
8,957136,Furosemide 40 MG Oral Tablet,1007
9,1539411,Simvastatin 20 MG Oral Tablet,978
10,40164929,Metformin hydrochloride 500 MG Oral Tablet,970


In [3]:
def get_targets_from_drug_table(drugs):
    target_map = {}
    ctr = 1 
    for i,r in drugs.iterrows():
        if i > 0:
            query = 'http://c.biothings.io/v1/query?q=drugbank.name:%s' % r['drug_name']
            req = requests.get(query)
            res = req.json()
            if 'success' in res and not res['success']:
                print "Error"
                print query
                print res
            else:
                if res['total'] > 0:
                    target_map[i] = {'full_response':res}
                    for h in res['hits']:
                        if 'drugbank' in h:
                            if 'targets' in h['drugbank']:
                                if 'targets' not in target_map[i]:
                                    target_map[i]['targets'] = []
                                target_map[i]['targets'].append(h['drugbank']['targets'])
    ups = {}
    for k, v in target_map.items():
        ups[k] = []
        if 'targets' in v:
            for t in v['targets']:
                if type(t) is list:
                    for x in t:
                        if 'uniprot' in x:  
                            ups[k].append(x['uniprot'])
                else:
                    if 'uniprot' in t:
                        ups[k].append(t['uniprot'])
    return ups

In [4]:
from biothings_client import get_client
from biothings_explorer import IdListHandler
def p100_protein_to_uniprot_map():
    """Map the p100 proteins to uniprot ids
    """
    prot_vars = get_analytes({'category':"Proteomics"})
    no_up = []
    up_to_prot = {}
    #map the ones that actually have uniprot ids
    for v in prot_vars:
        if 'uniprot' in v:
            up = v['uniprot']
            if up not in up_to_prot:
                up_to_prot[up] = []
            up_to_prot[up].append(v)
        else:
            no_up.append(v)
    # ones without uniprot ids

    md = get_client('drug')
    ih = IdListHandler()
    missing = []
    still_missing = []
    for prot in no_up:
        req = requests.get('http://mygene.info/v3/query?q=symbol:%s' % (prot['abbreviation'],))
        res = req.json()
        if res['total'] > 0:
            egs = map(str,[x['entrezgene'] for x in res['hits'] if 'entrezgene' in x])
            uniprot_list = ih.list_handler(input_id_list=egs, input_type='entrez_gene_id', output_type='uniprot_id')
            if len(uniprot_list):
                for up in uniprot_list:
                    if up not in up_to_prot:
                        up_to_prot[up] = []
                    up_to_prot[up].append(prot)
            else:
                still_missing.append(prot)
        else:
            still_missing.append(prot)
    still_missing2 = []
    for prot in still_missing:
        req = requests.get('http://mygene.info/v3/query?q=symbol:%s' % (prot['abbreviation'].replace('_',''),))
        res = req.json()
        if res['total'] > 0:
            egs = map(str,[x['entrezgene'] for x in res['hits'] if 'entrezgene' in x])
            uniprot_list = ih.list_handler(input_id_list=egs, input_type='entrez_gene_id', output_type='uniprot_id')
            if len(uniprot_list):
                for up in uniprot_list:
                    if up not in up_to_prot:
                        up_to_prot[up] = []
                    up_to_prot[up].append(prot)
            else:
                still_missing2.append(prot)
        else:
            still_missing2.append(prot)
    print "%i unmapped proteins, %i mapped proteins" % (len(still_missing2), len(up_to_prot))
    return up_to_prot 

In [5]:
# lets find protein
def from_dt_drugs_to_p100_proteins(ups, up_to_prot):
    targets_to_drugs = {}
    for k, v in ups.items():
        for upd in v:
            if upd in up_to_prot:
                if k not in targets_to_drugs:
                    targets_to_drugs[k] = []
                targets_to_drugs[k].append(up_to_prot[upd])
    return targets_to_drugs

In [6]:
def describe_network(subnet, drug_name):
    print "%s targets %s" % (drug_name, ','.join(subnet['target']))
    print "%i edges in HPWP in %s seeded network." % (len(subnet['edges']), drug_name,)
    num_nodes = len(subnet['nodes'])
    print "%i total nodes in HPWP %s seeded subnetwork" % (num_nodes, drug_name)
    for cat, count in Counter([v['category'] for v in subnet['nodes'].values()]).items():
        print " - %i %s in HPWP %s seeded network" % (count, cat, drug_name)

def get_subnets(targets_to_drugs):
    neighbors = {}
    for k, v in targets_to_drugs.items():
        neighbors[k] = {}
        id_list = list(set([prot['_id'] for prot in v[0]]))
        neighbors[k]['target'] = id_list
        #return neighbors of target
        acorr = get_correlations({'ids1':','.join(id_list), 'bh_adjusted_pvalue':.1})
        adf = pandas.DataFrame(acorr)
        nodes = set(adf._id_1.tolist() + adf._id_2.tolist())
        my_nodes = {a['_id']: a for a in get_analytes({'ids':','.join(nodes)})}
        # get the connecting edges
        acorr = get_correlations({'ids1':','.join(my_nodes.keys()), 'ids2':','.join(my_nodes.keys())
                                  , 'bh_adjusted_pvalue':.1})
        adf = pandas.DataFrame(acorr)                 
        neighbors[k]['edges'] = adf
        neighbors[k]['nodes'] = my_nodes
    return neighbors

In [7]:
my_drugs = drugs[drugs['count']> 200]

### Grab the uniprot ids of proteins that are targets of these drugs

In [8]:
dt_idx_to_uniprot_target = get_targets_from_drug_table(my_drugs)

Error
http://c.biothings.io/v1/query?q=drugbank.name:Acetaminophen 650 MG / propoxyphene napsylate 100 MG Oral Tablet
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'token_mgr_error token_mgr_error: Lexical error at line 1, column 79.  Encountered: <EOF> after : "/ propoxyphene napsylate 100 MG Oral Tablet"\']'}
Error
http://c.biothings.io/v1/query?q=drugbank.name:Acetaminophen 500 MG / Hydrocodone Bitartrate 5 MG Oral Tablet
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'token_mgr_error token_mgr_error: Lexical error at line 1, column 77.  Encountered: <EOF> after : "/ Hydrocodone Bitartrate 5 MG Oral Tablet"\']'}
Error
http://c.biothings.io/v1/query?q=drugbank.name:Acetaminophen 300 MG / Codeine Phosphate 30 MG Oral Tablet
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'token_mgr_error token_mgr_error: Lexical error at line 1, column 73.  Encounte

Error
http://c.biothings.io/v1/query?q=drugbank.name:lansoprazole 15 MG Delayed Release Oral Capsule [Prevacid]
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'parse_exception parse_exception: Encountered " "]" "] "" at line 1, column 71.\\nWas expecting one of:\\n    "TO" ...\\n    <RANGE_QUOTED> ...\\n    <RANGE_GOOP> ...\\n    \']'}
Error
http://c.biothings.io/v1/query?q=drugbank.name:Acetaminophen 500 MG / Hydrocodone Bitartrate 10 MG Oral Tablet
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'token_mgr_error token_mgr_error: Lexical error at line 1, column 78.  Encountered: <EOF> after : "/ Hydrocodone Bitartrate 10 MG Oral Tablet"\']'}
Error
http://c.biothings.io/v1/query?q=drugbank.name:Ipratropium Bromide 0.2 MG/ML Inhalant Solution
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'token_mgr_error token_mgr_error: Lexical error at line 1, colum

Error
http://c.biothings.io/v1/query?q=drugbank.name:irbesartan 150 MG Oral Tablet [Avapro]
{u'success': False, u'error': u'Could not execute query due to the following exception(s): [\'parse_exception parse_exception: Encountered " "]" "] "" at line 1, column 51.\\nWas expecting one of:\\n    "TO" ...\\n    <RANGE_QUOTED> ...\\n    <RANGE_GOOP> ...\\n    \']'}


### Get a dictionary with keys uniprot -> values p100 nodes

In [9]:
import pickle
import os
if os.path.exists('uniprot_to_p100_protein_nodes.pkl'):
    uniprot_to_p100_protein_nodes = pickle.load(open('uniprot_to_p100_protein_nodes.pkl', 'r'))
else:
    uniprot_to_p100_protein_nodes = p100_protein_to_uniprot_map()
    pickle.dump(uniprot_to_p100_protein_nodes, open('uniprot_to_p100_protein_nodes.pkl','w'))

### Get a dict with keys dt idx -> p100 targets

In [10]:
dt_drugs_to_p100_proteins = from_dt_drugs_to_p100_proteins(dt_idx_to_uniprot_target, uniprot_to_p100_protein_nodes)

### Get subnets related to targets

In [12]:
subnets = get_subnets(dt_drugs_to_p100_proteins)

In [49]:
## Summary of results

In [13]:
for k, sub in subnets.items():
    print "Index:", k
    drug_name =  my_drugs.loc[k, 'drug_name']
    describe_network(sub, drug_name)
    print "="*30

Index: 7
Sodium Chloride Injectable Solution targets PROTE.None.liver.SERPINC1.TSDQIHFFFAK,PROTE.None.liver.SERPINC1.DDLYVSDAFHK
4481 edges in HPWP in Sodium Chloride Injectable Solution seeded network.
99 total nodes in HPWP Sodium Chloride Injectable Solution seeded subnetwork
 - 93 Proteomics in HPWP Sodium Chloride Injectable Solution seeded network
 - 6 Clinical Labs in HPWP Sodium Chloride Injectable Solution seeded network
Index: 14
Dipyridamole 25 MG Oral Tablet targets PROTE.None.Inflammation.ADA.None
5430 edges in HPWP in Dipyridamole 25 MG Oral Tablet seeded network.
86 total nodes in HPWP Dipyridamole 25 MG Oral Tablet seeded subnetwork
 - 86 Proteomics in HPWP Dipyridamole 25 MG Oral Tablet seeded network
Index: 71
Heparin targets PROTE.None.liver.SERPINC1.TSDQIHFFFAK,PROTE.None.liver.SERPINC1.DDLYVSDAFHK
4481 edges in HPWP in Heparin seeded network.
99 total nodes in HPWP Heparin seeded subnetwork
 - 93 Proteomics in HPWP Heparin seeded network
 - 6 Clinical Labs in HPWP 

In [23]:
# carvedilol
[x for x in subnets[280]['nodes'].values() if x['category'] == 'Clinical Labs']

[{u'_id': u'CHEMS.None.Genova.indoleacetic_acid',
  u'category': u'Clinical Labs',
  u'hmdb': u'HMDB00197',
  u'name': u'indoleacetic_acid',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.homa_ir',
  u'category': u'Clinical Labs',
  u'name': u'homa_ir',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.c_peptide',
  u'category': u'Clinical Labs',
  u'name': u'c_peptide',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.hba1c',
  u'category': u'Clinical Labs',
  u'name': u'hba1c',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.interleukin_il6',
  u'category': u'Clinical Labs',
  u'name': u'interleukin_il6',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.insulin',
  u'category': u'Clinical Labs',
  u'name': u'insulin',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.homogentisic_acid',
  u'category': u'Clinical Labs',
  u'hmdb': u'HMDB00130',
  u'name': u'homogentisic_acid',
  u'vendor': u'Genova'}]

In [25]:
# heparin
[x for x in subnets[71]['nodes'].values() if x['category'] == 'Clinical Labs']

[{u'_id': u'CHEMS.None.Genova.tyrosine_plasma',
  u'category': u'Clinical Labs',
  u'hmdb': u'HMDB00158',
  u'name': u'tyrosine',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.hs_crp',
  u'category': u'Clinical Labs',
  u'name': u'hs_crp',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.hs_crp_relative_risk',
  u'category': u'Clinical Labs',
  u'name': u'hs_crp_relative_risk',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.leptin',
  u'category': u'Clinical Labs',
  u'name': u'leptin',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.insulin',
  u'category': u'Clinical Labs',
  u'name': u'insulin',
  u'vendor': u'Genova'},
 {u'_id': u'CHEMS.None.Genova.glutamic_acid_plasma',
  u'category': u'Clinical Labs',
  u'hmdb': u'HMDB00148',
  u'name': u'glutamic_acid',
  u'vendor': u'Genova'}]

In [48]:
print "Out of ", len(my_drugs), " drugs examined  ", len(subnets), " had direct targets in the p100 proteins out of ", len(set(sum(dt_idx_to_uniprot_target.values(), []))), " possible identified targets"
print "%.1f percent" % ((len(subnets)/float(len(my_drugs))) *100)
print "Note there are many repeated drugs at different dosages."

Out of  343  drugs examined   10  had direct targets in the p100 proteins out of  340  possible identified targets
2.9 percent


In [43]:
print "We are measuring %.1f percent of proteome" % ((len(uniprot_to_p100_protein_nodes.keys())/20000.0) * 100)

We are measuring 1.5 percent of proteome
