In [2]:
# get api
# !wget https://raw.githubusercontent.com/bmeg/arachne/v0.2/aql.py

In [3]:
# graph queries
import aql 
# g2p drug normalizer
import drug_normalizer


In [4]:
# open a connection to the graph server and select the graph to query
ARACHNE_URL = "http://arachne.compbio.ohsu.edu"
GRAPH_NAME = "bmeg"

conn = aql.Connection(ARACHNE_URL)
O = conn.graph(GRAPH_NAME)

In [5]:
def _distinct_edge_targets(vertex_label, edge_label):
    """Return the distinct targets of `edge_label` edges leaving `vertex_label` vertices.

    Runs one traversal against the graph `O`: select vertices whose label
    equals `vertex_label`, follow their outgoing `edge_label` edges, keep
    one row per distinct `$.to`, and render only `$.to`.

    Returns:
        list of rendered rows, in the order the server returned them.

    Raises:
        AssertionError: when the query yields no rows, which means the
            label pair does not exist in the graph being queried.
    """
    rows = list(
        O.query()
        .V()
        .where(aql.eq("$.label", vertex_label))
        .outEdge(edge_label)
        .distinct("$.to")
        .render(["$.to"])
    )
    assert len(rows) > 0, "no '{}' edges found from '{}' vertices".format(edge_label, vertex_label)
    return rows


# Individual drugs
tcga_drug_ids = _distinct_edge_targets("Individual", "drugTherapyFrom")

# ResponseCurve drugs
response_curve_drug_ids = _distinct_edge_targets("ResponseCurve", "responseTo")

# G2PAssociation drugs
g2p_drug_ids = _distinct_edge_targets("G2PAssociation", "environmentFor")

# all drug edges from the three sources, in source order (not de-duplicated)
drug_ids = tcga_drug_ids + response_curve_drug_ids + g2p_drug_ids


In [6]:
#
# Normalize every drug edge using the last ':'-separated component of each
# target gid, and attach the combined result to the edge as d['compounds'].
#

for d in drug_ids:
    # Accumulate across all gids of the edge and assign once. Assigning inside
    # the inner loop kept only the last gid's result and never set the key when
    # d['data'] was empty, so the cells below raised KeyError on that edge.
    compounds = []
    for c in d['data']:
        term = c.split(':')[-1]
        # assumes normalize() returns a list of compound dicts (the cells
        # below call len() on it and index each item) -- TODO confirm
        compounds.extend(drug_normalizer.normalize(term))
    d['compounds'] = compounds
        







In [18]:
# print 'edges with multiple drugs'
# [d for d in drug_ids if len(d['compounds']) > 1]

In [9]:
# Print a two-column TSV: the drug name taken from the gid, then the
# comma-separated ontology terms it normalized to.
# Single-argument print(...) produces the same output on Python 2 and runs
# unchanged on Python 3, where the print statement is a SyntaxError.

# names starting with one of these are skipped in the loop below
ID_PREFIXES = ('CID', 'SID', 'CHEMBL')

print('# incorporate into tcga TSV')
for d in drug_ids:
    for c in d['data']:
        name = c.split(':')[-1]
        # str.startswith accepts a tuple and is true if any prefix matches
        if name.startswith(ID_PREFIXES):
            continue
        # normalization found nothing for this edge, so there is no row to emit
        if len(d['compounds']) == 0:
            continue
        if len(d['compounds']) > 1:
            print("# WARNING: multiple drugs -  >{}<".format(name))
        print('{}\t{}'.format(name, ','.join([compound['ontology_term'] for compound in d['compounds']])))
    

# incorporate into tcga TSV
gemzar	CID60749
temsirolimus	CID6918289
torisel	CID6918289
oxaliplatin	CHEMBL414804
leucovorin calcium	CID6006,CHEMBL2146121
pegfilgrastim	CHEMBL1201568
cpt-11	SID597746
eloxatin	SID50018627
vinorelbine tartrate	CID5311497,CID3806114
mithramycin	CID163659
vepesid	CID36462
paraplatin	CID426756
ciclosporin	CID5284373
fareston	CID3005572
taxane	CID9548828
cpt 11	SID597746
vinorelbin	CID44424639
Nutlin-3	CID216345
LBW242	CID11503417
ML239	CID49843203
BRD-K66532283	SID50230990
MST-312	CID13450
NSC30930	CID81146
BRD-K27188169	SID50230990
ML210	CID49766530
BRD-K35604418	SID50230990
nakiterpiosin	CID11966550
BRD-K37390332	SID50230990
BRD-K78574327	SID50230990
BRD-K34222889	SID50230990
cucurbitacin I	SID5011
BRD-K41597374	SID50230990
BRD-K07442505	SID50230990
KHS101	CID71304818
BRD-K64610608	SID50230990
BRD-K71935468	SID50230990
BRD-K51490254	SID50230990
ML006	CID2842253
SRT-1720	CID447315
BRD-K97651142	SID50230990
PRIMA-1-Met	CID6137
erastin	CID11214940
BRD-K4133411

In [10]:
# List every (edge, compound) pair whose ontology term lacks the 'CID' prefix.
# Each line is built as one formatted string so the output is the same on
# Python 2 and Python 3 (a multi-argument print(...) would print a tuple on 2).
print('edges with non pubchem compounds?')
for d in drug_ids:
    for c in d['compounds']:
        if not c['ontology_term'].startswith('CID'):
            print('{} {}'.format(d['data'], c['ontology_term']))

edges with non pubchem compounds?
[u'compound:UNKNOWN:oxaliplatin'] CHEMBL414804
[u'compound:UNKNOWN:leucovorin calcium'] CHEMBL2146121
[u'compound:UNKNOWN:pegfilgrastim'] CHEMBL1201568
[u'compound:UNKNOWN:cpt-11'] SID597746
[u'compound:UNKNOWN:eloxatin'] SID50018627
[u'compound:UNKNOWN:cpt 11'] SID597746
[u'compound:UNKNOWN:BRD-K66532283'] SID50230990
[u'compound:UNKNOWN:BRD-K27188169'] SID50230990
[u'compound:UNKNOWN:BRD-K35604418'] SID50230990
[u'compound:UNKNOWN:BRD-K37390332'] SID50230990
[u'compound:UNKNOWN:BRD-K78574327'] SID50230990
[u'compound:UNKNOWN:BRD-K34222889'] SID50230990
[u'compound:UNKNOWN:cucurbitacin I'] SID5011
[u'compound:UNKNOWN:BRD-K41597374'] SID50230990
[u'compound:UNKNOWN:BRD-K07442505'] SID50230990
[u'compound:UNKNOWN:BRD-K64610608'] SID50230990
[u'compound:UNKNOWN:CID-5951923'] SID854372
[u'compound:UNKNOWN:BRD-K71935468'] SID50230990
[u'compound:UNKNOWN:BRD-K51490254'] SID50230990
[u'compound:UNKNOWN:BRD-K97651142'] SID50230990
[u'compound:UNKNOWN:BRD-K413

In [11]:
# Report each compound that lacks one of the fields below, one line per
# missing field. Lines are built as a single formatted string so the cell
# prints the same text on Python 2 and Python 3.
REQUIRED_FIELDS = ('synonym', 'taxonomy', 'toxicity')

print('edges with "incomplete" compounds?')
for d in drug_ids:
    for c in d['compounds']:
        for p in REQUIRED_FIELDS:
            if p not in c:
                print('{} {} missing {}'.format(d['data'], c['ontology_term'], p))

edges with "incomplete" compounds?
[u'compound:CID38904'] CID38904 missing taxonomy
[u'compound:CID38904'] CID38904 missing toxicity
[u'compound:UNKNOWN:gemzar'] CID60749 missing taxonomy
[u'compound:UNKNOWN:gemzar'] CID60749 missing toxicity
[u'compound:UNKNOWN:torisel'] CID6918289 missing taxonomy
[u'compound:UNKNOWN:torisel'] CID6918289 missing toxicity
[u'compound:CID143'] CID143 missing taxonomy
[u'compound:CID143'] CID143 missing toxicity
[u'compound:UNKNOWN:oxaliplatin'] CHEMBL414804 missing taxonomy
[u'compound:UNKNOWN:oxaliplatin'] CHEMBL414804 missing toxicity
[u'compound:UNKNOWN:leucovorin calcium'] CHEMBL2146121 missing taxonomy
[u'compound:UNKNOWN:leucovorin calcium'] CHEMBL2146121 missing toxicity
[u'compound:UNKNOWN:pegfilgrastim'] CHEMBL1201568 missing taxonomy
[u'compound:UNKNOWN:pegfilgrastim'] CHEMBL1201568 missing toxicity
[u'compound:UNKNOWN:cpt-11'] SID597746 missing taxonomy
[u'compound:UNKNOWN:cpt-11'] SID597746 missing toxicity
[u'compound:UNKNOWN:eloxatin'] SI

[u'compound:CHEMBL1201836'] CHEMBL1201836 missing taxonomy
[u'compound:CHEMBL1201836'] CHEMBL1201836 missing toxicity
[u'compound:CID91864709'] CID91864709 missing taxonomy
[u'compound:CID91864709'] CID91864709 missing toxicity
[u'compound:CID176167'] CID176167 missing toxicity
[u'compound:CHEMBL1743001'] CHEMBL1743001 missing taxonomy
[u'compound:CHEMBL1743001'] CHEMBL1743001 missing toxicity
[u'compound:CHEMBL1201587'] CHEMBL1201587 missing taxonomy
[u'compound:CHEMBL1201587'] CHEMBL1201587 missing toxicity
[u'compound:CHEMBL2108950'] CHEMBL2108950 missing taxonomy
[u'compound:CHEMBL2108950'] CHEMBL2108950 missing toxicity
[u'compound:CID657237'] CID657237 missing toxicity
[u'compound:CID5212'] CID5212 missing toxicity
[u'compound:CID9961878'] CID9961878 missing taxonomy
[u'compound:CID9961878'] CID9961878 missing toxicity
[u'compound:CHEMBL2108531'] CHEMBL2108531 missing taxonomy
[u'compound:CHEMBL2108531'] CHEMBL2108531 missing toxicity
[u'compound:CID60961'] CID60961 missing toxic

[u'compound:CID11570805'] CID11570805 missing toxicity
[u'compound:CHEMBL3039543'] CHEMBL3039543 missing taxonomy
[u'compound:CHEMBL3039543'] CHEMBL3039543 missing toxicity
[u'compound:CID9904'] CID9904 missing toxicity
[u'compound:CID20055360'] CID20055360 missing taxonomy
[u'compound:CID20055360'] CID20055360 missing toxicity
[u'compound:CID65997'] CID65997 missing toxicity
[u'compound:CID9800306'] CID9800306 missing taxonomy
[u'compound:CID9800306'] CID9800306 missing toxicity
[u'compound:CHEMBL1743059'] CHEMBL1743059 missing taxonomy
[u'compound:CHEMBL1743059'] CHEMBL1743059 missing toxicity
[u'compound:CID33042'] CID33042 missing toxicity
[u'compound:CID72551585'] CID72551585 missing taxonomy
[u'compound:CID72551585'] CID72551585 missing toxicity
[u'compound:CID16186062'] CID16186062 missing taxonomy
[u'compound:CID16186062'] CID16186062 missing toxicity
[u'compound:CID16063245'] CID16063245 missing taxonomy
[u'compound:CID16063245'] CID16063245 missing toxicity
[u'compound:CID503

[u'compound:CID8567'] CID8567 missing taxonomy
[u'compound:CID8567'] CID8567 missing toxicity
[u'compound:CID443971'] CID443971 missing taxonomy
[u'compound:CID443971'] CID443971 missing toxicity
[u'compound:CHEMBL1200346'] CHEMBL1200346 missing taxonomy
[u'compound:CHEMBL1200346'] CHEMBL1200346 missing toxicity
[u'compound:CHEMBL1201464'] CHEMBL1201464 missing taxonomy
[u'compound:CHEMBL1201464'] CHEMBL1201464 missing toxicity


In [12]:
def _source_label(raw_source):
    """Collapse a compound's raw 'source' value into a fixed label.

    'pubchem' takes precedence over 'chembl' when both are contained in
    `raw_source`; anything else is labelled 'UNKNOWN'.
    """
    if 'pubchem' in raw_source:
        return 'PUBCHEM'
    if 'chembl' in raw_source:
        return 'CHEMBL'
    return 'UNKNOWN'


# one flat record per normalized compound, across every drug edge
a = [
    {
        'source': _source_label(compound['source']),
        'id': compound['ontology_term'],
        'name': compound['synonym'],
    }
    for edge in drug_ids
    for compound in edge['compounds']
]

In [13]:
# distinct source labels present in the collected records
{record['source'] for record in a}

{'CHEMBL', 'PUBCHEM', 'UNKNOWN'}

In [14]:
# number of compound records collected in `a`
len(a)

2437

In [15]:
# spot-check one record: show the second entry of `a`
a[1]

{'id': 'CID31703', 'name': 'Doxorubicin', 'source': 'PUBCHEM'}

In [16]:
import json

# write the records as newline-delimited JSON: one object per line
with open('Compounds.json', 'w') as outfile:
    outfile.writelines(json.dumps(record) + '\n' for record in a)