# Intro

In [1]:
import os
import os.path as op
import pickle
import time

import numpy as np
import scipy as sp
import pandas as pd
import sqlalchemy as sa

from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

import qgrid
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)

from common import dat



In [2]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Parse data

This part can be skipped if you already ran it.

## drug_info_release

In [4]:
# Load the drug annotation table shipped with the challenge data.
drug_info_release_path = (
    'challenge_data/drug_synergy_data/drug_info_release.csv/'
    'Drug_info_release.csv'
)
drug_info_release = pd.read_csv(drug_info_release_path, sep=',')

# Remember the original column order so wider tables derived from this one
# can list these columns first.
drug_info_release_columns = drug_info_release.columns.tolist()

In [5]:
display(drug_info_release.head())
print(drug_info_release.shape)

Unnamed: 0,ChallengeName,Target(Official Symbol),HBA,cLogP,HBD,Lipinski,SMILES or PubChem ID,MW
0,ADAM17,ADAM17,,,,,,
1,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9
2,AKT_1,AKT*,6.0,3.24,3.0,0.0,c1ccc(cc1)c2cc3c(ccn4c3n[nH]c4=O)nc2c5ccc(cc5)...,407.5
3,AKT_PIK3C,"AKT*,PIK3C*",,,,,,
4,AKT_SGK,"AKT*,SGK*",,,,,,


(119, 8)


In [7]:
# Save results to the database
# Disabled by default (`if False`): flip the guard manually to (re)write the table.
if False:
    engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
    # `if_exists='replace'` drops and recreates the table, so the index below
    # has to be created again after every write.
    drug_info_release.to_sql('drug_info_release', engine, index=False, if_exists='replace')
    # Unique index on the first 255 characters of ChallengeName.
    engine.execute('create unique index challenge_name_idx on drug_info_release (ChallengeName(255))')

## drug_info_release_2

In [8]:
# Read chemical features from Pubchem
# The feature table is cached as a TSV file: it is only recomputed with
# `get_drug_features` when the cache file does not exist yet.
output_filename = (
    'challenge_data/drug_synergy_data/drug_info_release.csv/drug_info_release_2.tsv'
)
if not op.isfile(output_filename):
    # Import just the helper that is needed. A conditional `from functions import *`
    # would make the set of notebook globals depend on whether the cache exists.
    from functions import get_drug_features
    drug_info_release_2 = get_drug_features(drug_info_release)
    drug_info_release_2.to_csv(output_filename, sep='\t', index=False)
else:
    drug_info_release_2 = pd.read_csv(output_filename, sep='\t')

# Column order for display/export: the original columns first, followed by
# every column that only exists in the extended table.
drug_info_release_2_columns = (
    drug_info_release_columns +
    [c for c in drug_info_release_2.columns if c not in drug_info_release_columns]
)

In [9]:
drug_info_release_2[drug_info_release_2_columns].head()

Unnamed: 0,ChallengeName,Target(Official Symbol),HBA,cLogP,HBD,Lipinski,SMILES or PubChem ID,MW,AtomStereoCount,BondStereoCount,CID,CanonicalSMILES,Charge,Complexity,ConformerCount3D,ConformerModelRMSD3D,CovalentUnitCount,DefinedAtomStereoCount,DefinedBondStereoCount,EffectiveRotorCount3D,ExactMass,FeatureAcceptorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureCount3D,FeatureDonorCount3D,FeatureHydrophobeCount3D,FeatureRingCount3D,Fingerprint2D,HBondAcceptorCount,HBondDonorCount,HeavyAtomCount,IUPACName,InChI,InChIKey,IsomericSMILES,IsotopeAtomCount,MolecularFormula,MolecularWeight,MonoisotopicMass,RotatableBondCount,TPSA,UndefinedAtomStereoCount,UndefinedBondStereoCount,Volume3D,XLogP,XStericQuadrupole3D,YStericQuadrupole3D,ZStericQuadrupole3D
0,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9,1.0,0.0,25227436.0,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,0.0,580.0,256.0,1.0,1.0,1.0,0.0,8.2,428.172752,2.0,0.0,4.0,14.0,4.0,0.0,4.0,AAADceB7sAAEAAAAAAAAAAAAAAAAAWAAAAA8WIAAAAAAAF...,6.0,4.0,30.0,4-amino-N-[(1S)-1-(4-chlorophenyl)-3-hydroxypr...,InChI=1S/C21H25ClN6O2/c22-15-3-1-14(2-4-15)17(...,JDUBGYFRJFOXQC-KRWDZBQOSA-N,C1CN(CCC1(C(=O)N[C@@H](CCO)C2=CC=C(C=C2)Cl)N)C...,0.0,C21H25ClN6O2,428.9152,428.172752,6.0,120.0,0.0,0.0,319.3,1.7,14.27,5.17,1.08
2,AKT_1,AKT*,6.0,3.24,3.0,0.0,c1ccc(cc1)c2cc3c(ccn4c3n[nH]c4=O)nc2c5ccc(cc5)...,407.5,0.0,0.0,24964624.0,C1CC(C1)(C2=CC=C(C=C2)C3=C(C=C4C(=N3)C=CN5C4=N...,0.0,760.0,4.0,0.8,1.0,0.0,0.0,3.8,407.17461,2.0,0.0,1.0,11.0,2.0,0.0,6.0,AAADceB7oAAAAAAAAAAAAAAAAABgAQAAAAA8YIEAAAAAAE...,4.0,2.0,31.0,8-[4-(1-aminocyclobutyl)phenyl]-9-phenyl-2H-[1...,InChI=1S/C25H21N5O/c26-25(12-4-13-25)18-9-7-17...,ULDXWLCXEDXJGE-UHFFFAOYSA-N,C1CC(C1)(C2=CC=C(C=C2)C3=C(C=C4C(=N3)C=CN5C4=N...,0.0,C25H21N5O,407.46714,407.17461,3.0,83.6,0.0,0.0,317.2,3.0,16.28,4.11,1.07
3,AKT_PIK3C,"AKT*,PIK3C*",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AKT_SGK,"AKT*,SGK*",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Save results to the database
# NOTE(review): this cell has no `if False:` guard, so running it always
# replaces the `drug_info_release_2` table.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
# Only the columns listed in `drug_info_release_2_columns` are written, in that order.
drug_info_release_2[drug_info_release_2_columns].to_sql('drug_info_release_2', engine, if_exists='replace', index=False)
# Unique index on the first 255 characters of ChallengeName.
engine.execute('create unique index challenge_name_idx on drug_info_release_2 (ChallengeName(255))')

## drug_info_release_3

Add names, targets, ATC codes...

In [None]:
# Get data from Pubchem
# Explicit imports (instead of `from functions import *`) show where each
# helper comes from and keep unrelated names out of the notebook namespace.
from functions import get_atcs, get_cids, get_sid_data, get_sids

# One (ChallengeName, CID) row for every drug that has a CID.
cids = drug_info_release_2[['ChallengeName', 'CID']].dropna().drop_duplicates()
cids['CID'] = cids['CID'].astype(int)

# Pair every drug with each CID that `get_cids` returns for the drug's own CID.
challenge_name_to_cid = []
for challenge_name, cid in cids.values:
    for partner_cid in get_cids(cid):
        challenge_name_to_cid.append((challenge_name, partner_cid))

# The set comprehensions below replace `set(list(zip(*pairs))[i])`, which
# raises IndexError when `pairs` is empty.

# (cid, sid) pairs for all collected CIDs
# (column order as used when this list is turned into a DataFrame).
cid_to_sid = get_sids({pair[1] for pair in challenge_name_to_cid})

# Cross-references for every SID found above.
sid_xrefs = get_sid_data({pair[1] for pair in cid_to_sid})

# Names, synonyms, ATC codes, partners and functions for every CID that has a SID.
(cid_to_names, cid_to_mesh_synonyms, cid_to_depositor_supplied_synonyms,
 cid_to_atc, cid_to_partner, cid_to_function) = get_atcs({pair[0] for pair in cid_to_sid})

In [11]:
# Save results to the database
# Create the engine here so the cell does not depend on `engine` having been
# defined by a different cell.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')


def _save_table(rows, columns, table_name, index_statements):
    """Write `rows` to the MySQL table `table_name` and create its indexes.

    Args:
        rows: Row data accepted by ``pd.DataFrame`` (e.g. a list of tuples).
        columns: Column names of the resulting table.
        table_name: Table to write; an existing table is replaced.
        index_statements: ``create index`` statements executed after the write.
            They must be re-run every time because replacing the table drops
            its indexes.

    Returns:
        The de-duplicated DataFrame that was written.
    """
    table_df = pd.DataFrame(rows, columns=columns).drop_duplicates()
    table_df.to_sql(table_name, engine, if_exists='replace', index=False)
    for statement in index_statements:
        engine.execute(statement)
    return table_df


challenge_name_to_cid_df = _save_table(
    challenge_name_to_cid, ['ChallengeName', 'cid'], 'drug_to_cid', [
        'create unique index a on drug_to_cid (ChallengeName(255), cid)',
        'create unique index b on drug_to_cid (cid, ChallengeName(255))',
    ])

cid_to_sid_df = _save_table(
    cid_to_sid, ['cid', 'sid'], 'cid_to_sid', [
        'create unique index cid_to_sid_idx on cid_to_sid (cid, sid)',
        'create unique index sid_to_cid_idx on cid_to_sid (sid, cid)',
    ])

sid_xrefs_df = _save_table(
    sid_xrefs, ['sid', 'xref_type', 'xref_id'], 'sid_xrefs', [
        'create unique index sid_xref_id_idx on sid_xrefs (sid, xref_id(255))',
        'create unique index xref_id_sid_idx on sid_xrefs (xref_id(255), sid)',
        'create unique index xref_type_xref_id_sid_idx on sid_xrefs (xref_type(255), xref_id(255), sid)',
    ])

cid_to_name_df = _save_table(
    cid_to_names, ['cid', 'name'], 'cid_to_name', [
        'create unique index cid_to_name_idx on cid_to_name (cid, name(255))',
        'create unique index sid_to_name_idx on cid_to_name (name(255), cid)',
    ])

# Stored under a `_df` name like its siblings, so the raw `cid_to_mesh_synonyms`
# rows are not overwritten by the DataFrame.
# The `synonyms` index needs a prefix length like every other text index in
# this cell; MySQL rejects an index on a TEXT column without one.
cid_to_mesh_synonyms_df = _save_table(
    cid_to_mesh_synonyms, ['cid', 'synonyms'], 'cid_to_mesh_synonyms', [
        'create index cid_idx on cid_to_mesh_synonyms (cid)',
        'create index synonyms_idx on cid_to_mesh_synonyms (synonyms(255))',
    ])

cid_to_depositor_supplied_synonyms_df = _save_table(
    cid_to_depositor_supplied_synonyms, ['cid', 'synonyms'],
    'cid_to_depositor_supplied_synonyms', [
        'create index cid_idx on cid_to_depositor_supplied_synonyms (cid)',
        'create index synonyms_idx on cid_to_depositor_supplied_synonyms (synonyms(255))',
    ])

cid_to_atc_df = _save_table(
    cid_to_atc, ['cid', 'atc'], 'cid_to_atc', [
        'create unique index cid_to_atc_idx on cid_to_atc (cid, atc(255))',
        'create unique index atc_to_cid_idx on cid_to_atc (atc(255), cid)',
    ])

cid_to_partner_df = _save_table(
    cid_to_partner, ['cid', 'partner_type', 'partner_name', 'partner_bioentity'],
    'cid_to_partner', [
        'create unique index cid_partner_type_partner_name_idx on cid_to_partner '
        '(cid, partner_type(255), partner_name(255))',
        'create unique index partner_name_to_cid_idx on cid_to_partner '
        '(partner_type(255), partner_name(255), cid)',
        'create index cid_to_partner_bioentity_idx on cid_to_partner (cid, partner_bioentity(255))',
        'create index partner_bioentity_to_cid_idx on cid_to_partner (partner_bioentity(255), cid)',
    ])

cid_to_function_df = _save_table(
    cid_to_function, ['cid', 'function'], 'cid_to_function', [
        'create unique index cid_function_idx on cid_to_function (cid, function(255))',
        'create unique index function_cid_idx on cid_to_function (function(255), cid)',
    ])

NameError: name 'challenge_name_to_cid' is not defined

In [13]:
# Load reformatted data from database
# One row per drug from `drug_info_release_2`, extended with comma-joined
# (GROUP_CONCAT) CIDs, targets, enzymes, transporters and ATC codes.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
sql_query = """
SELECT 
    main.*,
    GROUP_CONCAT(DISTINCT x.cid) cids,
    GROUP_CONCAT(DISTINCT target.partner_name) targets,
    GROUP_CONCAT(DISTINCT target.partner_bioentity) targets_bioentities,
    GROUP_CONCAT(DISTINCT enzyme.partner_name) enzymes,
    GROUP_CONCAT(DISTINCT enzyme.partner_bioentity) enzymes_bioentities,
    GROUP_CONCAT(DISTINCT transporter.partner_name) transporters,
    GROUP_CONCAT(DISTINCT transporter.partner_bioentity) transporters_bioentities,
    GROUP_CONCAT(DISTINCT atc) atcs
FROM
drug_info_release_2 main
    LEFT JOIN
drug_to_cid x USING (ChallengeName)
    LEFT JOIN
cid_to_partner target ON (x.cid = target.cid and target.partner_type = 'Target')
    LEFT JOIN
cid_to_partner enzyme ON (x.cid = enzyme.cid and enzyme.partner_type = 'Enzyme')
    LEFT JOIN
cid_to_partner transporter ON (x.cid = transporter.cid and transporter.partner_type = 'Transporter')
    LEFT JOIN
cid_to_atc atc ON (x.cid = atc.cid)
GROUP BY main.`ChallengeName`, main.`Target(Official Symbol)`, main.`SMILES or PubChem ID` , 
         main.CID ,  main.`InChI`, main.`InChIKey`
"""
# `SET SESSION` only affects the connection it is executed on. Run it and the
# query on one explicitly checked-out connection; going through the engine
# twice only works if the pool happens to hand out the same connection.
# The raised limit keeps long GROUP_CONCAT results from being truncated.
with engine.connect() as connection:
    connection.execute('SET SESSION group_concat_max_len = 10000000;')
    drug_info_release_3 = pd.read_sql_query(sql_query, connection)

In [14]:
# Load reformatted data from database
# NOTE(review): this cell repeats the preceding cell's query verbatim;
# consider deleting one of the two.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
sql_query = """
SELECT 
    main.*,
    GROUP_CONCAT(DISTINCT x.cid) cids,
    GROUP_CONCAT(DISTINCT target.partner_name) targets,
    GROUP_CONCAT(DISTINCT target.partner_bioentity) targets_bioentities,
    GROUP_CONCAT(DISTINCT enzyme.partner_name) enzymes,
    GROUP_CONCAT(DISTINCT enzyme.partner_bioentity) enzymes_bioentities,
    GROUP_CONCAT(DISTINCT transporter.partner_name) transporters,
    GROUP_CONCAT(DISTINCT transporter.partner_bioentity) transporters_bioentities,
    GROUP_CONCAT(DISTINCT atc) atcs
FROM
drug_info_release_2 main
    LEFT JOIN
drug_to_cid x USING (ChallengeName)
    LEFT JOIN
cid_to_partner target ON (x.cid = target.cid and target.partner_type = 'Target')
    LEFT JOIN
cid_to_partner enzyme ON (x.cid = enzyme.cid and enzyme.partner_type = 'Enzyme')
    LEFT JOIN
cid_to_partner transporter ON (x.cid = transporter.cid and transporter.partner_type = 'Transporter')
    LEFT JOIN
cid_to_atc atc ON (x.cid = atc.cid)
GROUP BY main.`ChallengeName`, main.`Target(Official Symbol)`, main.`SMILES or PubChem ID` , 
         main.CID ,  main.`InChI`, main.`InChIKey`
"""
# `SET SESSION` only affects the connection it is executed on. Run it and the
# query on one explicitly checked-out connection; going through the engine
# twice only works if the pool happens to hand out the same connection.
# The raised limit keeps long GROUP_CONCAT results from being truncated.
with engine.connect() as connection:
    connection.execute('SET SESSION group_concat_max_len = 10000000;')
    drug_info_release_3 = pd.read_sql_query(sql_query, connection)

In [15]:
drug_info_release_3.head()

Unnamed: 0,ChallengeName,Target(Official Symbol),HBA,cLogP,HBD,Lipinski,SMILES or PubChem ID,MW,AtomStereoCount,BondStereoCount,CID,CanonicalSMILES,Charge,Complexity,ConformerCount3D,ConformerModelRMSD3D,CovalentUnitCount,DefinedAtomStereoCount,DefinedBondStereoCount,EffectiveRotorCount3D,ExactMass,FeatureAcceptorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureCount3D,FeatureDonorCount3D,FeatureHydrophobeCount3D,FeatureRingCount3D,Fingerprint2D,HBondAcceptorCount,HBondDonorCount,HeavyAtomCount,IUPACName,InChI,InChIKey,IsomericSMILES,IsotopeAtomCount,MolecularFormula,MolecularWeight,MonoisotopicMass,RotatableBondCount,TPSA,UndefinedAtomStereoCount,UndefinedBondStereoCount,Volume3D,XLogP,XStericQuadrupole3D,YStericQuadrupole3D,ZStericQuadrupole3D,cids,targets,targets_bioentities,enzymes,enzymes_bioentities,transporters,transporters_bioentities,atcs
0,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9,1.0,0.0,25227436.0,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,0.0,580.0,256.0,1.0,1.0,1.0,0.0,8.2,428.172752,2.0,0.0,4.0,14.0,4.0,0.0,4.0,AAADceB7sAAEAAAAAAAAAAAAAAAAAWAAAAA8WIAAAAAAAF...,6.0,4.0,30.0,4-amino-N-[(1S)-1-(4-chlorophenyl)-3-hydroxypr...,InChI=1S/C21H25ClN6O2/c22-15-3-1-14(2-4-15)17(...,JDUBGYFRJFOXQC-KRWDZBQOSA-N,C1CN(CCC1(C(=O)N[C@@H](CCO)C2=CC=C(C=C2)Cl)N)C...,0.0,C21H25ClN6O2,428.9152,428.172752,6.0,120.0,0.0,0.0,319.3,1.7,14.27,5.17,1.08,252274364260226057750340,,,,,,,
2,AKT_1,AKT*,6.0,3.24,3.0,0.0,c1ccc(cc1)c2cc3c(ccn4c3n[nH]c4=O)nc2c5ccc(cc5)...,407.5,0.0,0.0,24964624.0,C1CC(C1)(C2=CC=C(C=C2)C3=C(C=C4C(=N3)C=CN5C4=N...,0.0,760.0,4.0,0.8,1.0,0.0,0.0,3.8,407.17461,2.0,0.0,1.0,11.0,2.0,0.0,6.0,AAADceB7oAAAAAAAAAAAAAAAAABgAQAAAAA8YIEAAAAAAE...,4.0,2.0,31.0,8-[4-(1-aminocyclobutyl)phenyl]-9-phenyl-2H-[1...,InChI=1S/C25H21N5O/c26-25(12-4-13-25)18-9-7-17...,ULDXWLCXEDXJGE-UHFFFAOYSA-N,C1CC(C1)(C2=CC=C(C=C2)C3=C(C=C4C(=N3)C=CN5C4=N...,0.0,C25H21N5O,407.46714,407.17461,3.0,83.6,0.0,0.0,317.2,3.0,16.28,4.11,1.07,24964624,,,,,,,
3,AKT_PIK3C,"AKT*,PIK3C*",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AKT_SGK,"AKT*,SGK*",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
# Save reformatted data to the database
# Disabled by default (`if False`): flip the guard manually to (re)write the table.
if False:
    engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
    # `if_exists='replace'` recreates the table, so the index is created again below.
    drug_info_release_3.to_sql('drug_info_release_3', engine, if_exists='replace', index=False)
    # Unique index on the first 255 characters of ChallengeName.
    engine.execute('create unique index challenge_name_idx on drug_info_release_3 (ChallengeName(255))')

## drugs_to_targets

### Using provided targets

In [18]:
# Expand the comma-separated target annotation into one (drug, target) row
# per listed target, with surrounding whitespace removed from each target.
drugs_to_targets_list = []
drug_target_fields = drug_info_release_3[['ChallengeName', 'Target(Official Symbol)']].values
for challenge_name, target_field in drug_target_fields:
    for raw_target in target_field.split(','):
        drugs_to_targets_list.append((challenge_name, raw_target.strip()))
drugs_to_targets = pd.DataFrame(drugs_to_targets_list, columns=['drug', 'target_original'])

In [19]:
# Manual renames applied to the provided target names; names that are not
# listed here are kept unchanged. A trailing '*' acts as a wildcard when the
# targets are later matched with SQL LIKE.
target_corrections = {
    'BRAF_mut': 'BRAF',
    'BRAF_V600E': 'BRAF',
    'CD19 antibody': 'CD19',
    'VEGFR2': 'KDR',
    'NIAP': 'NAIP',
    'TNFA': 'TNF',
    'NAE2': 'UBA3',
    'TIE2': 'TEK',
    'cMET': 'MET',
    'Gamma secretase': 'APH*',
    'Proteasome': 'PSM*',
}
original_targets = drugs_to_targets['target_original']
drugs_to_targets['target'] = original_targets.map(
    lambda name: target_corrections.get(name, name)
)

In [20]:
# Rows whose target name was changed by a correction AND whose corrected
# (drug, target) pair occurs more than once in the table.
was_corrected = drugs_to_targets['target_original'].ne(drugs_to_targets['target'])
is_repeated_pair = drugs_to_targets.duplicated(subset=['drug', 'target'], keep=False)
duplicates = drugs_to_targets[was_corrected & is_repeated_pair]
duplicates

Unnamed: 0,drug,target_original,target
35,BRAF_VEGFR2,VEGFR2,KDR
118,NAE,NAE2,UBA3
154,TKI_2,VEGFR2,KDR
155,TKI_2,TIE2,TEK


In [21]:
# Remove the corrected rows collected in `duplicates`, by index label.
# NOTE(review): not idempotent; running this cell a second time raises
# KeyError because the labels have already been dropped.
drugs_to_targets = drugs_to_targets.drop(duplicates.index)

In [22]:
# Get Ensembl IDs for the provided targets
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
# The LIKE pattern is passed as a bound parameter (`%s`) instead of being
# formatted into the SQL text, so a target name containing a quote cannot
# break or alter the statement.
sql_query_template = """
select *
from ensembl_sequence_38.hgnc2ensembl
where hgnc_name like %s
"""
results = []
for target in drugs_to_targets['target'].drop_duplicates():
    # '*' in a target name is a wildcard and becomes SQL LIKE '%'. As a bound
    # parameter the value needs a single '%', not the escaped '%%'.
    target_like = target.replace('*', '%').strip()
    mapped_targets = pd.read_sql_query(sql_query_template, engine, params=(target_like,))
    # Remember which (possibly wildcard) target name produced these rows.
    mapped_targets['target'] = target
    results.append(mapped_targets)
targets_mapped = pd.concat(results, ignore_index=True)

In [23]:
# Attach the gene/transcript/protein rows found for each target name to the
# drug -> target table; drugs whose target matched nothing are not included.
drugs_to_targets_mapped = pd.merge(drugs_to_targets, targets_mapped, on=['target'])

In [24]:
# Preview the mapped table, then report how many drugs received at least one
# mapped target compared with the number of drugs in the input table.
display(drugs_to_targets_mapped.head())
dat.print2('Number of drug-target pairs:', len(drugs_to_targets_mapped))
dat.print2('Total number of drugs:', len(drugs_to_targets['drug'].drop_duplicates()))
dat.print2('Number of drugs with mapped targets:', len(drugs_to_targets_mapped['drug'].drop_duplicates()))

Unnamed: 0,drug,target_original,target,hgnc_name,hgnc_acc,hgnc_description,ensg,enst,ensp
0,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000310823,ENSP00000309968
1,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000497134,ENSP00000418728
2,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000618923,ENSP00000480552
3,AKT,AKT1,AKT1,AKT1,HGNC:391,v-akt murine thymoma viral oncogene homolog 1,ENSG00000142208,ENST00000554581,ENSP00000451828
4,AKT,AKT1,AKT1,AKT1,HGNC:391,v-akt murine thymoma viral oncogene homolog 1,ENSG00000142208,ENST00000407796,ENSP00000384293


Number of drug-target pairs:                                2635
Total number of drugs:                                      119
Number of drugs with mapped targets:                        111


### Using STITCH to find missing targets

In [25]:
# Drugs present in the provided annotation that ended up without any mapped target.
annotated_drugs = set(drugs_to_targets['drug'])
missing_drugs = annotated_drugs.difference(drugs_to_targets_mapped['drug'])
missing_drugs

{'Azacytidine',
 'CarboTaxol',
 'Carboplatin',
 'Chloroquine',
 'Cisplatin',
 'FOLFIRI',
 'FOLFOX',
 'Oxaliplatin'}

In [26]:
# Target names from the provided annotation that matched no HGNC entry.
annotated_targets = set(drugs_to_targets['target'])
missing_targets = annotated_targets.difference(drugs_to_targets_mapped['target'])
missing_targets

{'CHK1', 'DNA', 'Methylation', 'Thiamine', 'microtubule'}

In [27]:
drugs_to_targets[drugs_to_targets['drug'].isin(missing_drugs)]

Unnamed: 0,drug,target_original,target
20,Azacytidine,Methylation,Methylation
37,Carboplatin,DNA,DNA
38,CarboTaxol,DNA,DNA
43,Chloroquine,Thiamine,Thiamine
44,Cisplatin,DNA,DNA
71,FOLFIRI,DNA,DNA
72,FOLFOX,DNA,DNA
120,Oxaliplatin,DNA,DNA


In [None]:
# Hand-curated, comma-separated gene symbols for the drugs that have no
# mapped target. An empty string marks a drug that has not been curated yet.
# The opening brace must sit on the assignment line and every key needs a
# value; otherwise the cell fails with a SyntaxError.
drug_corrections = {
    'Azacytidine': 'DNMT1,CDKN2B,MGMT,GADD45A,DNMT3B,AZI1',
    # from pubchem (not happy with these)
    'Carboplatin': 'XDH,MPO,GSTT1,MT1A,MT2A,SOD1,GSTP1,GSTM1,GSTM1,NQO1,SLC31A1,SLC31A2,ATP7A,ATP7B,ABCG2',
    # CarboTaxol is made up of paclitaxel and carboplatin, microtubule
    'CarboTaxol': 'FOXM1,KIF20A,',
    # paclitaxel: 'BCL2,TUBB1,NR1I2,MAP4,MAP2,MAPT
    'Chloroquine': 'GSTA2,TNF,TLR9,GST',

    'Cisplatin': '',
    'FOLFIRI': '',
    'FOLFOX': '',
    'Oxaliplatin': '',
}

In [40]:
drugs_to_targets[drugs_to_targets['drug'].isin(['paclitaxel'])]

Unnamed: 0,drug,target_original,target


In [28]:
# Full annotation rows for the drugs that still have no mapped target.
is_missing_drug = drug_info_release_3['ChallengeName'].isin(missing_drugs)
df = drug_info_release_3[is_missing_drug].copy()
# Normalise CID to an integer string (e.g. 9444.0 -> '9444') so it can be
# matched against string identifiers later.
df['CID'] = df['CID'].map(lambda cid: str(int(float(cid))))
display(df)

Unnamed: 0,ChallengeName,Target(Official Symbol),HBA,cLogP,HBD,Lipinski,SMILES or PubChem ID,MW,AtomStereoCount,BondStereoCount,CID,CanonicalSMILES,Charge,Complexity,ConformerCount3D,ConformerModelRMSD3D,CovalentUnitCount,DefinedAtomStereoCount,DefinedBondStereoCount,EffectiveRotorCount3D,ExactMass,FeatureAcceptorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureCount3D,FeatureDonorCount3D,FeatureHydrophobeCount3D,FeatureRingCount3D,Fingerprint2D,HBondAcceptorCount,HBondDonorCount,HeavyAtomCount,IUPACName,InChI,InChIKey,IsomericSMILES,IsotopeAtomCount,MolecularFormula,MolecularWeight,MonoisotopicMass,RotatableBondCount,TPSA,UndefinedAtomStereoCount,UndefinedBondStereoCount,Volume3D,XLogP,XStericQuadrupole3D,YStericQuadrupole3D,ZStericQuadrupole3D,cids,targets,targets_bioentities,enzymes,enzymes_bioentities,transporters,transporters_bioentities,atcs
15,Azacytidine,Methylation,,,,,9444,,4,0,9444,C1=NC(=NC(=O)N1C2C(C(C(O2)CO)O)O)N,0,384,14,0.6,1,4,0,3.0,244.08077,5.0,0.0,0.0,12.0,5.0,0.0,2.0,AAADccBzuAAAAAAAAAAAAAAAAAAAASAAAAAgAAAAAAAAAA...,5,4,17,"4-amino-1-[(2R,3R,4S,5R)-3,4-dihydroxy-5-(hydr...",InChI=1S/C8H12N4O5/c9-7-10-2-12(8(16)11-7)6-5(...,NMUSYJAQQFHJEW-KVTDHHQDSA-N,C1=NC(=NC(=O)N1[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0,C8H12N4O5,244.20468,244.08077,2,141.0,0,0,168.2,-2.2,6.14,2.47,0.84,"1805,9444,47751,460485,460518,1265902,6420049,...","DNA,DNA a class=""pubchem-internal-link CID-597...","BE0004796,BE0000892,BE0004810",Cytidine deaminase,BE0002443,,,L01BC07
27,Carboplatin,DNA,6.0,-2.34,4.0,0.0,C1CC2(C1)C(=O)O[Pt](OC2=O)(N)N,369.2,0,0,426756,C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2],0,177,0,,4,0,0,,371.044498,,,,,,,,AAADccBjOAAAAAAAAAAABAAAAABgAAAAAAAkAAAAAAAAAA...,6,4,13,"azanide;cyclobutane-1,1-dicarboxylic acid;plat...",InChI=1S/C6H8O4.2H2N.Pt/c7-4(8)6(5(9)10)2-1-3-...,VSRXQHXAPYXROS-UHFFFAOYSA-N,C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2],0,C6H12N2O4Pt,371.25448,371.044498,0,76.6,0,0,,,,,,42675646780173,DNA,BE0004796,"Glutathione S-transferase Mu 1,Glutathione S-t...","BE0000807,BE0000814,BE0000818,BE0004778,BE0004...","ATP-binding cassette sub-family G member 2,Can...","BE0001067,BE0001069,BE0003666,BE0003664,BE0003...",L01XA02
28,CarboTaxol,DNA,,,,,441276,,11,0,441276,CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)...,0,1790,0,,1,10,0,,853.330955,,,,,,,,AAADcfB+PAAAAAAAAAAAAAAAAABIAAAAAAAwYMGCAAAaIA...,14,4,62,,InChI=1S/C47H51NO14/c1-25-31(60-43(56)36(52)35...,RCINICONZNJXQF-VAZQATRQSA-N,CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@](C3[...,0,C47H51NO14,853.90614,853.330955,14,221.0,1,0,,2.5,,,,"4666,36314,60915,177409,184492,441276,5317123,...","Apoptosis regulator Bcl-2,Microtubule-associat...","BE0000246,BE0001100,BE0003475,BE0003474,BE0000...","Cytochrome P450 19A1,Cytochrome P450 1B1,Cytoc...","BE0002090,BE0001111,BE0002887,BE0002793,BE0002...","ATP-binding cassette sub-family G member 2,Bil...","BE0001067,BE0000703,BE0001069,BE0001032,BE0000...",L01CD01
32,Chloroquine,Thiamine,,,,,2719,,1,0,2719,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl,0,309,324,0.8,1,0,0,8.0,319.181526,1.0,0.0,2.0,6.0,1.0,0.0,2.0,AAADceB7AAAEAAAAAAAAAAAAAAAAAAAAAAA8QAAAAAAAAA...,3,1,22,"4-N-(7-chloroquinolin-4-yl)-1-N,1-N-diethylpen...",InChI=1S/C18H26ClN3/c1-4-22(5-2)12-6-7-14(3)21...,WHTVZRBIWZFKQO-UHFFFAOYSA-N,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl,0,C18H26ClN3,319.87214,319.181526,8,28.2,1,0,255.7,4.6,10.27,3.1,1.88,"2719,444810,639540,10087451,49849611,49849612,...","Fe-protoporphyrin IX,Glutathione S-transferase...","BE0004805,BE0002659,BE0002189,BE0001171,BE0000704","Cytochrome P450 1A1,Cytochrome P450 2C8,Cytoch...","BE0003543,BE0002887,BE0002363,BE0002638,BE0002362",Multidrug resistance protein 1,BE0001032,P01BA01
33,Cisplatin,DNA,,,,,2767,,0,0,2767,[NH2-].[NH2-].Cl[Pt+2]Cl,0,7,0,,3,0,0,,296.939945,,,,,,,,AAADcYADAAAGAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAA...,2,2,5,azanide;dichloroplatinum(2+),InChI=1S/2ClH.2H2N.Pt/h2*1H;2*1H2;/q;;2*-1;+4/p-2,DQLATGHUWYMOKM-UHFFFAOYSA-L,[NH2-].[NH2-].Cl[Pt+2]Cl,0,Cl2H4N2Pt,298.03516,296.939945,0,2.0,0,0,,,,,,2767450696,DNA,BE0004796,"Arylamine N-acetyltransferase,Cholinesterase,C...","BE0004429,BE0002180,BE0003549,BE0002793,BE0000...","ATP-binding cassette sub-family G member 2,Can...","BE0001067,BE0001069,BE0001061,BE0003666,BE0003...",
50,FOLFIRI,DNA,,,,,56842117,,3,0,56842117,CCC1=C2C=C(C=CC2=NC3=C1CN4C3=CC5=C(C4=O)COC(=O...,0,2310,0,,3,2,0,,1189.462887,,,,,,,,AAADcfB//QAAAAAAAAAAAAAAAAAAAWAAAAA8eLECAAAAAF...,20,10,86,,InChI=1S/C33H38N4O6.C20H23N7O7.C4H3FN2O2/c1-3-...,JYEFSHLLTQIXIO-SMNQTINBSA-N,CCC1=C2C=C(C=CC2=NC3=C1CN4C3=CC5=C(C4=O)COC(=O...,0,C57H64FN13O15,1190.194563,1189.462887,14,386.0,1,0,,,,,,56842117,,,,,,,
51,FOLFOX,DNA,,,,,56842121,,4,0,56842121,[CH2-]C1CCCCC1[CH2-].C1C(N(C2=C(N1)NC(=NC2=O)N...,0,1290,0,,5,3,0,,998.253402,,,,,,,,AAADcfB//QAAAAAAAAAABAAAAAAAASJAAAAwYIECAAAAAE...,18,11,58,"(2S)-2-[[4-[(2-amino-5-formyl-4-oxo-1,6,7,8-te...",InChI=1S/C20H23N7O7.C8H14.C4H3FN2O2.C2H2O4.Pt/...,YXTKHLHCVFUPPT-YYFJYKOTSA-N,[CH2-][C@@H]1CCCC[C@H]1[CH2-].C1C(N(C2=C(N1)NC...,0,C34H42FN9O13Pt,998.832183,998.253402,9,348.0,1,0,,,,,,56842121,,,,,,,
79,Oxaliplatin,DNA,2.0,-0.62,4.0,0.0,C1CC[C@H]([C@@H](C1)N)N,114.2,2,0,43806,C1CCC(C(C1)N)N,0,62,2,0.4,1,2,0,1.2,114.115698,0.0,0.0,2.0,5.0,2.0,0.0,1.0,AAADccBjAAAAAAAAAAAAAAAAAAAAAAAAAAAwAAAAAAAAAA...,2,2,8,"(1R,2R)-cyclohexane-1,2-diamine","InChI=1S/C6H14N2/c7-5-3-1-2-4-6(5)8/h5-6H,1-4,...",SSJXIUAHEKJCMH-PHDIDXHHSA-N,C1CC[C@H]([C@@H](C1)N)N,0,C6H14N2,114.18876,114.115698,0,52.0,0,0,93.4,-0.3,1.8,1.74,1.05,"4610,43806,342917,374014,479307,10313027,10750...",,,,,,,


In [29]:
# Query STITCH for human (taxon 9606) protein links of the chemicals whose
# PubChem ('PC') source id matches one of the CIDs in `df`.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/stitch_4_0')
# int() guarantees that only digits are interpolated into the IN (...) list.
pubchem_ids = [str(int(cid)) for cid in df['CID']]
quoted_id_list = "', '".join(pubchem_ids)
sql_query = """
select *
from chemical_sources cs
join protein_chemical_links pcl on (pcl.chemical = cs.flat_chemical)
where protein like '9606.%%'
and source_name = 'PC'
and source_id in ('{}')
""".format(quoted_id_list)
df2 = pd.read_sql_query(sql_query, engine)

In [30]:
# Split '<taxid>.<protein id>' (e.g. '9606.ENSP00000000412') at the first dot.
# The vectorised split also works when `df2` has no rows and when an id
# contains further dots; the `zip(*...)` unpacking it replaces raised
# ValueError in both situations.
protein_parts = df2['protein'].str.split('.', n=1)
df2['taxid'] = protein_parts.str[0]
df2['protein_id'] = protein_parts.str[1]

In [31]:
display(df2.head())
display(df2.dtypes)

Unnamed: 0,flat_chemical,stereo_chemical,source_name,source_id,chemical,protein,experimental,databaseID,textmining,combined_score,taxid,protein_id
0,CID100002719,CID000002719,PC,2719,CID100002719,9606.ENSP00000000412,0,0,0,200,9606,ENSP00000000412
1,CID100002719,CID000002719,PC,2719,CID100002719,9606.ENSP00000011653,0,0,0,817,9606,ENSP00000011653
2,CID100002719,CID000002719,PC,2719,CID100002719,9606.ENSP00000037243,0,0,0,175,9606,ENSP00000037243
3,CID100002719,CID000002719,PC,2719,CID100002719,9606.ENSP00000072516,0,0,0,252,9606,ENSP00000072516
4,CID100002719,CID000002719,PC,2719,CID100002719,9606.ENSP00000161559,0,0,0,292,9606,ENSP00000161559


flat_chemical      object
stereo_chemical    object
source_name        object
source_id          object
chemical           object
protein            object
experimental        int64
databaseID          int64
textmining          int64
combined_score      int64
taxid              object
protein_id         object
dtype: object

In [32]:
# Look up the HGNC rows for every distinct Ensembl protein id returned by STITCH.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/ensembl_sequence_38')
unique_protein_ids = df2['protein_id'].drop_duplicates()
quoted_protein_list = "', '".join(unique_protein_ids)
sql_query = """
select *
from hgnc2ensembl
where ensp in ('{}')
""".format(quoted_protein_list)
targets_mapped_2 = pd.read_sql_query(sql_query, engine)

In [33]:
targets_mapped_2.head()

Unnamed: 0,hgnc_name,hgnc_acc,hgnc_description,ensg,enst,ensp
0,M6PR,HGNC:6752,mannose-6-phosphate receptor (cation dependent),ENSG00000003056,ENST00000000412,ENSP00000000412
1,CFTR,HGNC:1884,cystic fibrosis transmembrane conductance regu...,ENSG00000001626,ENST00000003084,ENSP00000003084
2,SKAP2,HGNC:15687,src kinase associated phosphoprotein 2,ENSG00000005020,ENST00000345317,ENSP00000005587
3,CX3CL1,HGNC:10647,chemokine (C-X3-C motif) ligand 1,ENSG00000006210,ENST00000006053,ENSP00000006053
4,CRY1,HGNC:2384,cryptochrome circadian clock 1,ENSG00000008405,ENST00000008527,ENSP00000008527


In [34]:
# Build the drug -> target table for the STITCH-derived targets, using the
# same column layout as `drugs_to_targets_mapped`.
column_renames = {
    'ChallengeName': 'drug', 
    'Target(Official Symbol)': 'target_original', 
}
columns = [
    'drug', 'target_original', 'target', 
    'hgnc_name', 'hgnc_acc', 'hgnc_description',
    'ensg', 'enst', 'ensp',
]
# Step 1: drug rows joined to their STITCH chemical/protein links via the CID.
drugs_with_links = pd.merge(
    df.rename(columns=column_renames), df2,
    left_on=['CID'], right_on=['source_id'],
)
# Step 2: each linked protein joined to its HGNC/Ensembl record.
drugs_with_genes = pd.merge(
    drugs_with_links, targets_mapped_2,
    left_on=['protein_id'], right_on=['ensp'],
)
# No name correction applies to these drugs: `target` mirrors `target_original`.
drugs_to_targets_mapped_2 = (
    drugs_with_genes
    .assign(target=drugs_with_genes['target_original'])
    [columns]
)

In [35]:
drugs_to_targets_mapped_2.head()

Unnamed: 0,drug,target_original,target,hgnc_name,hgnc_acc,hgnc_description,ensg,enst,ensp
0,Azacytidine,Methylation,Methylation,CD4,HGNC:1678,CD4 molecule,ENSG00000010610,ENST00000011653,ENSP00000011653
1,CarboTaxol,DNA,DNA,CD4,HGNC:1678,CD4 molecule,ENSG00000010610,ENST00000011653,ENSP00000011653
2,Chloroquine,Thiamine,Thiamine,CD4,HGNC:1678,CD4 molecule,ENSG00000010610,ENST00000011653,ENSP00000011653
3,Cisplatin,DNA,DNA,CD4,HGNC:1678,CD4 molecule,ENSG00000010610,ENST00000011653,ENSP00000011653
4,Azacytidine,Methylation,Methylation,NGFR,HGNC:7809,nerve growth factor receptor,ENSG00000064300,ENST00000172229,ENSP00000172229


In [37]:
# Print each drug followed by the comma-joined gene names mapped to it.
for drug, gene_names in drugs_to_targets_mapped_2.groupby('drug')['hgnc_name']:
    print(drug)
    print(','.join(gene_names))

Azacytidine
CD4,NGFR,TSPAN32,ESR1,CHGA,NXT2,GUCY2F,TSC2,RIPK2,SLC5A5,ACTA2,MYH1,IL2,MYF5,GAPDH,ENO2,NANOG,LDHB,AICDA,IL4,RARS,CDX1,MLH1,MSH2,ODC1,MSH6,ADAT2,HELLS,PANK3,TNFSF10,FLT3,MYOG,AHR,PLCG1,CDKN1A,NRN1,MYH2,MYOD1,RRM2B,H3F3B,MYH4,CCNA1,NTS,KRAS,CCNB1,HRK,LMO2,ACSBG1,BLK,FAM8A1,TLR2,MRPL15,CENPO,METTL5,TRMT5,LYZ,RASAL1,CDH1,TNFRSF8,CTCF,DNMT3A,PPAP2A,STC2,CASP6,SLC26A4,PMS2,TIMP3,SLC5A4,HDC,PML,CDH13,C17orf64,TP53,DNMT3L,PELO,RSPO2,UHRF2,CDKN2B,DUSP6,PTPRO,TJP1,SMARCA5,CHD1,LY96,DCK,SLC28A1,HBE1,BAX,AZIN2,TGFA,GABRB1,FSTL1,S100P,HPGD,ITGA2,NOS3,COL14A1,TSHR,HPRT1,CLDN10,RUNX1,C3AR1,MGMT,TRIM8,ITGB2,TRH,HLTF,TERT,HRAS,CD34,HSPA6,TRIB1,DMAP1,TMC6,HIC1,TFE3,DKK3,CCNG2,AGGF1,CIITA,PRTFDC1,EZH2,AZIN1,NHLH2,SOX11,OXTR,NOS2,NKX2-5,DNMT3B,CLDN6,TSPO,BCL2,TPO,FBLN1,CALCA,RARB,HIST2H3D,MX2,HBB,GATA4,WDR20,KIR2DL1,CENPA,HBG2,PLCB1,BCL11A,FOXO3,PAX4,LITAF,CD59,DLK1,CYP1A2,KIR2DL3,H1F0,BTG3,RPL27A,UCKL1,NKX2-1,HIST1H4J,MYH7,MEGF6,MAGEA1,RASSF1,SLC29A2,BRCA1,HIST4H4,DAPK1,CASP8,PLCG2,TECPR2,DN

### Combine provided and STITCH targets

In [113]:
drugs_to_targets_mapped_all = (
    pd.concat([
        drugs_to_targets_mapped,
        drugs_to_targets_mapped_2        
    ], ignore_index=True)
)

In [116]:
# Integer form of the Ensembl protein id: drop the 4-character 'ENSP' prefix
# and parse the remaining digits (e.g. 'ENSP00000309968' -> 309968).
ensp_ids = drugs_to_targets_mapped_all['ensp']
drugs_to_targets_mapped_all['ensp_idx'] = ensp_ids.map(
    lambda ensp: int(ensp[len('ENSP'):])
)

In [118]:
drugs_to_targets_mapped_all.head()

Unnamed: 0,drug,target_original,target,hgnc_name,hgnc_acc,hgnc_description,ensg,enst,ensp,ensp_idx
0,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000310823,ENSP00000309968,309968
1,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000497134,ENSP00000418728,418728
2,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000618923,ENSP00000480552,480552
3,AKT,AKT1,AKT1,AKT1,HGNC:391,v-akt murine thymoma viral oncogene homolog 1,ENSG00000142208,ENST00000554581,ENSP00000451828,451828
4,AKT,AKT1,AKT1,AKT1,HGNC:391,v-akt murine thymoma viral oncogene homolog 1,ENSG00000142208,ENST00000407796,ENSP00000384293,384293


In [119]:
# Report the total number of rows, then the number of distinct
# (drug, <column>) pairs for each level of target identifier.
dat.print2('Number of drug -> target tupels:', len(drugs_to_targets_mapped_all))
labelled_target_columns = [
    ('Number of original drug -> target tuples:', 'target_original'),
    ('Number of corrected drug -> target tuples:', 'target'),
    ('Number of HGNC drug -> target tuples:', 'hgnc_name'),
    ('Number of ENSG drug -> target tuples:', 'ensg'),
    ('Number of ENST drug -> target tuples:', 'enst'),
    ('Number of ENSP drug -> target tuples:', 'ensp'),
]
for label, target_column in labelled_target_columns:
    distinct_pairs = drugs_to_targets_mapped_all[['drug', target_column]].drop_duplicates()
    dat.print2(label, len(distinct_pairs))

Number of drug -> target tupels:                            4606
Number of original drug -> target tuples:                   170
Number of corrected drug -> target tuples:                  170
Number of HGNC drug -> target tuples:                       2324
Number of ENSG drug -> target tuples:                       2412
Number of ENST drug -> target tuples:                       4606
Number of ENSP drug -> target tuples:                       4606


In [120]:
# Make sure that we have at least one target for all drugs
drugs_without_targets = (
    set(drug_info_release_3['ChallengeName']) - set(drugs_to_targets_mapped_all['drug'])
)
# Name the offending drugs in the failure message so the gap can be fixed
# without re-deriving the set by hand.
assert not drugs_without_targets, (
    'Drugs without any mapped target: {}'.format(sorted(map(str, drugs_without_targets)))
)

In [122]:
# Save results to the database
# NOTE(review): this cell has no `if False:` guard, so running it always
# replaces the `drug_to_target` table.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
drugs_to_targets_mapped_all.to_sql('drug_to_target', engine, if_exists='replace', index=False)

# Indexes with `drug` as the leading column, one per identifier level.
# Only (drug, ensp) is declared unique.
engine.execute('create index a on drug_to_target (drug(255), hgnc_name(255))')
engine.execute('create index b on drug_to_target (drug(255), ensg(255))')
engine.execute('create index c on drug_to_target (drug(255), enst(255))')
engine.execute('create unique index d on drug_to_target (drug(255), ensp(255))')

# The same column pairs in reverse order (identifier first, then drug).
engine.execute('create index e on drug_to_target (hgnc_name(255), drug(255))')
engine.execute('create index f on drug_to_target (ensg(255), drug(255))')
engine.execute('create index g on drug_to_target (enst(255), drug(255))')
engine.execute('create unique index h on drug_to_target (ensp(255), drug(255))')

# Indexes on the integer protein index, in both column orders.
engine.execute('create index i on drug_to_target (drug(255), ensp_idx)')
engine.execute('create index j on drug_to_target (ensp_idx, drug(255))')

<sqlalchemy.engine.result.ResultProxy at 0x7f7d7254e9b0>

## lincs data

In [7]:
# Start a Selenium-controlled Firefox window used to scrape the LINCS pages below.
# Reference for the Selenium usage:
# http://inventwithpython.com/automate11-web_scraping.pdf#page=29
from selenium import webdriver
browser = webdriver.Firefox()

In [52]:
# Collect every distinct PubChem CID listed for the drugs. Each non-null entry
# of the `cids` column is a comma-separated string of CIDs.
# The result is sorted (as strings) so that positional access such as
# `cids[9]` is reproducible; the iteration order of a set of strings can
# change from one kernel session to the next.
cids = sorted({
    cid
    for cid_list in drug_info_release_3['cids'].dropna().values
    for cid in cid_list.split(',')
})

In [54]:
# Get data from lincs
# HTML element ids that are read (via `find_element_by_id`) from each
# small-molecule summary page; the scraped rows keep this order.
columns = [
    'summarySmallMoleculeName', 
    'pubchemId',
    'lincsId',
    'facilityId',
    'data-bioAvailability',
    'data-lipinsky3',
    'data-lipinsky5',
    'data-leadLikeness',
]

In [53]:
# Scratch check: open the summary page of a single CID and show the text of
# its name element. An empty string means no name was found on the page at
# the time it was read.
browser.get('http://life.ccs.miami.edu/life/summary?mode=SmallMolecule&source=PubChem&input={}'.format(cids[9]))
x = browser.find_element_by_id('summarySmallMoleculeName')
x.text

''

In [None]:
# Visit the summary page of every CID and collect the text of the elements
# listed in `columns`.
# NOTE: a row stops at the first empty element, so rows can be shorter than
# `columns` (everything after the first empty field is missing). CIDs whose
# very first field is empty produce no row at all.
results = []
for cid in cids:
    browser.get('http://life.ccs.miami.edu/life/summary?mode=SmallMolecule&source=PubChem&input={}'.format(cid))
    # Fixed pause before reading the page (presumably to let it finish loading).
    time.sleep(10)
    result = []
    for elem_id in columns:
        x = browser.find_element_by_id(elem_id)
        if not x.text:
            break
        result.append(x.text.strip())
    if result:
        results.append(result)
        # Disabled: would run the page's `downloadCFData()` JavaScript function.
        #browser.execute_script('downloadCFData();')

In [None]:
# Inspect the scraped rows (one list of field texts per CID that had a page).
results

In [25]:
# Cache the scraped rows on disk. The file is written to `output/lincs/`,
# which is where the "Load previously calculated" cell reads it from
# (the original wrote to `output/`, so the load cell could not find it).
os.makedirs('output/lincs', exist_ok=True)
with open('output/lincs/lincs_life_wrx.pickle', 'wb') as ofh:
    pickle.dump(results, ofh)

### Load previously calculated results

In [56]:
# Restore the scraped rows from the on-disk cache instead of re-scraping.
# Only load this pickle if the file was produced by this notebook:
# unpickling untrusted data can execute arbitrary code.
with open('output/lincs/lincs_life_wrx.pickle', 'rb') as ifh:
    results = pickle.load(ifh)

In [57]:
# Transpose the scraped rows into per-field tuples and keep the second field
# (the 'pubchemId' element), i.e. the CIDs for which a page was found.
# `zip` stops at the shortest row, which is fine as long as every row has at
# least two fields.
results_by_field = list(zip(*results))
cids_with_results = results_by_field[1]

In [58]:
# Preview the first few CIDs that have a LINCS entry (they are strings).
cids_with_results[:5]

('56655374', '11327430', '11213558', '2719', '15951529')

In [60]:
# Start a Firefox session whose profile saves CSV / Excel downloads straight
# into `output/lincs` without showing a dialog.
from selenium import webdriver

firefox_preferences = [
    ("browser.download.panel.shown", False),
    ("browser.helperApps.neverAsk.openFile", "text/csv,application/vnd.ms-excel"),
    ("browser.helperApps.neverAsk.saveToDisk", "text/csv,application/vnd.ms-excel"),
    # 2 = save into the directory given by `browser.download.dir`
    ("browser.download.folderList", 2),
    ("browser.download.dir", op.join(os.getcwd(), 'output', 'lincs')),
]
profile = webdriver.FirefoxProfile()
for preference_name, preference_value in firefox_preferences:
    profile.set_preference(preference_name, preference_value)
browser = webdriver.Firefox(profile)

In [61]:
# Re-visit the summary page of every CID that has a LINCS entry and keep the
# text of *every* element in `columns` (empty strings included), so all rows
# have the same length.
summary_url = 'http://life.ccs.miami.edu/life/summary?mode=SmallMolecule&source=PubChem&input={}'
results_full = []
for cid in cids_with_results:
    browser.get(summary_url.format(cid))
    # Fixed pause before reading the page (presumably to let it finish loading).
    time.sleep(20)
    result = [browser.find_element_by_id(elem_id).text.strip() for elem_id in columns]
    # Disabled: trigger the page's `downloadCFData()` JavaScript function and
    # wait for it.
    #browser.execute_script('downloadCFData();')
    #time.sleep(20)
    results_full.append(result)

In [79]:
# Build the LINCS drug-info table from the scraped rows.
# The headers are kept in their own variable instead of rebinding `columns`:
# `columns` holds the HTML element ids used by the scraping cells, and
# overwriting it would make those cells fail when re-run.
# The order matches the order of the element ids in `columns`.
lincs_drug_info_columns = [
    'drug',
    'cid',
    'lincs_id',
    'facility_id',
    'bio_availability',
    'lipinsky3',
    'lipinsky5',
    'lead_likeness',
]
lincs_drug_info = pd.DataFrame(results_full, columns=lincs_drug_info_columns)

In [80]:
# Preview the first rows of the LINCS drug-info table.
lincs_drug_info.head()

Unnamed: 0,drug,cid,lincs_id,facility_id,bio_availability,lipinsky3,lipinsky5,lead_likeness
0,HG-14-10-04,56655374,LSM-6345,,Fail,Fail,Fail,Fail
1,NU7441,11327430,LSM-1061,BRD-K00337317,Pass,Fail,Fail,Fail
2,R406,11213558,LSM-1040,BRD-K20285085,Pass,Fail,Pass,Fail
3,chloroquine,2719,LSM-1901,BRD-A91699651,Pass,Fail,Pass,Pass
4,Enzalutamide,15951529,LSM-6254,BRD-K56851771,Pass,Fail,Pass,Fail


In [62]:
# Read the per-compound CSV files from `output/lincs` and group their rows by
# endpoint. File i is `data.csv` for i == 0 and `data(i).csv` otherwise, in
# the order of `cids_with_results`.
endpoints = [
    'Cell cycle: percent non-arrested',
    'down regulation',
    'Cell cycle: percent G2-M arrested',
    'Cell cycle: percent interphase',
    'Cell cycle: percent mitosis',
    'up regulation'
]
# endpoint name -> list of DataFrames (one per compound that has such rows)
data = {e: [] for e in endpoints}
for i, cid in enumerate(cids_with_results):
    suffix = '({})'.format(i) if i else ''
    filename = 'output/lincs/data{}.csv'.format(suffix)
    try:
        df = pd.read_csv(filename)
    except ValueError:
        # Raised by `read_csv` when the file cannot be parsed; reported as empty.
        print('Empty file for i: {}, cid: {}'.format(i, cid))
        continue
    df['cid'] = cid
    # When 'Value' is entirely missing, the values are taken from the
    # 'Endpoint' column and the endpoint itself is cleared.
    if df['Value'].isnull().all():
        df['Value'] = df['Endpoint']
        df['Endpoint'] = None
    for endpoint, endpoint_frames in data.items():
        df_sub = df[df['Endpoint'] == endpoint]
        if not df_sub.empty:
            endpoint_frames.append(df_sub)

Empty file for i: 0, cid: 56655374
Empty file for i: 8, cid: 11640390
Empty file for i: 12, cid: 43806
Empty file for i: 17, cid: 36462
Empty file for i: 19, cid: 16117018
Empty file for i: 22, cid: 3081361
Empty file for i: 26, cid: 11625818
Empty file for i: 32, cid: 11707110
Empty file for i: 34, cid: 11167602
Empty file for i: 38, cid: 16654980
Empty file for i: 40, cid: 46861588
Empty file for i: 41, cid: 148124
Empty file for i: 42, cid: 56649450
Empty file for i: 47, cid: 5311497
Empty file for i: 55, cid: 53398658
Empty file for i: 58, cid: 51039095


In [81]:
# List the endpoints for which data was collected.
print(data.keys())

dict_keys(['Cell cycle: percent G2-M arrested', 'Cell cycle: percent interphase', 'down regulation', 'Cell cycle: percent non-arrested', 'Cell cycle: percent mitosis', 'up regulation'])


In [84]:
# Stack the per-compound frames of the two expression endpoints into one long
# table and switch to snake_case column names.
# `data[...]` holds *lists* of DataFrames and `pd.concat` expects a flat
# sequence of frames, so the two lists are joined before concatenating
# (passing `[list, list]` raises a TypeError).
lincs_drug_expression = (
    pd.concat(data['down regulation'] + data['up regulation'])
    .rename(columns={
            'SmallMolecule': 'drug',
            'Cell': 'cell_line',
            'Time Point (hrs)': 'time_point',
            u'Concentration (\u03bcM)': 'concentration',
            'Endpoint': 'endpoint',
            'Value': 'value',
            'Gene Name': 'gene_name',
        })
    )

In [85]:
# Number of distinct CIDs that have up/down-regulation data.
len(set(lincs_drug_expression['cid']))

41

In [86]:
# Preview the expression table and its column dtypes.
display(lincs_drug_expression.head(), lincs_drug_expression.dtypes)

Unnamed: 0,drug,cell_line,time_point,concentration,endpoint,value,gene_name,cid
1300,NU7441,A549,6,10,down regulation,-1.038,INTS3,11327430
1301,NU7441,HT-29,6,10,down regulation,-1.068,SMARCD2,11327430
1302,NU7441,A549,6,10,down regulation,-1.21,KIAA0494,11327430
1303,NU7441,A549,24,10,down regulation,-1.224,PIGB,11327430
1304,NU7441,A549,6,10,down regulation,-1.239,ARL4C,11327430


drug              object
cell_line         object
time_point         int64
concentration    float64
endpoint          object
value            float64
gene_name         object
cid               object
dtype: object

In [89]:
# Save to database
# Both tables are replaced on every run (if_exists='replace'), so their
# indexes are re-created afterwards. `col(255)` indexes only the first 255
# characters of a column.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')

# Drug-level info, indexed for lookups by drug name and by CID.
lincs_drug_info.to_sql('lincs_drug_info', engine, if_exists='replace', index=False)
engine.execute('create index a on lincs_drug_info (drug(255), cid(255))')
engine.execute('create index b on lincs_drug_info (cid(255), drug(255))')

# Expression data, indexed for lookups by drug, cell line and gene.
lincs_drug_expression.to_sql('lincs_drug_expression', engine, if_exists='replace', index=False)
engine.execute('create index a on lincs_drug_expression (drug(255), cell_line(255), gene_name(255))')
engine.execute('create index b on lincs_drug_expression (cell_line(255), gene_name(255), drug(255))')
engine.execute('create index c on lincs_drug_expression (drug(255), gene_name(255))')
engine.execute('create index d on lincs_drug_expression (gene_name(255), drug(255))')

<sqlalchemy.engine.result.ResultProxy at 0x7f7d74cfa940>

# Drug Pair -> Target Pair

In [7]:
# Load drug info data
# Each call reads one whole table from the database into a DataFrame.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
all_drug_pairs = pd.read_sql('all_drug_pairs', engine)
drug_to_cid = pd.read_sql('drug_to_cid', engine)
drug_to_target = pd.read_sql('drug_to_target', engine)

In [9]:
# Order the drug pairs by compound names so later previews and merges are stable.
all_drug_pairs = all_drug_pairs.sort_values(['COMPOUND_A', 'COMPOUND_B'])

In [104]:
# Preview the drug-pair table.
all_drug_pairs.head(2)

Unnamed: 0,UNIQUE_ID,COMPOUND_A,COMPOUND_B
0,ADAM17.AKT,ADAM17,AKT
1,ADAM17.AKT_1,ADAM17,AKT_1


In [105]:
# Preview the drug -> CID mapping (a drug can map to several CIDs).
drug_to_cid.head(2)

Unnamed: 0,ChallengeName,cid
0,AKT,25227436
1,AKT,57750340


In [107]:
# Check the column types of the drug -> CID mapping.
drug_to_cid.dtypes

ChallengeName    object
cid               int64
dtype: object

In [106]:
# Preview the drug -> target mapping.
drug_to_target.head(2)

Unnamed: 0,drug,target_original,target,hgnc_name,hgnc_acc,hgnc_description,ensg,enst,ensp,ensp_idx
0,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000310823,ENSP00000309968,309968
1,ADAM17,ADAM17,ADAM17,ADAM17,HGNC:195,ADAM metallopeptidase domain 17,ENSG00000151694,ENST00000497134,ENSP00000418728,418728


### drug_pair_to_cid_pair

In [126]:
# Attach every CID of compound A and every CID of compound B to each drug
# pair. Both merges are inner joins, so pairs in which either compound has no
# CID are dropped.
cids_of_compound_a = drug_to_cid[['ChallengeName', 'cid']].rename(
    columns={'ChallengeName': 'COMPOUND_A', 'cid': 'cid_1'})
cids_of_compound_b = drug_to_cid[['ChallengeName', 'cid']].rename(
    columns={'ChallengeName': 'COMPOUND_B', 'cid': 'cid_2'})
drug_pair_to_cid_pair = (
    all_drug_pairs
    .merge(cids_of_compound_a, on=['COMPOUND_A'])
    .merge(cids_of_compound_b, on=['COMPOUND_B'])
)

# Put every pair into canonical order (cid_1 <= cid_2) by swapping the two
# columns on the rows where they are reversed.
idx = (drug_pair_to_cid_pair['cid_1'] > drug_pair_to_cid_pair['cid_2'])
larger_cids = drug_pair_to_cid_pair.loc[idx, 'cid_1'].copy()
drug_pair_to_cid_pair.loc[idx, 'cid_1'] = drug_pair_to_cid_pair.loc[idx, 'cid_2']
drug_pair_to_cid_pair.loc[idx, 'cid_2'] = larger_cids

# Single string key "<cid_1>,<cid_2>" identifying the ordered pair.
cid_1_text = drug_pair_to_cid_pair['cid_1'].astype(int).astype(str)
cid_2_text = drug_pair_to_cid_pair['cid_2'].astype(int).astype(str)
drug_pair_to_cid_pair['cid_pair'] = cid_1_text + ',' + cid_2_text

In [127]:
# Report the row count before and after removing exact duplicate rows.
# The frame is de-duplicated once; the second count reads the already
# de-duplicated frame instead of calling `drop_duplicates()` again.
print('{:,d}'.format(drug_pair_to_cid_pair.shape[0]))
drug_pair_to_cid_pair = drug_pair_to_cid_pair.drop_duplicates()
print('{:,d}'.format(drug_pair_to_cid_pair.shape[0]))

636,149
636,148


In [128]:
# Preview the drug pair -> CID pair table.
drug_pair_to_cid_pair.head(2)

Unnamed: 0,UNIQUE_ID,COMPOUND_A,COMPOUND_B,cid_1,cid_2,cid_pair
0,AKT.AKT_1,AKT,AKT_1,24964624,25227436,2496462425227436
1,AKT.AKT_1,AKT,AKT_1,24964624,57750340,2496462457750340


In [129]:
# Check the column types before writing the table to the database.
drug_pair_to_cid_pair.dtypes

UNIQUE_ID     object
COMPOUND_A    object
COMPOUND_B    object
cid_1          int64
cid_2          int64
cid_pair      object
dtype: object

In [130]:
# Every pair must be in canonical order after the swap above.
assert (drug_pair_to_cid_pair['cid_1'] <= drug_pair_to_cid_pair['cid_2']).all()

In [131]:
# Explicit SQL column types for the table: VARCHAR(255) for the text columns
# and MySQL INTEGER for the two CID columns. The table is replaced if it
# already exists.
dtypes = {
    text_column: sa.VARCHAR(255)
    for text_column in ['UNIQUE_ID', 'COMPOUND_A', 'COMPOUND_B', 'cid_pair']
}
dtypes['cid_1'] = sa.dialects.mysql.INTEGER
dtypes['cid_2'] = sa.dialects.mysql.INTEGER
drug_pair_to_cid_pair.to_sql(
    'drug_pair_to_cid_pair', engine, if_exists='replace', index=False, dtype=dtypes)

In [132]:
# Indexes on the freshly written table. Each column group is indexed in both
# directions: starting from the drug-pair id and starting from the CIDs.
engine.execute('create index a on drug_pair_to_cid_pair (UNIQUE_ID, COMPOUND_A, COMPOUND_B)')
engine.execute('create index b on drug_pair_to_cid_pair (COMPOUND_A, COMPOUND_B, UNIQUE_ID)')
engine.execute('create index c on drug_pair_to_cid_pair (UNIQUE_ID, cid_1, cid_2)')
engine.execute('create index d on drug_pair_to_cid_pair (cid_1, cid_2, UNIQUE_ID)')
engine.execute('create index e on drug_pair_to_cid_pair (UNIQUE_ID, cid_pair)')
engine.execute('create index f on drug_pair_to_cid_pair (cid_pair, UNIQUE_ID)')

<sqlalchemy.engine.result.ResultProxy at 0x7f11dda18908>

### drug_pair_to_ensp_pair

In [59]:
# Attach every target protein (ENSP id) of compound A and of compound B to
# each drug pair. Both merges are inner joins.
ensp_of_compound_a = drug_to_target[['drug', 'ensp']].rename(
    columns={'drug': 'COMPOUND_A', 'ensp': 'ensp_1'})
ensp_of_compound_b = drug_to_target[['drug', 'ensp']].rename(
    columns={'drug': 'COMPOUND_B', 'ensp': 'ensp_2'})
drug_pair_to_ensp_pair = (
    all_drug_pairs
    .merge(ensp_of_compound_a, on=['COMPOUND_A'])
    .merge(ensp_of_compound_b, on=['COMPOUND_B'])
)

# Put every pair into canonical order (ensp_1 <= ensp_2, compared as strings)
# by swapping the two columns on the rows where they are reversed.
idx = (drug_pair_to_ensp_pair['ensp_1'] > drug_pair_to_ensp_pair['ensp_2'])
larger_ensp = drug_pair_to_ensp_pair.loc[idx, 'ensp_1'].copy()
drug_pair_to_ensp_pair.loc[idx, 'ensp_1'] = drug_pair_to_ensp_pair.loc[idx, 'ensp_2']
drug_pair_to_ensp_pair.loc[idx, 'ensp_2'] = larger_ensp

# Single string key "<ensp_1>,<ensp_2>" identifying the ordered pair.
drug_pair_to_ensp_pair['ensp_pair'] = (
    drug_pair_to_ensp_pair['ensp_1'] + ',' + drug_pair_to_ensp_pair['ensp_2']
)

In [75]:
# Report the row count before and after removing exact duplicate rows.
# The frame is de-duplicated once; the second count reads the already
# de-duplicated frame instead of repeating `drop_duplicates()` on ~10M rows.
print('{:,d}'.format(drug_pair_to_ensp_pair.shape[0]))
drug_pair_to_ensp_pair = drug_pair_to_ensp_pair.drop_duplicates()
print('{:,d}'.format(drug_pair_to_ensp_pair.shape[0]))

10,026,848
9,901,172


In [76]:
# Preview the drug pair -> ENSP pair table.
display(drug_pair_to_ensp_pair.head())

Unnamed: 0,UNIQUE_ID,COMPOUND_A,COMPOUND_B,ensp_1,ensp_2,ensp_pair
0,ADAM17.AKT,ADAM17,AKT,ENSP00000309968,ENSP00000451828,"ENSP00000309968,ENSP00000451828"
1,ADAM17.AKT,ADAM17,AKT,ENSP00000309968,ENSP00000384293,"ENSP00000309968,ENSP00000384293"
2,ADAM17.AKT,ADAM17,AKT,ENSP00000270202,ENSP00000309968,"ENSP00000270202,ENSP00000309968"
3,ADAM17.AKT,ADAM17,AKT,ENSP00000309968,ENSP00000385326,"ENSP00000309968,ENSP00000385326"
4,ADAM17.AKT,ADAM17,AKT,ENSP00000309968,ENSP00000450688,"ENSP00000309968,ENSP00000450688"


In [88]:
# Dump the table as a tab-separated file without header or index. The file
# contains only data rows, so its column order has to match the column order
# of the database table it is loaded into.
drug_pair_to_ensp_pair.to_csv('/tmp/drug_pair_to_ensp_pair.tsv', sep='\t', index=False, header=False)

In [89]:
# (Re)create an empty `drug_pair_to_ensp_pair` table whose columns are in the
# same order as the DataFrame columns written to the TSV file.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
# `if exists` keeps this cell runnable on a database where the table has not
# been created yet; a plain `drop table` raises an error in that case.
engine.execute('drop table if exists drug_pair_to_ensp_pair')
engine.execute("""
create table drug_pair_to_ensp_pair (
UNIQUE_ID varchar(255),
COMPOUND_A varchar(255),
COMPOUND_B varchar(255),
ensp_1 varchar(255),
ensp_2 varchar(255),
ensp_pair varchar(255)
);
""")

<sqlalchemy.engine.result.ResultProxy at 0x7f11db116c18>

In [83]:
!mysql -u strokach -h 192.168.6.19 az_dream \
    -e "load data infile '/tmp/drug_pair_to_ensp_pair.tsv' into table drug_pair_to_ensp_idx_pair"

strokach@192.168.6.19's password: 



In [None]:
# Remove the temporary dump once it has been loaded into the database.
!rm "/tmp/drug_pair_to_ensp_pair.tsv"

In [None]:
# Indexes on the loaded table. Each column group is indexed in both
# directions: starting from the drug-pair id and starting from the proteins.
engine.execute('create index a on drug_pair_to_ensp_pair (UNIQUE_ID, COMPOUND_A, COMPOUND_B)')
engine.execute('create index b on drug_pair_to_ensp_pair (COMPOUND_A, COMPOUND_B, UNIQUE_ID)')
engine.execute('create index c on drug_pair_to_ensp_pair (UNIQUE_ID, ensp_1, ensp_2)')
engine.execute('create index d on drug_pair_to_ensp_pair (ensp_1, ensp_2, UNIQUE_ID)')
engine.execute('create index e on drug_pair_to_ensp_pair (UNIQUE_ID, ensp_pair)')
engine.execute('create index f on drug_pair_to_ensp_pair (ensp_pair, UNIQUE_ID)')

### drug_pair_to_ensp_idx_pair

In [61]:
# Same as the ENSP pair table, but using the `ensp_idx` column of
# `drug_to_target` instead of the full ENSP identifier. Both merges are inner
# joins.
ensp_idx_of_compound_a = drug_to_target[['drug', 'ensp_idx']].rename(
    columns={'drug': 'COMPOUND_A', 'ensp_idx': 'ensp_1'})
ensp_idx_of_compound_b = drug_to_target[['drug', 'ensp_idx']].rename(
    columns={'drug': 'COMPOUND_B', 'ensp_idx': 'ensp_2'})
drug_pair_to_ensp_idx_pair = (
    all_drug_pairs
    .merge(ensp_idx_of_compound_a, on=['COMPOUND_A'])
    .merge(ensp_idx_of_compound_b, on=['COMPOUND_B'])
)

# Put every pair into canonical order (ensp_1 <= ensp_2) by swapping the two
# columns on the rows where they are reversed.
idx = (drug_pair_to_ensp_idx_pair['ensp_1'] > drug_pair_to_ensp_idx_pair['ensp_2'])
larger_ensp_idx = drug_pair_to_ensp_idx_pair.loc[idx, 'ensp_1'].copy()
drug_pair_to_ensp_idx_pair.loc[idx, 'ensp_1'] = drug_pair_to_ensp_idx_pair.loc[idx, 'ensp_2']
drug_pair_to_ensp_idx_pair.loc[idx, 'ensp_2'] = larger_ensp_idx

# Single string key "<ensp_1>,<ensp_2>" identifying the ordered pair.
ensp_1_text = drug_pair_to_ensp_idx_pair['ensp_1'].astype(int).astype(str)
ensp_2_text = drug_pair_to_ensp_idx_pair['ensp_2'].astype(int).astype(str)
drug_pair_to_ensp_idx_pair['ensp_idx_pair'] = ensp_1_text + ',' + ensp_2_text

In [71]:
# Report the row count before and after removing exact duplicate rows.
print('{:,d}'.format(len(drug_pair_to_ensp_idx_pair)))
drug_pair_to_ensp_idx_pair = drug_pair_to_ensp_idx_pair.drop_duplicates()
print('{:,d}'.format(len(drug_pair_to_ensp_idx_pair)))

10,026,848
9,901,172


In [72]:
# Preview the drug pair -> ENSP index pair table.
display(drug_pair_to_ensp_idx_pair.head())

Unnamed: 0,UNIQUE_ID,COMPOUND_A,COMPOUND_B,ensp_1,ensp_2,ensp_idx_pair
0,ADAM17.AKT,ADAM17,AKT,309968,451828,309968451828
1,ADAM17.AKT,ADAM17,AKT,309968,384293,309968384293
2,ADAM17.AKT,ADAM17,AKT,270202,309968,270202309968
3,ADAM17.AKT,ADAM17,AKT,309968,385326,309968385326
4,ADAM17.AKT,ADAM17,AKT,309968,450688,309968450688


In [100]:
# Dump the table as a tab-separated file without header or index. The file
# contains only data rows, so its column order has to match the column order
# of the database table it is loaded into.
drug_pair_to_ensp_idx_pair.to_csv('/tmp/drug_pair_to_ensp_idx_pair.tsv', sep='\t', index=False, header=False)

In [99]:
# (Re)create an empty `drug_pair_to_ensp_idx_pair` table whose columns are in
# the same order as the DataFrame columns written to the TSV file.
# NOTE: the last column is called `ensp_pair` here while the DataFrame column
# is `ensp_idx_pair`. The name is kept because the index statements for this
# table refer to `ensp_pair`.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
# `if exists` keeps this cell runnable on a database where the table has not
# been created yet; a plain `drop table` raises an error in that case.
engine.execute('drop table if exists drug_pair_to_ensp_idx_pair')
engine.execute("""
create table drug_pair_to_ensp_idx_pair (
UNIQUE_ID varchar(255),
COMPOUND_A varchar(255),
COMPOUND_B varchar(255),
ensp_1 int,
ensp_2 int,
ensp_pair varchar(255)
);
""")

<sqlalchemy.engine.result.ResultProxy at 0x7f11db32bc18>

In [102]:
# Bulk-load the TSV dump into the `drug_pair_to_ensp_idx_pair` table.
# NOTE(review): `load data infile` reads the file on the database server;
# confirm that /tmp is shared with it (otherwise `load data local infile`).
!mysql -u strokach -h 192.168.6.19 az_dream \
    -e "load data infile '/tmp/drug_pair_to_ensp_idx_pair.tsv' into table drug_pair_to_ensp_idx_pair"

In [None]:
# Remove the temporary dump once it has been loaded into the database.
!rm "/tmp/drug_pair_to_ensp_idx_pair.tsv"

In [None]:
# Indexes on the loaded table. Each column group is indexed in both
# directions: starting from the drug-pair id and starting from the proteins.
# The pair-key column is named `ensp_pair` in this table's definition.
engine.execute('create index a on drug_pair_to_ensp_idx_pair (UNIQUE_ID, COMPOUND_A, COMPOUND_B)')
engine.execute('create index b on drug_pair_to_ensp_idx_pair (COMPOUND_A, COMPOUND_B, UNIQUE_ID)')
engine.execute('create index c on drug_pair_to_ensp_idx_pair (UNIQUE_ID, ensp_1, ensp_2)')
engine.execute('create index d on drug_pair_to_ensp_idx_pair (ensp_1, ensp_2, UNIQUE_ID)')
engine.execute('create index e on drug_pair_to_ensp_idx_pair (UNIQUE_ID, ensp_pair)')
engine.execute('create index f on drug_pair_to_ensp_idx_pair (ensp_pair, UNIQUE_ID)')

### drug_pair_to_gene_pair

In [69]:
# Attach every target gene (HGNC name) of compound A and of compound B to
# each drug pair. Both merges are inner joins.
genes_of_compound_a = drug_to_target[['drug', 'hgnc_name']].rename(
    columns={'drug': 'COMPOUND_A', 'hgnc_name': 'gene_1'})
genes_of_compound_b = drug_to_target[['drug', 'hgnc_name']].rename(
    columns={'drug': 'COMPOUND_B', 'hgnc_name': 'gene_2'})
drug_pair_to_gene_pair = (
    all_drug_pairs
    .merge(genes_of_compound_a, on=['COMPOUND_A'])
    .merge(genes_of_compound_b, on=['COMPOUND_B'])
)

# Put every pair into canonical order (gene_1 <= gene_2, compared as strings)
# by swapping the two columns on the rows where they are reversed.
idx = (drug_pair_to_gene_pair['gene_1'] > drug_pair_to_gene_pair['gene_2'])
larger_genes = drug_pair_to_gene_pair.loc[idx, 'gene_1'].copy()
drug_pair_to_gene_pair.loc[idx, 'gene_1'] = drug_pair_to_gene_pair.loc[idx, 'gene_2']
drug_pair_to_gene_pair.loc[idx, 'gene_2'] = larger_genes

# Single string key "<gene_1>,<gene_2>" identifying the ordered pair.
drug_pair_to_gene_pair['gene_pair'] = (
    drug_pair_to_gene_pair['gene_1'] + ',' + drug_pair_to_gene_pair['gene_2']
)

In [73]:
# Report the row count before and after removing exact duplicate rows.
# Many rows collapse here because several proteins map to the same gene.
print('{:,d}'.format(len(drug_pair_to_gene_pair)))
drug_pair_to_gene_pair = drug_pair_to_gene_pair.drop_duplicates()
print('{:,d}'.format(len(drug_pair_to_gene_pair)))

10,026,848
2,234,288


In [74]:
# Preview the drug pair -> gene pair table.
display(drug_pair_to_gene_pair.head())

Unnamed: 0,UNIQUE_ID,COMPOUND_A,COMPOUND_B,gene_1,gene_2,gene_pair
0,ADAM17.AKT,ADAM17,AKT,ADAM17,AKT1,"ADAM17,AKT1"
12,ADAM17.AKT,ADAM17,AKT,ADAM17,AKT2,"ADAM17,AKT2"
40,ADAM17.AKT,ADAM17,AKT,ADAM17,AKT3,"ADAM17,AKT3"
147,ADAM17.AKT_1,ADAM17,AKT_1,ADAM17,AKT1,"ADAM17,AKT1"
159,ADAM17.AKT_1,ADAM17,AKT_1,ADAM17,AKT1S1,"ADAM17,AKT1S1"


In [97]:
# Dump the table as a tab-separated file without header or index. The file
# contains only data rows, so its column order has to match the column order
# of the database table it is loaded into.
drug_pair_to_gene_pair.to_csv('/tmp/drug_pair_to_gene_pair.tsv', sep='\t', index=False, header=False)

In [98]:
# (Re)create an empty `drug_pair_to_gene_pair` table whose columns are in the
# same order as the DataFrame columns written to the TSV file.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
# `if exists` keeps this cell runnable on a database where the table has not
# been created yet; a plain `drop table` raises an error in that case.
engine.execute('drop table if exists drug_pair_to_gene_pair')
engine.execute("""
create table drug_pair_to_gene_pair (
UNIQUE_ID varchar(255),
COMPOUND_A varchar(255),
COMPOUND_B varchar(255),
gene_1 varchar(255),
gene_2 varchar(255),
gene_pair varchar(255)
);
""")

<sqlalchemy.engine.result.ResultProxy at 0x7f11db10ef98>

In [None]:
# Bulk-load the TSV dump into the `drug_pair_to_gene_pair` table.
# NOTE(review): `load data infile` reads the file on the database server;
# confirm that /tmp is shared with it (otherwise `load data local infile`).
!mysql -u strokach -h 192.168.6.19 az_dream \
    -e "load data infile '/tmp/drug_pair_to_gene_pair.tsv' into table drug_pair_to_gene_pair"

In [None]:
# Remove the temporary dump once it has been loaded into the database.
!rm "/tmp/drug_pair_to_gene_pair.tsv"

In [None]:
# Indexes on the loaded table. Each column group is indexed in both
# directions: starting from the drug-pair id and starting from the genes.
engine.execute('create index a on drug_pair_to_gene_pair (UNIQUE_ID, COMPOUND_A, COMPOUND_B)')
engine.execute('create index b on drug_pair_to_gene_pair (COMPOUND_A, COMPOUND_B, UNIQUE_ID)')
engine.execute('create index c on drug_pair_to_gene_pair (UNIQUE_ID, gene_1, gene_2)')
engine.execute('create index d on drug_pair_to_gene_pair (gene_1, gene_2, UNIQUE_ID)')
engine.execute('create index e on drug_pair_to_gene_pair (UNIQUE_ID, gene_pair)')
engine.execute('create index f on drug_pair_to_gene_pair (gene_pair, UNIQUE_ID)')

# Done!

In [None]:
# Reload the final drug-info table from the database for a summary of how
# many drugs have each feature.
engine = sa.create_engine('mysql://strokach:@192.168.6.19:3306/az_dream')
drug_info_release_3 = pd.read_sql_table('drug_info_release_3', engine)

In [None]:
# For every column of the drug-info table, count the drugs (rows) that have a
# non-null value in it.
data = [
    (column, drug_info_release_3[column].notnull().sum())
    for column in drug_info_release_3.columns
]
df = pd.DataFrame(data, columns=['feature', 'num drugs'])

In [None]:
# Order the features from best to worst covered.
# `DataFrame.sort` no longer exists in pandas; `sort_values` is the
# replacement and is what the rest of this notebook already uses.
df.sort_values('num drugs', ascending=False, inplace=True)

In [None]:
# Show the number of drugs with a value for each feature.
df

In [None]:
# Show the `atcs` values of the drugs that have one.
has_atcs = drug_info_release_3['atcs'].notnull()
drug_info_release_3.loc[has_atcs, 'atcs']