In [5]:
from support_functions import log_progress, clean_drugbank, query_chembl
import pickle
import pandas as pd

# Extracting drug targets from DrugBank

The original DrugBank database is stored in XML format and contains a lot of information that we don't need. Our goal is to clean this data and create a stripped-down dictionary of drug-target pairings that we can use to look up the gene targets of drugs tested in the LINCS library. The keys of this dictionary will be the drugs' InChIKeys and the values will be the targets' HUGO gene symbols.

*In the future we may want to use additional metadata from the DrugBank database, such as whether the drugs are inhibitors or activators.*

First we need to clean the data, as the target names are difficult to extract for several reasons: 

* Some drugs are not small molecules but polymers or other macromolecules we are not interested in
* Some drugs have multiple targets with their names stored in a list structure rather than an ordered dictionary
* Some drugs target protein complexes, composed of multiple distinct proteins with different names
* Some target names are missing

In [2]:
# Parse the downloaded DrugBank XML export into a dictionary of drug -> targets.
# All of the cleaning described above is delegated to clean_drugbank (imported
# from support_functions); this cell only supplies the path to the XML file.
drugbank_db_path = 'data/full_drugbank_database.xml'
drugbank_target_dict = clean_drugbank(drugbank_db_path)

In [None]:
# Save the DrugBank dictionary to a checkpoint file.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of staying open until garbage collection.
db_filename = 'checkpoint_files/drugbank_target_dict.sav'
with open(db_filename, 'wb') as db_file:
    pickle.dump(drugbank_target_dict, db_file)

# Extracting drug targets from ChEMBL

The ChEMBL database does not appear to be downloadable, so we'll have to query the compounds one by one using the REST API:

In [13]:
# Load the list of compounds we'll look up.
# The perturbagen metadata comes as two tab-separated files; stack them into a
# single table with a fresh 0..n-1 index.
pert_info_1 = pd.read_csv('data/GSE92742_Broad_LINCS_pert_info.txt', sep='\t', header=0)
pert_info_2 = pd.read_csv('data/GSE70138_Broad_LINCS_pert_info.txt', sep='\t', header=0)
pert_info = pd.concat([pert_info_1, pert_info_2], ignore_index=True)

# Keep only compound treatments ('trt_cp') and collect their distinct InChIKeys.
# Missing keys are dropped *before* the string cast: otherwise NaN would be
# turned into the literal string 'nan' and queried as if it were an InChIKey.
# NOTE(review): the metadata may also use a placeholder value (e.g. '-666') for
# missing keys -- confirm against the files and filter it here if so.
all_LINCS_inchis = (
    pert_info.loc[pert_info.pert_type == 'trt_cp', 'inchi_key']
    .dropna()
    .unique()
    .astype(str)
)

In [24]:
# State for the ChEMBL lookup: the InChIKeys already queried, and the
# InChIKey -> targets results collected so far. These live in their own cell
# so that re-running the query loop does not reset them.
seen_cpds, chembl_target_dict = set(), {}

In [25]:
# Query ChEMBL once per InChIKey. This step occasionally times out because of
# network connectivity issues, so every key that has been processed is recorded
# in seen_cpds and skipped when the cell is re-run.
for inchi in log_progress(all_LINCS_inchis, every=1):
    if inchi in seen_cpds:
        continue
    human_target_genes = query_chembl(inchi)
    # Only compounds with at least one target get an entry in the dictionary.
    if len(human_target_genes) > 0:
        chembl_target_dict[inchi] = human_target_genes
    # Mark as seen only after the query returned, so a failed query is retried.
    seen_cpds.add(inchi)

In [None]:
# Save the ChEMBL dictionary to a checkpoint file.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of staying open until garbage collection.
chembl_filename = 'checkpoint_files/chembl_target_dict.sav'
with open(chembl_filename, 'wb') as chembl_file:
    pickle.dump(chembl_target_dict, chembl_file)

# Combine DrugBank and ChEMBL targets

In [None]:
# Load targets extracted from DrugBank and ChEMBL.
# NOTE: pickle.load can execute arbitrary code; only load checkpoint files that
# were written by this notebook.
with open('checkpoint_files/drugbank_target_dict.sav', 'rb') as drugbank_file:
    drugbank_target_dict = pickle.load(drugbank_file)
with open('checkpoint_files/chembl_target_dict.sav', 'rb') as chembl_file:
    chembl_target_dict = pickle.load(chembl_file)

# Look up pert_id <-> inchi_key mappings for compound treatments only
cpd_pert_info = pert_info[pert_info.pert_type == 'trt_cp']
cpd_inchis = cpd_pert_info[['pert_id', 'inchi_key']]

# Index the pert_ids by InChIKey once, rather than scanning the whole table for
# every dictionary entry. Rows with a missing inchi_key are left out by groupby,
# which matches the old `inchi_key == inchi` comparison (never true for NaN).
pert_ids_by_inchi = cpd_inchis.groupby('inchi_key')['pert_id'].unique().to_dict()

# Combine the DrugBank and ChEMBL dicts, using pert_id for keys.
# Every pert_id that shares an InChIKey receives the targets (not just the first
# match), and targets from both sources are merged into one set per pert_id.
LINCS_target_dict = {}

for source_target_dict in (drugbank_target_dict, chembl_target_dict):
    for inchi, targets in log_progress(source_target_dict.items()):
        # InChIKeys that were not tested in LINCS have no pert_id and are skipped
        for cpd in pert_ids_by_inchi.get(inchi, ()):
            LINCS_target_dict.setdefault(cpd, set()).update(targets)

# Save the LINCS target dictionary
filename = 'checkpoint_files/LINCS_target_dict.sav'
with open(filename, 'wb') as lincs_file:
    pickle.dump(LINCS_target_dict, lincs_file)