# Data profiling

In [284]:
from support_functions import log_progress, clean_drugbank, query_chembl
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Lookup known targets

In [53]:
# Read both tab-separated pert_info tables (GSE92742 and GSE70138) and stack
# them into a single frame with a fresh 0..n-1 index.
pert_info_1, pert_info_2a = (
    pd.read_csv(path, sep='\t', header=0)
    for path in ('data/GSE92742_Broad_LINCS_pert_info.txt',
                 'data/GSE70138_Broad_LINCS_pert_info.txt')
)
all_pert_info = pd.concat([pert_info_1, pert_info_2a], ignore_index=True)

In [68]:
# lookup cpd inchi keys
# Keep only the compound perturbagens (pert_type == 'trt_cp'). The mask is
# built from all_pert_info itself so that it aligns row-for-row with the frame
# being filtered and the cell runs on a fresh kernel.
all_cpd_pert_info = all_pert_info[all_pert_info.pert_type == 'trt_cp']
all_cpd_inchis = all_cpd_pert_info[['pert_id', 'inchi_key']].set_index('pert_id').copy()
# remove duplicates (the first row of each repeated pert_id is kept), store as
# a Series: pert_id -> inchi_key
all_cpd_inchis = all_cpd_inchis[~all_cpd_inchis.index.duplicated()].inchi_key

### 1a. Lookup Drugbank targets

In [None]:
# lookup and store drugbank targets as dictionary: { inchi: [targets]}
# Path of the DrugBank XML file that is handed to clean_drugbank.
db_db_path = 'data/full_drugbank_database.xml'
# clean_drugbank is imported from support_functions; its result is expected to
# have the { inchi: [targets] } shape noted above — TODO confirm against the helper.
drugbank_target_dict = clean_drugbank(db_db_path)

In [128]:
# save to file
# Checkpoint the DrugBank target dictionary. The context manager guarantees
# the file is flushed and closed even if pickling raises.
filename = 'checkpoint_files/drugbank_target_dict.sav'
with open(filename, 'wb') as checkpoint_file:
    pickle.dump(drugbank_target_dict, checkpoint_file)

### 1b. Lookup Chembl targets

In [119]:
# lookup and store chembl targets as dictionary: { inchi: [targets]}
chembl_target_dict = {}
# inchi keys that have already been queried; the query loop skips these, so it
# can be re-run after a crash without repeating finished lookups
tested_inchis = []

In [122]:
# Ask the ChEMBL API (via query_chembl) for the human targets of every compound
# inchi_key. Keys already recorded in tested_inchis are skipped, and only
# non-empty results are stored in chembl_target_dict.
for inchi_key in log_progress(all_cpd_inchis.values, every=2):
    if inchi_key in tested_inchis:
        continue
    hits = query_chembl(inchi_key)
    if len(hits) > 0:
        chembl_target_dict[inchi_key] = hits
    # mark as done only after the query returned
    tested_inchis.append(inchi_key)

In [127]:
# save to file
# Checkpoint the ChEMBL target dictionary. The context manager guarantees the
# file is flushed and closed even if pickling raises.
filename = 'checkpoint_files/chembl_target_dict.sav'
with open(filename, 'wb') as checkpoint_file:
    pickle.dump(chembl_target_dict, checkpoint_file)

### 1c. Combine targets

In [153]:
# Reload both target dictionaries from their checkpoints.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# checkpoint files that were written by this notebook.
with open('checkpoint_files/drugbank_target_dict.sav', 'rb') as checkpoint_file:
    drugbank_target_dict = pickle.load(checkpoint_file)
with open('checkpoint_files/chembl_target_dict.sav', 'rb') as checkpoint_file:
    chembl_target_dict = pickle.load(checkpoint_file)

# Start from the DrugBank entries, then merge in the ChEMBL entries:
#   - compound present in both sources -> de-duplicated union of both target lists
#   - compound present only in ChEMBL  -> ChEMBL targets as-is
combined_target_dict = drugbank_target_dict.copy()
for cpd, chembl_targets in log_progress(chembl_target_dict.items()):
    if cpd in drugbank_target_dict:
        drugbank_targets = drugbank_target_dict[cpd]
        combined_targets = list(set().union(chembl_targets, drugbank_targets))
        combined_target_dict[cpd] = combined_targets
    else:
        combined_target_dict[cpd] = chembl_targets

## 2. Identify cpd-KD pairs (pair set 1)

The first of several filtering steps, this set is composed of pairs of known cpd-target interactions where both the cpd and the target KD have been tested in at least one LINCS experiment.

In [184]:
# Keep only the shRNA knockdown perturbagens (pert_type == 'trt_sh'). The mask
# is built from all_pert_info itself so that it aligns with the frame being
# filtered and the cell runs on a fresh kernel.
all_kd_pert_info = all_pert_info[all_pert_info.pert_type == 'trt_sh']
# distinct pert_iname values of the knockdown perturbagens
kd_genes = all_kd_pert_info.pert_iname.unique()

In [207]:
# Collect every (compound, target) pair where the compound has known targets
# in combined_target_dict and the target is among the knocked-down genes.
cpd_1_, target_1_ = [], []

# kd_genes is an array; testing membership against it scans every element, so
# build a set once for constant-time lookups inside the loop.
kd_gene_set = set(kd_genes)

# NOTE(review): all_cpd_pert_info is not de-duplicated by pert_id, so a compound
# listed in both pert_info files would yield repeated (cpd, target) rows here —
# confirm whether that is intended.
for index, row in log_progress(all_cpd_pert_info.iterrows(), every=1):
    inchi = row.inchi_key
    if inchi not in combined_target_dict:
        # no known targets for this compound
        continue
    pert_id = row.pert_id
    for target in combined_target_dict[inchi]:
        if target in kd_gene_set:
            cpd_1_.append(pert_id)
            target_1_.append(target)

  


In [208]:
# Assemble pair set 1 and report its size: distinct compounds, distinct
# knockdown targets, and total number of (cpd, target) rows.
pair_set_1_df = pd.DataFrame({'cpd': cpd_1_, 'target': target_1_})
print('Pair set 1 statistics')
for label, count in (
    ('No. unique cpds:\t', len(pair_set_1_df['cpd'].unique())),
    ('No. unique KDs:\t\t', len(pair_set_1_df['target'].unique())),
    ('No. interactions:\t', pair_set_1_df.shape[0]),
):
    print(label, count)

Pair set 1 statistics
No. unique cpds:	 1076
No. unique KDs:		 624
No. interactions:	 4220


## 3. Identify same-cell cpd-KD pairs (pair set 2)

The second of several filtering steps: here we filter out cpd-KD pairs whose cpd and target KD have not both been tested in at least one common cell line.

In [202]:
# load the signature information
all_sig_info = pd.DataFrame.from_csv('checkpoint_files/all_sig_info.csv')
all_cpd_sig_info = all_sig_info[all_sig_info.pert_type == 'trt_cp']
all_kd_sig_info = all_sig_info[all_sig_info.pert_type == 'trt_sh']

  if self.run_code(code, result):


In [217]:
# Keep the pairs of pair set 1 whose compound and target knockdown share at
# least one cell line, and remember the shared cell lines for each kept pair.
cpd_2_, target_2_, common_cells_2_ = [], [], []

# The cell lines per compound / per knockdown gene do not depend on the pair
# being examined, so compute them once instead of re-filtering both signature
# tables for every row of pair_set_1_df.
cells_by_cpd = {
    pert_id: set(sigs.cell_id.unique())
    for pert_id, sigs in all_cpd_sig_info.groupby('pert_id')
}
cells_by_kd = {
    gene: set(sigs.cell_id.unique())
    for gene, sigs in all_kd_sig_info.groupby('pert_iname')
}

for index, row in log_progress(pair_set_1_df.iterrows(), every=1):
    cpd = row.cpd
    target = row.target
    # compare cell lines; a compound or gene without signatures has none
    cpd_cells = cells_by_cpd.get(cpd, set())
    kd_cells = cells_by_kd.get(target, set())
    common_cells = cpd_cells & kd_cells  # new set, safe to store per pair
    if len(common_cells) > 0:
        cpd_2_.append(cpd)
        target_2_.append(target)
        common_cells_2_.append(common_cells)

In [218]:
# Assemble pair set 2 and report its size: distinct compounds, distinct
# knockdown targets, and total number of (cpd, target) rows.
pair_set_2_df = pd.DataFrame({'cpd': cpd_2_, 'target': target_2_})
print('Pair set 2 statistics')
for label, count in (
    ('No. unique cpds:\t', len(pair_set_2_df['cpd'].unique())),
    ('No. unique KDs:\t\t', len(pair_set_2_df['target'].unique())),
    ('No. interactions:\t', pair_set_2_df.shape[0]),
):
    print(label, count)

Pair set 2 statistics
No. unique cpds:	 1069
No. unique KDs:		 607
No. interactions:	 4139


## 3. Identify cpd-KD pairs with interaction partners (pair set 3)

The third of several filtering steps, here we filter out cpd-KD pairs for which we do not know any interaction partners for the target.

In [239]:
# load STRING database checkpoint
# (noted when the checkpoint was built: 355319 high confidence interactions
# for 14893 genes)
# pd.DataFrame.from_csv is deprecated and was removed in pandas 1.0; read_csv
# with index_col=0 and parse_dates=True is its documented equivalent.
string_gene_interactions_700 = pd.read_csv('checkpoint_files/string_gene_interactions_700.csv',
                                           index_col=0, parse_dates=True)
# genes that appear as gene_1, i.e. have at least one listed interaction
string_genes = string_gene_interactions_700.gene_1.unique()

In [246]:
# Keep the pairs of pair set 2 whose target appears in string_genes, carrying
# the shared cell lines of each kept pair forward.
cpd_3_, target_3_, common_cells_3_ = [], [], []

# string_genes is an array; testing membership against it scans every element,
# so build a set once for constant-time lookups inside the loop.
string_gene_set = set(string_genes)

for index, row in log_progress(pair_set_2_df.iterrows(), every=1):
    target = row.target
    if target in string_gene_set:
        cpd = row.cpd
        # pair_set_2_df rows and common_cells_2_ entries share the same positions
        common_cells = common_cells_2_[index]
        cpd_3_.append(cpd)
        target_3_.append(target)
        common_cells_3_.append(common_cells)

In [247]:
# Assemble pair set 3 and report its size: distinct compounds, distinct
# knockdown targets, and total number of (cpd, target) rows.
pair_set_3_df = pd.DataFrame({'cpd': cpd_3_, 'target': target_3_})
print('Pair set 3 statistics')
for label, count in (
    ('No. unique cpds:\t', len(pair_set_3_df['cpd'].unique())),
    ('No. unique KDs:\t\t', len(pair_set_3_df['target'].unique())),
    ('No. interactions:\t', pair_set_3_df.shape[0]),
):
    print(label, count)

Pair set 3 statistics
No. unique cpds:	 1067
No. unique KDs:		 602
No. interactions:	 4118


## 4. Identify cpd-KD pairs with same-cell interaction partner KDs (pair set 4)

The fourth of several filtering steps: here we filter out cpd-KD pairs for which the target's interaction partners were not knocked down in any of the same cell lines as the cpd-target pair.

In [275]:
# Keep the pairs of pair set 3 for which at least one interaction partner of
# the target has a knockdown signature in one of the pair's shared cell lines.
# For every kept pair we store the qualifying cell lines and, per cell line,
# the partners that were knocked down there.
cpd_4_, target_4_, common_cells_4_, common_partners_4_ = [], [], [], []

for idx, pair in log_progress(pair_set_3_df.iterrows(), every=1):
    gene = pair.target
    shared_cells = common_cells_3_[idx]

    # gene_2 entries of every interaction whose gene_1 is the target, and all
    # knockdown signatures of those partner genes
    is_target_row = string_gene_interactions_700.gene_1 == gene
    partner_genes = string_gene_interactions_700[is_target_row].gene_2
    partner_kd_sigs = all_kd_sig_info[all_kd_sig_info.pert_iname.isin(partner_genes)]

    # cell line -> partners knocked down in that cell line (non-empty only)
    partners_by_cell = {}
    for cell in shared_cells:
        sigs_in_cell = partner_kd_sigs[partner_kd_sigs.cell_id == cell]
        partners_in_cell = sigs_in_cell.pert_iname.unique()
        if len(partners_in_cell) > 0:
            partners_by_cell[cell] = partners_in_cell

    # drop the pair when no shared cell line has any partner knockdown
    if len(partners_by_cell) == 0:
        continue

    cpd_4_.append(pair.cpd)
    target_4_.append(gene)
    common_cells_4_.append(set(partners_by_cell.keys()))
    common_partners_4_.append(partners_by_cell)

In [276]:
# Assemble pair set 4 and report its size: distinct compounds, distinct
# knockdown targets, and total number of (cpd, target) rows.
pair_set_4_df = pd.DataFrame({'cpd': cpd_4_, 'target': target_4_})
print('Pair set 4 statistics')
for label, count in (
    ('No. unique cpds:\t', len(pair_set_4_df['cpd'].unique())),
    ('No. unique KDs:\t\t', len(pair_set_4_df['target'].unique())),
    ('No. interactions:\t', pair_set_4_df.shape[0]),
):
    print(label, count)

Pair set 4 statistics
No. unique cpds:	 1065
No. unique KDs:		 590
No. interactions:	 4086


## 5. Signature quality of cpd-KD pair (pair set 5)

Now we'll look at how the size of the training set changes as we place more stringent requirements on the quality of the expression signatures for the main cpd-KD pair. Next we'll enforce the same signature quality requirements on the interaction partners.

In [296]:
# data quality parameters 
# Upper bound applied to the pct_self_rank_q25 column (signatures with a value
# <= this cutoff pass).
# NOTE(review): confirm whether pct_self_rank_q25 is stored as a fraction (0-1)
# or a percentage (0-100); with 0.05 the filter below currently keeps nothing.
pct_self_rank_cutoff = 0.05
# Lower bound applied to the distil_cc_q75 column (signatures with a value
# >= this cutoff pass).
distill_cc_cutoff = 0.2

In [299]:
# Exploratory check of the signature-quality filter: for the FIRST pair of pair
# set 4 only (note the trailing break), print how many of the compound's
# signatures in each shared cell line pass both quality cutoffs.
# The pair set 5 result lists are initialised here but not filled yet.
cpd_5_, target_5_, common_cells_5_, common_partners_5_ = [], [], [], []

for index, row in log_progress(pair_set_4_df.iterrows(), every=1):
    cpd, target = row.cpd, row.target
    common_cells = common_cells_4_[index]

    # check that criteria are met in each cell line
    for cell_line in common_cells:
        in_this_cell = ((all_cpd_sig_info.pert_id == cpd) &
                        (all_cpd_sig_info.cell_id == cell_line))
        cpd_cell_sigs = all_cpd_sig_info[in_this_cell]
        passes_cc = cpd_cell_sigs.distil_cc_q75 >= distill_cc_cutoff
        passes_rank = cpd_cell_sigs.pct_self_rank_q25 <= pct_self_rank_cutoff
        good_cpd_cell_sigs = cpd_cell_sigs[passes_cc & passes_rank]
        print(len(good_cpd_cell_sigs))
    break

0
0
0
0
0
0
0
0
