In [8]:
import pandas as pd
import itertools
from support_functions import log_progress

# Predicting drug-target interactions
___
## Load the data from top-4 cell lines

In [9]:
# Load the checkpointed top-4 cell-line tables.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv(..., index_col=0)` is the supported replacement and, like
# `from_csv`, uses the first CSV column as the index.
# NOTE(review): `from_csv` also attempted to parse the index as dates
# (parse_dates=True); that is dropped here on the assumption that none of
# these indexes hold dates -- confirm against the checkpoint files.
all_pair_sig_ids = pd.read_csv('checkpoint_files/top_4_all_pair_sig_ids.csv', index_col=0)
cpd_lm_sigs = pd.read_csv('checkpoint_files/top_4_cpd_lm_sigs.csv', index_col=0)
target_kd_lm_sigs = pd.read_csv('checkpoint_files/top_4_target_kd_lm_sigs.csv', index_col=0)

In [10]:
# Join the compound table and the target-knockdown table column-wise into a
# single frame; rows are aligned on the two frames' index labels.
lm_sigs = pd.concat(objs=[cpd_lm_sigs, target_kd_lm_sigs], axis='columns')

### Compute cpd-kd Pearson correlations

In [11]:
# Build the [cpd column, target column] pairs whose signatures get correlated.
# Both name lists are sorted and then paired by position.
# NOTE(review): positional pairing presumes both groups sort into the same
# cell-line order and have equal length (zip stops at the shorter) -- confirm.
cpd_col_names = sorted(col for col in all_pair_sig_ids.columns if 'cpd_' in col)
target_col_names = sorted(col for col in all_pair_sig_ids.columns if 'target_' in col)
cpd_target_cell_pairs = [
    [cpd_col, target_col]
    for cpd_col, target_col in zip(cpd_col_names, target_col_names)
]

In [12]:
# extract the cell lines: the second '_'-separated token of each cpd column name
cell_lines = [ c.split('_')[1] for c in cpd_col_names ]

# Start the training table from the identifying columns, re-indexed 0..n-1.
training_data = all_pair_sig_ids[['pert_id', 'target', 'label']].reset_index(drop=True)

# Add one correlation column per cell line, initialised to float NaN.
# The original used integer 0, which (a) created int64 columns that must be
# upcast when float correlations are written into them and (b) is itself a
# valid correlation value, so an unfilled cell was indistinguishable from a
# computed zero. NaN keeps the dtype float and marks "not yet computed".
for cell in cell_lines:
    training_data[cell] = pd.Series(float('nan'), index=training_data.index, dtype='float64')

In [13]:
# Extract the actual signatures and compute the cpd/target correlation for
# every row of `all_pair_sig_ids` and every cell-line column pair.
#
# Results are collected in iteration order and attached to `training_data`
# by position. The original wrote `training_data.loc[index, ...]` using the
# index labels of `all_pair_sig_ids`, but `training_data` was built with
# `reset_index(drop=True)`; whenever the source index is not exactly 0..n-1
# that silently appends new rows or writes to the wrong ones.
correlation_rows = []
for _row_label, row in log_progress(all_pair_sig_ids.iterrows(), every=1):
    row_correlations = {}
    for pair in cpd_target_cell_pairs:
        # the cell line is the second '_'-separated token of the cpd column name
        cell_line = pair[0].split('_')[1]
        sig_id_1, sig_id_2 = row[pair].values
        # Series.corr defaults to the Pearson correlation
        row_correlations[cell_line] = lm_sigs[sig_id_1].corr(lm_sigs[sig_id_2])
    correlation_rows.append(row_correlations)

# Re-label the collected rows with training_data's own index so the
# column assignment below aligns row-for-row, then write whole columns at
# once (avoids per-cell writes and any int -> float upcasting).
correlations = pd.DataFrame(correlation_rows, index=training_data.index)
for cell_line in correlations.columns:
    training_data[cell_line] = correlations[cell_line]

In [16]:
# Checkpoint the correlation table to disk; the row index is written as the
# first CSV column (pandas' default, spelled out here).
training_data.to_csv('checkpoint_files/top_4_correlation_training_data.csv', index=True)

In [18]:
# Preview ten random rows. A fixed random_state makes the displayed sample
# reproducible across "Restart & Run All" (the original drew a different
# sample on every execution).
training_data.sample(n=10, random_state=0)

Unnamed: 0,pert_id,target,label,A375,A549,MCF7,PC3
8512,BRD-K64890080,EPHA2,-1,-0.042758,-0.101152,0.165243,0.051858
4897,BRD-K68202742,JAK3,-1,0.165038,0.027302,0.079603,-0.03107
6729,BRD-K64052750,HDAC4,-1,-0.005062,-0.026603,-0.085391,-0.229337
9080,BRD-K12184916,IMPDH1,-1,-0.027646,-0.094541,0.01394,0.066537
10676,BRD-K99749624,EPHB4,-1,0.13683,0.023258,0.26102,0.098326
858,BRD-K09638361,HDAC4,-1,-0.130681,-0.017994,0.15687,0.23287
5307,BRD-K49328571,PSMB7,-1,0.101953,0.069006,0.103936,0.164845
9615,BRD-K59369769,ERBB4,-1,-0.103797,-0.280984,0.220235,-0.00497
3186,BRD-K52075040,METAP2,-1,0.322087,0.032957,-0.010839,-0.195411
9451,BRD-K69932463,CSK,-1,-0.107757,0.000891,-0.091516,0.14753
