In [1]:
import pandas as pd
import itertools
from support_functions import log_progress

# Predicting drug-target interactions
___
## Load the data from the top-4 cell lines

In [2]:
# Load the checkpointed top-4 cell-line tables.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0, so it
# is replaced by `read_csv`. `index_col=0` and `parse_dates=True` reproduce the
# old `from_csv` defaults (first column becomes the index, and date parsing is
# attempted on that index).
all_pair_sig_ids = pd.read_csv('checkpoint_files/top_4_all_pair_sig_ids.csv', index_col=0, parse_dates=True)
cpd_lm_sigs = pd.read_csv('checkpoint_files/top_4_cpd_lm_sigs.csv', index_col=0, parse_dates=True)
target_kd_lm_sigs = pd.read_csv('checkpoint_files/top_4_target_kd_lm_sigs.csv', index_col=0, parse_dates=True)

In [3]:
# Join the compound and knockdown signature tables column-wise into a single
# frame (rows are aligned on the shared index).
lm_sigs = pd.concat(objs=[cpd_lm_sigs, target_kd_lm_sigs], axis='columns')

### Compute compound–knockdown (cpd-kd) Pearson correlations

In [4]:
# Build the column pairs whose signatures will be correlated.
# Columns whose name contains 'cpd_' and columns whose name contains 'target_'
# are each sorted by name, then matched position by position.
cpd_col_names = [name for name in all_pair_sig_ids.columns if 'cpd_' in name]
cpd_col_names.sort()
target_col_names = [name for name in all_pair_sig_ids.columns if 'target_' in name]
target_col_names.sort()
cpd_target_cell_pairs = [
    [cpd_col, target_col]
    for cpd_col, target_col in zip(cpd_col_names, target_col_names)
]

In [5]:
# The cell line is taken as the second '_'-separated token of each compound
# column name.
cell_lines = [ c.split('_')[1] for c in cpd_col_names ]

# Start the training table from the identifying columns, renumbering the rows
# 0..n-1.
training_data = all_pair_sig_ids[['pert_id', 'target', 'label']].reset_index(drop=True)

# Add one placeholder column per cell line. The columns are created as float
# NaN rather than integer 0: the values stored later are float correlations,
# so this avoids an implicit int -> float upcast on assignment, and a cell that
# was never filled stays visibly missing instead of looking like a genuine
# correlation of 0.
for cell in cell_lines:
    training_data[cell] = float('nan')

In [6]:
# For every row of `all_pair_sig_ids` and every (compound, target) column pair,
# look up the two signatures in `lm_sigs` and correlate them (`Series.corr`
# defaults to Pearson).
#
# Results are collected in row order and written back as whole columns.
# `training_data` was renumbered with `reset_index(drop=True)`, so writing with
# the original index labels of `all_pair_sig_ids` would misalign rows (or
# append new ones) whenever that index is not exactly 0..n-1. Positional
# assignment does not depend on the labels.
pair_cell_lines = [cpd_col.split('_')[1] for cpd_col, target_col in cpd_target_cell_pairs]
correlations = [[] for pair in cpd_target_cell_pairs]

# `log_progress` comes from the project's support module; it is used here only
# as a pass-through wrapper around the row iterator.
for row_label, row in log_progress(all_pair_sig_ids.iterrows(), every=1):
    for pair_corrs, (cpd_col, target_col) in zip(correlations, cpd_target_cell_pairs):
        sig_id_1 = row[cpd_col]
        sig_id_2 = row[target_col]
        pair_corrs.append(lm_sigs[sig_id_1].corr(lm_sigs[sig_id_2]))

# One value per row was collected for each pair, so every list has the same
# length as `training_data`; a mismatch raises instead of silently misaligning.
for cell_line, pair_corrs in zip(pair_cell_lines, correlations):
    training_data[cell_line] = pair_corrs

In [7]:
# Checkpoint the correlation table to disk as CSV.
training_data.to_csv(path_or_buf='checkpoint_files/top_4_correlation_training_data.csv')

In [8]:
# Preview ten random rows; the fixed seed makes the preview identical on every
# Restart & Run All.
training_data.sample(n=10, random_state=0)

Unnamed: 0,pert_id,target,label,A375,A549,MCF7,PC3
7153,BRD-K07572174,PSMB8,-1.0,0.026937,0.411298,0.192492,0.063277
2521,BRD-K50836978,PDGFRA,-1.0,0.136004,0.123435,0.136657,0.184201
8016,BRD-K53414658,TXNRD1,-1.0,0.334166,0.133377,-0.044792,-0.151702
3406,BRD-K85606544,TXNRD1,-1.0,0.144653,0.088307,0.149488,-0.153384
7285,BRD-K75295174,JAK2,-1.0,0.038793,0.020993,-0.067938,0.112787
8640,BRD-K67844266,LTA4H,-1.0,-0.221447,-0.052077,0.206822,0.218072
8707,BRD-K67844266,IGF1R,-1.0,0.329923,0.059508,0.216304,0.098352
131,BRD-K07572174,ABCC5,1.0,0.204857,0.207595,0.09876,-0.019937
10493,BRD-K84937637,JUN,-1.0,0.263447,0.303619,0.102726,0.097837
10607,BRD-K99749624,HDAC2,-1.0,-0.013156,0.084224,0.059482,0.051612
