In [158]:
import pandas as pd
import numpy as np
import scipy
import itertools
from support_functions import log_progress
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

___
# Load the signatures and metadata from the top-4 cell lines

In [142]:
# Load the checkpointed tables; the first CSV column is used as the row index.
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas, so
# `pd.read_csv(..., index_col=0)` is used instead.
# NOTE(review): `from_csv` also tried to parse the index as dates; that is not
# requested here. The previews in this notebook show integer indices -- confirm
# no table relies on a date index.

# the known cpd-target interactions. shape: [244, 2]
top_4_known_interactions = pd.read_csv('checkpoint_files/top_4_known_interactions.csv', index_col=0)

# the signature ID metadata. shapes: [11948, 15] & [284, 16]
repr_top_4_gold_kd_sigs = pd.read_csv('checkpoint_files/repr_top_4_gold_kd_sigs.csv', index_col=0)
repr_top_4_gold_cpd_sigs = pd.read_csv('checkpoint_files/repr_top_4_gold_cpd_sigs.csv', index_col=0)

# the actual signatures. shapes: [978, 11948] & [978, 284]
top_4_kd_lm_sigs = pd.read_csv('checkpoint_files/top_4_kd_lm_sigs.csv', index_col=0)
top_4_cpd_lm_sigs = pd.read_csv('checkpoint_files/top_4_cpd_lm_sigs.csv', index_col=0)

___
# Load PPI data from the STRING database

In [14]:
# 710638 high confidence interactions
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas;
# `pd.read_csv` with `index_col=0` keeps the first CSV column as the row index.
# NOTE(review): `from_csv` also attempted date parsing of the index -- assumed
# irrelevant for this table; confirm.
string_gene_interactions_700 = pd.read_csv('checkpoint_files/string_gene_interactions_700.csv', index_col=0)

___
# Assemble cpd-kd pairs

In [33]:
# Distinct knockdown names and compound IDs; these are the two axes
# of the cpd-kd pair table that is built and labelled next.
kds = repr_top_4_gold_kd_sigs['pert_iname'].unique()
cpds = repr_top_4_gold_cpd_sigs['pert_id'].unique()

In [53]:
# Label every (kd, cpd) combination: 1 for a known interaction, -1 otherwise.
#
# The known (pert_id, target) pairs are collected into a set once, so each
# combination costs one O(1) membership test instead of a DataFrame.query call.
# Plain tuple comparison also avoids building query strings from the names.
# reset_index() makes `pert_id` / `target` available as columns even if one of
# them is stored as a named index (query() resolves names from both places).
known_interactions_flat = top_4_known_interactions.reset_index()
known_pairs = set(zip(known_interactions_flat['pert_id'],
                      known_interactions_flat['target']))

cpd_ = []
kd_ = []
label_ = []

# kd is the outer loop and cpd the inner one, so rows end up grouped by knockdown
for kd in log_progress(kds):
    for cpd in cpds:
        cpd_.append(cpd)
        kd_.append(kd)
        label_.append(1 if (cpd, kd) in known_pairs else -1)  # 1: true, -1: false interaction

In [177]:
# Assemble the pair table (one row per cpd-kd combination, with its label)
# and write it to the features directory.
pair_columns = {'cpd': cpd_, 'kd': kd_, 'label': label_}
cpd_kd_pairs_df = pd.DataFrame(pair_columns)
cpd_kd_pairs_df.to_csv('features/cpd_kd_pairs_df.csv')

___
# Construct Direct Correlation Features

We want to construct direct correlation values for all cpd-kd pairs in each cell line. Eventually we will also want to construct the indirect correlation features.

It seems the most useful way to organize this data is in a large table where each row is a cpd-kd pair and the columns are the feature values. The row indices of this table will map to the corresponding row indices of **cpd_kd_pairs_df**.

In [69]:
# Distinct cell line IDs found in the compound signature metadata;
# used as the column set of the feature tables.
top_4_cell_lines = repr_top_4_gold_cpd_sigs['cell_id'].unique()

In [172]:
# Direct correlation features: for every cpd-kd pair and every cell line, the
# Pearson correlation between the compound signature and the knockdown signature.
#
# Instead of two DataFrame.query lookups and one pearsonr call per pair per cell
# line, each cell line is handled with a single matrix product: once every
# signature is centered and scaled to unit length, the dot product of two
# signatures is their Pearson correlation. Values can differ from a per-pair
# computation only by floating-point rounding.


def _unit_centered(sig_df):
    """Center each column of `sig_df` and scale it to unit Euclidean length.

    Returns a 2-D float array with the same shape as `sig_df`. A constant
    column has zero norm and becomes NaN, so its correlations are NaN.
    """
    values = np.asarray(sig_df, dtype=float)
    centered = values - values.mean(axis=0)
    norms = np.sqrt((centered ** 2).sum(axis=0))
    with np.errstate(divide='ignore', invalid='ignore'):
        return centered / norms


def _sig_id_by_pert(sig_info, pert_col, cell_line):
    """Return a Series mapping each perturbagen in `pert_col` to its sig_id in `cell_line`.

    If a perturbagen has several signatures in the cell line, the first one is used.
    """
    in_cell_line = sig_info[sig_info['cell_id'] == cell_line].dropna(subset=[pert_col])
    return in_cell_line.drop_duplicates(pert_col).set_index(pert_col)['sig_id']


# float frame pre-filled with NaN; pairs lacking a signature in a cell line stay NaN
dir_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)

for cell_line in log_progress(top_4_cell_lines):
    cpd_sig_ids = _sig_id_by_pert(repr_top_4_gold_cpd_sigs, 'pert_id', cell_line)
    kd_sig_ids = _sig_id_by_pert(repr_top_4_gold_kd_sigs, 'pert_iname', cell_line)

    # extract signatures (columns) for this cell line
    # NOTE(review): rows of the two signature tables are matched by position, as
    # in the per-pair version -- assumes both list the genes in the same order.
    cpd_unit = _unit_centered(top_4_cpd_lm_sigs[cpd_sig_ids.tolist()])
    kd_unit = _unit_centered(top_4_kd_lm_sigs[kd_sig_ids.tolist()])

    # corr_matrix[i, j] = Pearson r between the i-th cpd and j-th kd signature
    corr_matrix = cpd_unit.T.dot(kd_unit)

    # position of each pair's cpd / kd in the matrix; -1 marks "no signature here"
    cpd_pos = cpd_sig_ids.index.get_indexer(cpd_kd_pairs_df['cpd'])
    kd_pos = kd_sig_ids.index.get_indexer(cpd_kd_pairs_df['kd'])
    has_both = (cpd_pos >= 0) & (kd_pos >= 0)

    # store the correlations for this cell line, row-aligned with cpd_kd_pairs_df
    pair_corr = np.full(len(cpd_kd_pairs_df), np.nan)
    pair_corr[has_both] = corr_matrix[cpd_pos[has_both], kd_pos[has_both]]
    dir_corr_df[cell_line] = pair_corr

In [178]:
# Write the direct correlation feature table to the features directory
dir_corr_path = 'features/dir_corr_df.csv'
dir_corr_df.to_csv(dir_corr_path)

In [63]:
# Preview the first five rows of the compound signature metadata
repr_top_4_gold_cpd_sigs.head(n=5)

Unnamed: 0,cell_id,distil_id,pert_dose,pert_dose_unit,pert_id,pert_idose,pert_iname,pert_itime,pert_time,pert_time_unit,pert_type,sig_id,distil_cc_q75,pct_self_rank_q25,tas,inchi_key
18286,A375,LJP005_A375_24H_X1_B19:C21|LJP005_A375_24H_X2_...,,,BRD-K21680192,1.11 um,mitoxantrone,24 h,,,trt_cp,LJP005_A375_24H:C21,0.88,-666.0,0.856349,KKZJGLLVHKMTCM-UHFFFAOYSA-N
18299,A549,LJP005_A549_24H_X1_B19:C23|LJP005_A549_24H_X2_...,,,BRD-K21680192,0.12 um,mitoxantrone,24 h,,,trt_cp,LJP005_A549_24H:C23,0.86,-666.0,0.800102,KKZJGLLVHKMTCM-UHFFFAOYSA-N
12520,MCF7,CPD001_MCF7_24H_X1_B6_DUO52HI53LO:N01|CPD001_M...,10.0,µM,BRD-K21680192,10 µM,mitoxantrone,24 h,24.0,h,trt_cp,CPD001_MCF7_24H:BRD-K21680192-300-11-0:10,0.78,0.0,0.728264,KKZJGLLVHKMTCM-UHFFFAOYSA-N
2128,PC3,CPC005_PC3_24H_X1_B4_DUO52HI53LO:O13|CPC005_PC...,10.0,µM,BRD-K21680192,10 µM,mitoxantrone,24 h,24.0,h,trt_cp,CPC005_PC3_24H:BRD-K21680192-300-06-0:10,0.77,0.0,0.790653,KKZJGLLVHKMTCM-UHFFFAOYSA-N
1721,A375,CPC005_A375_24H_X1_B3_DUO52HI53LO:B04|CPC005_A...,10.0,µM,BRD-K81418486,10 µM,vorinostat,24 h,24.0,h,trt_cp,CPC005_A375_24H:BRD-K81418486:10,0.89,0.0,0.828894,WAEXFXRVDQXREF-UHFFFAOYSA-N


In [64]:
# Preview the first five rows of the compound signature matrix
top_4_cpd_lm_sigs.head(n=5)

Unnamed: 0,CPC005_A375_6H:BRD-K50836978-001-01-7:10,CPC005_A375_24H:BRD-K56429665-001-03-9:10,CPC005_A375_24H:BRD-K31342827-001-06-2:10,CPC005_A375_24H:BRD-A45889380-300-04-8:10,CPC005_A375_24H:BRD-A81772229-001-01-6:10,CPC005_A375_24H:BRD-K55127134-300-07-0:10,CPC005_A375_24H:BRD-K81418486:10,CPC005_A549_6H:BRD-K56429665-001-03-9:10,CPC005_A549_6H:BRD-K50836978-001-01-7:10,CPC005_A549_24H:BRD-A81772229-001-01-6:10,...,LJP006_MCF7_24H:A12,LJP006_MCF7_24H:D08,LJP007_A375_24H:M02,LJP007_MCF7_24H:C05,LJP007_MCF7_24H:M02,LJP007_PC3_24H:M06,LJP009_MCF7_24H:G14,LJP009_PC3_24H:G18,LPROT001_A375_6H:L01,LPROT001_MCF7_6H:K01
10007,-3.253649,-0.381245,5.617756,-5.441064,-1.561962,3.608747,-0.18025,-0.713414,-2.045255,-2.727256,...,1.102926,-0.538376,-0.593074,0.013058,-0.848993,-0.050306,-1.877276,-0.510815,-0.1917,-0.9115
1001,3.122064,0.002457,-0.112246,3.992977,0.210938,0.181215,2.0597,-0.72199,1.331961,-0.196673,...,-0.618056,-0.231992,0.294789,0.136761,-0.00314,0.037441,-0.992789,-1.340263,0.065,-0.5532
10013,-0.716842,-0.654145,2.251048,-3.612755,2.425956,0.63382,9.087151,-0.578746,-0.991451,2.299488,...,1.37922,0.756409,-0.09185,-0.367061,-0.563915,-0.460368,0.389259,0.589351,0.5969,-0.8921
10038,-2.547382,1.992397,-4.273192,-4.944189,-5.00455,-2.494175,-3.86235,0.14591,-4.276713,-2.291521,...,-1.172589,0.296486,-0.292281,-0.045172,-0.111818,0.301023,0.127563,-1.309107,5.898,2.058
10046,2.728706,-0.102893,-1.175333,4.777947,0.000192,-0.904953,1.00395,0.60542,0.756224,1.133909,...,0.674637,2.976056,1.235914,-1.027002,0.5501,-0.057055,2.055717,-0.448527,-2.3078,1.1517
