In [158]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # explicit: `import scipy` alone does not guarantee the stats submodule is loaded
import seaborn as sns

from support_functions import log_progress
sns.set(color_codes=True)

___
# Load the signatures and metadata from the top-4 cell lines

In [142]:
# NOTE: DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv(..., index_col=0, parse_dates=True) is its documented equivalent
# (first column becomes the index, with the same index date-parsing default).

# the known cpd-target interactions. shape: [244, 2]
top_4_known_interactions = pd.read_csv('checkpoint_files/top_4_known_interactions.csv',
                                       index_col=0, parse_dates=True)

# the signature ID metadata. shapes: [11948, 15] & [284, 16]
repr_top_4_gold_kd_sigs = pd.read_csv('checkpoint_files/repr_top_4_gold_kd_sigs.csv',
                                      index_col=0, parse_dates=True)
repr_top_4_gold_cpd_sigs = pd.read_csv('checkpoint_files/repr_top_4_gold_cpd_sigs.csv',
                                       index_col=0, parse_dates=True)

# the actual signatures. shapes: [978, 11948] & [978, 284]
top_4_kd_lm_sigs = pd.read_csv('checkpoint_files/top_4_kd_lm_sigs.csv',
                               index_col=0, parse_dates=True)
top_4_cpd_lm_sigs = pd.read_csv('checkpoint_files/top_4_cpd_lm_sigs.csv',
                                index_col=0, parse_dates=True)

___
# Load PPI data from String database

The table is redundant in that it has 2 entries for each interaction, with the order of the proteins swapped.

In [183]:
# 710638 high confidence interactions
# (read_csv with index_col=0, parse_dates=True is the documented replacement
# for the removed DataFrame.from_csv and keeps the same defaults)
string_gene_interactions_700 = pd.read_csv('checkpoint_files/string_gene_interactions_700.csv',
                                           index_col=0, parse_dates=True)

___
# Assemble cpd-kd pairs

In [33]:
# distinct knockdown gene names and compound ids that will be paired up below
kds = repr_top_4_gold_kd_sigs['pert_iname'].unique()
cpds = repr_top_4_gold_cpd_sigs['pert_id'].unique()

In [53]:
# Build every (kd, cpd) combination and label it:
#   1  -> the pair appears in top_4_known_interactions (true interaction)
#  -1  -> it does not (false interaction)
#
# The known pairs are collected into a set once, so each membership test is
# O(1) instead of re-parsing and re-running a DataFrame.query for every pair
# (which also broke on names containing a double quote).
known_pairs = set(zip(top_4_known_interactions['pert_id'],
                      top_4_known_interactions['target']))

cpd_ = []
kd_ = []
label_ = []

for kd in log_progress(kds):
    for cpd in cpds:
        label = 1 if (cpd, kd) in known_pairs else -1
        cpd_.append(cpd)
        kd_.append(kd)
        label_.append(label)

In [177]:
# collect the three parallel lists into one table (one row per cpd-kd pair)
# and persist it so the feature tables can be aligned to it by row index
pair_columns = {'cpd': cpd_, 'kd': kd_, 'label': label_}
cpd_kd_pairs_df = pd.DataFrame(pair_columns)
cpd_kd_pairs_df.to_csv(path_or_buf='features/cpd_kd_pairs_df.csv')

___
# Construct Direct Correlation Features

We want to construct direct correlation values for all cpd-kd pairs in each cell line. Eventually we will also want to construct the indirect correlation features.

It seems the most useful way to organize this data is in a large table **dir_corr_df** where each row is a cpd-kd pair and the columns are the feature values (one column per cell line). The row indices of this table map to the corresponding row indices of **cpd_kd_pairs_df**.

In [69]:
# the distinct cell lines present in the compound signature metadata
top_4_cell_lines = repr_top_4_gold_cpd_sigs['cell_id'].unique()

In [172]:
# Direct correlation features: for every cpd-kd pair and every cell line, the
# Pearson correlation between the compound signature and the knockdown
# signature measured in that cell line.

# (perturbagen, cell line) -> sig_id lookups, built once instead of running two
# DataFrame.query calls per pair per cell line. If a perturbagen has several
# signatures in one cell line, the first listed one is used.
cpd_sig_ids = (repr_top_4_gold_cpd_sigs
               .drop_duplicates(subset=['pert_id', 'cell_id'])
               .set_index(['pert_id', 'cell_id'])['sig_id']
               .to_dict())
kd_sig_ids = (repr_top_4_gold_kd_sigs
              .drop_duplicates(subset=['pert_iname', 'cell_id'])
              .set_index(['pert_iname', 'cell_id'])['sig_id']
              .to_dict())

# initialize NaN-filled float dataframe to hold direct correlation values
dir_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)

# loop through cpd-target pairs, calculate correlations in each cell line
for index, row in log_progress(cpd_kd_pairs_df.iterrows(), every=100):
    cpd = row.cpd
    kd = row.kd

    for cell_line in top_4_cell_lines:
        cpd_sig_id = cpd_sig_ids.get((cpd, cell_line))
        kd_sig_id = kd_sig_ids.get((kd, cell_line))

        # no signature for this perturbagen in this cell line: leave the
        # feature as NaN rather than crashing on an empty selection
        if cpd_sig_id is None or kd_sig_id is None:
            continue

        # extract signatures as 1-D vectors (scalar column label -> Series);
        # pearsonr is only portable across SciPy versions for 1-D input
        cpd_lm_sig = top_4_cpd_lm_sigs[cpd_sig_id].values
        kd_lm_sig = top_4_kd_lm_sigs[kd_sig_id].values

        # compute and store correlation (element 0 is r, element 1 the p-value)
        corr = scipy.stats.pearsonr(cpd_lm_sig, kd_lm_sig)[0]
        dir_corr_df.at[index, cell_line] = corr

In [178]:
# persist the direct-correlation feature table (rows align with cpd_kd_pairs_df)
dir_corr_df.to_csv(path_or_buf='features/dir_corr_df.csv')

___
# Construct Indirect Correlation Features

We want to construct indirect correlation values for all cpd-kd pairs in each cell line. 

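These features will be stored in three large tables, **indir_max_corr_df**, **indir_min_corr_df**, and **indir_avg_corr_df**, holding the maximum, minimum, and mean correlation between the compound signature and the knockdown signatures of the target's interaction partners. In each table a row is a cpd-kd pair and the columns are the feature values (one column per cell line). The row indices of these tables map to the corresponding row indices of **cpd_kd_pairs_df**.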

In [207]:
# every distinct knockdown gene name available in the top-4 cell lines;
# used below to restrict interaction partners to genes that have a KD signature
all_kds = repr_top_4_gold_kd_sigs['pert_iname'].unique()

In [233]:
# Indirect correlation features: for every cpd-kd pair and every cell line,
# correlate the compound signature with the knockdown signature of each of the
# target's interaction partners, then summarize those correlations by their
# max, min, and mean.

# initialize NaN-filled float dataframes to hold indirect correlation values
indir_max_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)
indir_min_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)
indir_avg_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)

# (perturbagen, cell line) -> sig_id lookups, built once instead of running a
# DataFrame.query per lookup. If a perturbagen has several signatures in one
# cell line, the first listed one is used.
cpd_sig_ids = (repr_top_4_gold_cpd_sigs
               .drop_duplicates(subset=['pert_id', 'cell_id'])
               .set_index(['pert_id', 'cell_id'])['sig_id']
               .to_dict())
kd_sig_ids = (repr_top_4_gold_kd_sigs
              .drop_duplicates(subset=['pert_iname', 'cell_id'])
              .set_index(['pert_iname', 'cell_id'])['sig_id']
              .to_dict())

# gene -> sorted unique interaction partners that have KDs. Grouping the
# interaction table once replaces a full-table query for every cpd-kd pair and
# yields the same values np.intersect1d(partners, all_kds) produced.
kd_partner_edges = string_gene_interactions_700[string_gene_interactions_700['gene_2'].isin(all_kds)]
partner_kds_by_gene = {
    gene: np.unique(partners.values)
    for gene, partners in kd_partner_edges.groupby('gene_1')['gene_2']
}

# loop through cpd-target pairs, calculate correlations in each cell line
for index, row in log_progress(cpd_kd_pairs_df.iterrows(), every=10):
    cpd = row.cpd
    kd = row.kd

    # the target's interaction partners that have KDs (empty if none are known)
    partner_kds = partner_kds_by_gene.get(kd, [])

    # compute corr with each partner in each cell line
    for cell_line in top_4_cell_lines:

        # extract the cpd signature; without one in this cell line there is
        # nothing to correlate against, so the features stay NaN
        cpd_sig_id = cpd_sig_ids.get((cpd, cell_line))
        if cpd_sig_id is None:
            continue
        cpd_lm_sig = top_4_cpd_lm_sigs[cpd_sig_id].values

        # NaN-initialized Series to hold corrs for all partners in this cell line
        pkd_corrs = pd.Series(index=partner_kds, dtype=float)

        for pkd in partner_kds:
            # a partner may have no KD signature in this particular cell line;
            # its entry stays NaN and is ignored by max/min/mean below
            pkd_sig_id = kd_sig_ids.get((pkd, cell_line))
            if pkd_sig_id is None:
                continue
            pkd_lm_sig = top_4_kd_lm_sigs[pkd_sig_id].values

            # compute and store correlation (1-D inputs; element 0 is r)
            corr = scipy.stats.pearsonr(cpd_lm_sig, pkd_lm_sig)[0]
            pkd_corrs[pkd] = corr

        # compute max, min, and average of the partner kd corrs
        # (all three are NaN when there are no usable partners)
        max_pkd_corr = pkd_corrs.max()
        min_pkd_corr = pkd_corrs.min()
        avg_pkd_corr = pkd_corrs.mean()

        # store these in the appropriate data frames
        indir_max_corr_df.at[index, cell_line] = max_pkd_corr
        indir_min_corr_df.at[index, cell_line] = min_pkd_corr
        indir_avg_corr_df.at[index, cell_line] = avg_pkd_corr
        

In [None]:
# write each indirect-correlation summary table (max / min / mean) to its own csv
for summary_df, out_path in (
    (indir_max_corr_df, 'features/indir_max_corr_df.csv'),
    (indir_min_corr_df, 'features/indir_min_corr_df.csv'),
    (indir_avg_corr_df, 'features/indir_avg_corr_df.csv'),
):
    summary_df.to_csv(out_path)