In [2]:
import pandas as pd
import numpy as np
import scipy
import itertools
from support_functions import log_progress
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

___
# Load the signatures and metadata from the top-4 cell lines

In [6]:
# NOTE: `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0.
# `pd.read_csv(path, index_col=0)` is the supported replacement: it uses the first
# CSV column as the row index, as `from_csv` did. `from_csv` additionally tried to
# parse that index as dates; that is dropped here on the assumption that these
# indices are identifiers, not timestamps -- pass `parse_dates=True` if that is wrong.

# the known cpd-target interactions. shape: [244, 2]
top_4_known_interactions = pd.read_csv('checkpoint_files/top_4_known_interactions.csv', index_col=0)

# the signature ID metadata. shapes: [11948, 15] & [284, 16]
repr_top_4_gold_kd_sigs = pd.read_csv('checkpoint_files/repr_top_4_gold_kd_sigs.csv', index_col=0)
repr_top_4_gold_cpd_sigs = pd.read_csv('checkpoint_files/repr_top_4_gold_cpd_sigs.csv', index_col=0)

# the actual signatures. shapes: [978, 11948] & [978, 284]
top_4_kd_lm_sigs = pd.read_csv('checkpoint_files/top_4_kd_lm_sigs.csv', index_col=0)
top_4_cpd_lm_sigs = pd.read_csv('checkpoint_files/top_4_cpd_lm_sigs.csv', index_col=0)

___
# Load PPI data from String database

The table is redundant in that it has 2 entries for each interaction, with the order of the proteins swapped.

In [183]:
# 710638 high confidence interactions
# (`pd.DataFrame.from_csv` was removed in pandas 1.0; `read_csv(..., index_col=0)`
# likewise reads the first CSV column as the row index)
string_gene_interactions_700 = pd.read_csv('checkpoint_files/string_gene_interactions_700.csv', index_col=0)

___
# Assemble cpd-kd pairs

In [322]:
# distinct knockdown names and compound IDs found in the signature metadata;
# these are the two axes of the cpd-kd pair table built below
all_top_4_kds = repr_top_4_gold_kd_sigs['pert_iname'].unique()
all_top_4_cpds = repr_top_4_gold_cpd_sigs['pert_id'].unique()

In [53]:
# Enumerate every (kd, cpd) combination and label it +1 when the pair is listed in
# `top_4_known_interactions`, -1 otherwise.
#
# The known interactions are gathered into a set of (pert_id, target) tuples once,
# so labelling a pair is a constant-time membership test rather than a
# DataFrame.query() call per pair. It also avoids building query strings with
# str.format, which breaks on names containing a double quote.
# reset_index() makes `pert_id` / `target` addressable whether they are stored as
# columns or as the (named) index of the table.
known_interactions = top_4_known_interactions.reset_index()
known_pairs = set(zip(known_interactions['pert_id'], known_interactions['target']))

cpd_ = []
kd_ = []
label_ = []

# kd is the outer loop and cpd the inner one, so rows are grouped by knockdown
for kd in log_progress(all_top_4_kds):
    for cpd in all_top_4_cpds:
        cpd_.append(cpd)
        kd_.append(kd)
        # 1 = true interaction, -1 = false interaction
        label_.append(1 if (cpd, kd) in known_pairs else -1)

In [177]:
# gather the three parallel lists into one table (one row per cpd-kd pair)
# and write it out alongside the other feature files
pair_columns = {'cpd': cpd_, 'kd': kd_, 'label': label_}
cpd_kd_pairs_df = pd.DataFrame(pair_columns)
cpd_kd_pairs_df.to_csv('features/cpd_kd_pairs_df.csv')

___
# Compute Direct Correlation Features

We want to construct direct correlation values for all cpd-kd pairs in each cell line. Eventually we will also want to construct the indirect correlation features.

It seems the most useful way to organize this data is in a large table **dir_corr_df** where each row is a cpd-kd pair and the columns are the feature values. The row indices of this table will map to the corresponding row indices of **cpd_kd_pairs_df**.

In [69]:
# the distinct cell lines that appear in the compound signature metadata
top_4_cell_lines = repr_top_4_gold_cpd_sigs['cell_id'].unique()

In [245]:
# Direct correlation features: for every cpd-kd pair, the Pearson correlation between
# the compound signature and the knockdown signature, one column per cell line.
#
# The previous version issued two DataFrame.query() calls and one pearsonr() call
# per pair per cell line, and the recorded run ended in a KeyboardInterrupt. Here a
# single (compounds x knockdowns) correlation matrix is computed per cell line with
# NumPy and each pair reads its value out of that matrix. A pair whose compound or
# knockdown has no signature in a cell line is left as NaN instead of raising.
dir_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)

for cell_line in log_progress(top_4_cell_lines):
    # sig_id of each compound / knockdown in this cell line, keyed by its name.
    # Assumes one signature per perturbagen per cell line; if there are several,
    # the first one listed is used.
    cpd_sig_ids = (repr_top_4_gold_cpd_sigs[repr_top_4_gold_cpd_sigs.cell_id == cell_line]
                   .drop_duplicates('pert_id')
                   .set_index('pert_id')
                   .sig_id)
    kd_sig_ids = (repr_top_4_gold_kd_sigs[repr_top_4_gold_kd_sigs.cell_id == cell_line]
                  .drop_duplicates('pert_iname')
                  .set_index('pert_iname')
                  .sig_id)

    # signature matrices: one row per gene, one column per signature.
    # NOTE: as in the per-pair version, rows are paired by position, i.e. both
    # signature tables are assumed to list the genes in the same order.
    cpd_sigs = top_4_cpd_lm_sigs[cpd_sig_ids.values].values.astype(float)
    kd_sigs = top_4_kd_lm_sigs[kd_sig_ids.values].values.astype(float)

    # Pearson r between every cpd column and every kd column is the mean product of
    # their z-scores (population std); clip guards against rounding just past +/-1
    cpd_z = (cpd_sigs - cpd_sigs.mean(axis=0)) / cpd_sigs.std(axis=0)
    kd_z = (kd_sigs - kd_sigs.mean(axis=0)) / kd_sigs.std(axis=0)
    corr = np.clip(cpd_z.T.dot(kd_z) / cpd_sigs.shape[0], -1.0, 1.0)

    # read each pair's entry; get_indexer gives -1 for names without a signature here
    cpd_pos = cpd_sig_ids.index.get_indexer(cpd_kd_pairs_df.cpd)
    kd_pos = kd_sig_ids.index.get_indexer(cpd_kd_pairs_df.kd)
    found = (cpd_pos >= 0) & (kd_pos >= 0)
    pair_corrs = np.full(len(cpd_kd_pairs_df), np.nan)
    pair_corrs[found] = corr[cpd_pos[found], kd_pos[found]]
    dir_corr_df[cell_line] = pair_corrs

KeyboardInterrupt: 

In [178]:
# checkpoint the direct correlation table to disk
dir_corr_path = 'features/dir_corr_df.csv'
dir_corr_df.to_csv(dir_corr_path)

In [248]:
# reload the saved direct correlation features; the first CSV column is the pair index
# (`pd.DataFrame.from_csv` was removed in pandas 1.0, `read_csv` is its replacement)
dir_corr_df = pd.read_csv('features/dir_corr_df.csv', index_col=0)

___
# Compute Indirect Correlation Features

We want to construct indirect correlation values for all cpd-kd pairs in each cell line. 

These features will be stored in three large tables (**indir_max_corr_df**, **indir_min_corr_df** and **indir_avg_corr_df**) where each row is a cpd-kd pair and the columns are the feature values. The row indices of these tables will map to the corresponding row indices of **cpd_kd_pairs_df**.

In [207]:
# every distinct knockdown name in the KD signature metadata, for reference
all_kds = repr_top_4_gold_kd_sigs['pert_iname'].unique()

In [233]:
# Indirect correlation features: for every cpd-kd pair and cell line, the max, min and
# mean Pearson correlation between the compound signature and the knockdown signatures
# of the target's interaction partners (partners taken from string_gene_interactions_700
# and restricted to genes in `all_kds`).
#
# The previous version re-ran the partner query for every pair and called query() and
# pearsonr() once per partner per pair per cell line. Here the partner lists are built
# once per gene, one (compounds x knockdowns) correlation matrix is computed per cell
# line, and the per-kd summaries are taken for all compounds at once.
# A kd with no usable partner keeps NaN in all three tables.

# initialize empty dataframes to hold the indirect correlation values
indir_max_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)
indir_min_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)
indir_avg_corr_df = pd.DataFrame(index=cpd_kd_pairs_df.index, columns=top_4_cell_lines, dtype=float)

# for each gene, the distinct interaction partners that have a KD
partner_rows = string_gene_interactions_700[string_gene_interactions_700.gene_2.isin(all_kds)]
partners_by_gene = partner_rows.groupby('gene_1').gene_2.unique()

# the distinct targets that occur in the pair table (columns of the per-kd summaries)
pair_kd_names = pd.Index(cpd_kd_pairs_df.kd.unique())

for cell_line in log_progress(top_4_cell_lines):
    # sig_id of each compound / knockdown in this cell line, keyed by its name.
    # Assumes one signature per perturbagen per cell line; if there are several,
    # the first one listed is used.
    cpd_sig_ids = (repr_top_4_gold_cpd_sigs[repr_top_4_gold_cpd_sigs.cell_id == cell_line]
                   .drop_duplicates('pert_id')
                   .set_index('pert_id')
                   .sig_id)
    kd_sig_ids = (repr_top_4_gold_kd_sigs[repr_top_4_gold_kd_sigs.cell_id == cell_line]
                  .drop_duplicates('pert_iname')
                  .set_index('pert_iname')
                  .sig_id)

    # signature matrices: one row per gene, one column per signature.
    # NOTE: rows are paired by position, i.e. both signature tables are assumed to
    # list the genes in the same order.
    cpd_sigs = top_4_cpd_lm_sigs[cpd_sig_ids.values].values.astype(float)
    kd_sigs = top_4_kd_lm_sigs[kd_sig_ids.values].values.astype(float)

    # Pearson r between every cpd column and every kd column is the mean product of
    # their z-scores (population std); clip guards against rounding just past +/-1
    cpd_z = (cpd_sigs - cpd_sigs.mean(axis=0)) / cpd_sigs.std(axis=0)
    kd_z = (kd_sigs - kd_sigs.mean(axis=0)) / kd_sigs.std(axis=0)
    corr = np.clip(cpd_z.T.dot(kd_z) / cpd_sigs.shape[0], -1.0, 1.0)

    # per-target summaries over the partner KDs: rows = compounds, columns = targets
    summary_shape = (corr.shape[0], len(pair_kd_names))
    max_by_kd = np.full(summary_shape, np.nan)
    min_by_kd = np.full(summary_shape, np.nan)
    avg_by_kd = np.full(summary_shape, np.nan)

    for kd_col, kd in enumerate(pair_kd_names):
        if kd not in partners_by_gene.index:
            continue  # target has no partner with a KD
        # columns of `corr` holding this target's partners (-1 = no signature here)
        partner_pos = kd_sig_ids.index.get_indexer(partners_by_gene[kd])
        partner_pos = partner_pos[partner_pos >= 0]
        if len(partner_pos) == 0:
            continue
        partner_corrs = corr[:, partner_pos]
        # nan-aware reductions skip undefined correlations, as pandas' max/min/mean do
        max_by_kd[:, kd_col] = np.nanmax(partner_corrs, axis=1)
        min_by_kd[:, kd_col] = np.nanmin(partner_corrs, axis=1)
        avg_by_kd[:, kd_col] = np.nanmean(partner_corrs, axis=1)

    # read each pair's summaries; a compound without a signature here stays NaN
    cpd_pos = cpd_sig_ids.index.get_indexer(cpd_kd_pairs_df.cpd)
    kd_pos = pair_kd_names.get_indexer(cpd_kd_pairs_df.kd)
    found = cpd_pos >= 0
    for feature_df, by_kd in ((indir_max_corr_df, max_by_kd),
                              (indir_min_corr_df, min_by_kd),
                              (indir_avg_corr_df, avg_by_kd)):
        pair_values = np.full(len(cpd_kd_pairs_df), np.nan)
        pair_values[found] = by_kd[cpd_pos[found], kd_pos[found]]
        feature_df[cell_line] = pair_values
        

In [234]:
# write each indirect correlation table to its own CSV under features/
for indir_frame, indir_path in (
    (indir_max_corr_df, 'features/indir_max_corr_df.csv'),
    (indir_min_corr_df, 'features/indir_min_corr_df.csv'),
    (indir_avg_corr_df, 'features/indir_avg_corr_df.csv'),
):
    indir_frame.to_csv(indir_path)

___
# Dealing with missing values

We see below that there are several cpd-kd pairs for which we are unable to calculate indirect correlation features. My hunch is that these KDs do not have known interaction partners in STRING. We need to profile these KDs, make sure there are not too many of them, and then remove them from the training data.

In [348]:
# Join the four feature tables side by side (rows stay aligned on the pair index).
# All four tables use the cell lines as column labels, so a plain concat yields four
# columns with the same label for every cell line; prefixing each table with its
# feature family keeps every column in features_df uniquely identifiable.
features_df = pd.concat(
    [
        dir_corr_df.add_prefix('dir_'),
        indir_max_corr_df.add_prefix('indir_max_'),
        indir_min_corr_df.add_prefix('indir_min_'),
        indir_avg_corr_df.add_prefix('indir_avg_'),
    ],
    axis=1,
)

In [338]:
# boolean mask over pairs: True where at least one feature value is missing
null_row_indeces = features_df.isnull().any(axis='columns')

# the affected feature rows and the matching cpd-kd pair rows
null_rows = features_df.loc[null_row_indeces]
null_pairs = cpd_kd_pairs_df.loc[null_row_indeces]

n_null_pairs = null_pairs.shape[0]
print('Number of cpd-KD pairs that have Null Values: ', n_null_pairs)

Number of cpd-KD pairs that have Null Values:  33157


In [315]:
# Count the null pairs whose KD never appears as `gene_1` in the STRING table,
# i.e. has no interaction partners at all.
# One membership test over the whole column replaces a DataFrame.query() per pair.
# reset_index() makes `gene_1` addressable whether it is a column or the named index.
null_kds = null_pairs.kd
genes_with_partners = set(string_gene_interactions_700.reset_index()['gene_1'])
n_no_partners = int((~null_kds.isin(genes_with_partners)).sum())

print('Number of cpd-KD pairs for which the KD has no interaction partners: ', n_no_partners)

Number of cpd-KD pairs for which the KD has no interaction partners:  14768


In [319]:
# Among the distinct KDs that occur in null pairs, keep those that do have at least
# one interaction partner in the STRING table (original order preserved).
# A set of all `gene_1` values is built once instead of running one query per KD.
# reset_index() makes `gene_1` addressable whether it is a column or the named index.
null_kds = null_pairs.kd.unique()
genes_with_partners = set(string_gene_interactions_700.reset_index()['gene_1'])
null_kds_with_partners = [nkd for nkd in null_kds if nkd in genes_with_partners]

print('Number of NaN KDs that do have partners: ', len(null_kds_with_partners))

Number of NaN KDs that do have partners:  259


In [331]:
# for every null KD that has partners, report each partner that itself has a KD
# (nothing is printed when none of the partners do)
for nkdwp in log_progress(null_kds_with_partners):
    partner_query = 'gene_1 == "{}"'.format(nkdwp)
    interaction_partners = string_gene_interactions_700.query(partner_query).gene_2.values
    partners_with_kd = (partner for partner in interaction_partners if partner in all_top_4_kds)
    for p in partners_with_kd:
        print('KD: ', nkdwp, 'Partner: ', p)

### So why is there missing data?
Based on the analysis above, we see that a cpd-kd pair might be missing data for one of two reasons:

1. The KD has no known interaction partners in the STRING database
2. None of the interaction partners have KDs in our top-4 cell line data.

What happens if we remove these null pairs altogether? Will we still have 71 compounds with true interactions to work with?

In [371]:
# mask of pairs for which every feature value is present
complete_row_indeces = ~null_row_indeces

# feature rows / pair rows restricted to the complete pairs
complete_data_rows = features_df.loc[complete_row_indeces]
complete_pair_rows = cpd_kd_pairs_df.loc[complete_row_indeces]

# the complete pairs that are labelled as true interactions
positive_pair_rows = complete_pair_rows[complete_pair_rows.label == 1]

n_positive_cpds = positive_pair_rows.cpd.unique().size
n_positive_kds = positive_pair_rows.kd.unique().size
print('Number of remaining positive cpds: ', n_positive_cpds)
print('Number of remaining positive KDs: ', n_positive_kds)
print('Number of remaining positive interactions: ', positive_pair_rows.shape[0])

Number of remaining positive cpds:  71
Number of remaining positive KDs:  151
Number of remaining positive interactions:  237


So if we remove all the missing pairs, we still have at least 1 positive interaction for all 71 of our compounds, which is fantastic.

___
# Constructing Training Data

In [402]:
# features, labels (as a one-column frame) and pair metadata for the complete rows
X = complete_data_rows
y = complete_pair_rows.label.to_frame(name='label')
metadata = complete_pair_rows

In [399]:
# scale the data: zero mean / unit variance per feature column
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# The scaler returns a bare array. Re-attach X's row index and column labels so that
# X_std stays aligned with y and metadata (which keep the original pair index) and
# each column can still be traced back to its feature; without this X_std would get
# a fresh 0..n-1 index that no longer matches the other two saved tables.
X_std = pd.DataFrame(sc.fit_transform(X), index=X.index, columns=X.columns)

In [403]:
# persist the scaled features, the labels and the pair metadata under features/
for training_frame, training_path in (
    (X_std, 'features/X_std.csv'),
    (y, 'features/y.csv'),
    (metadata, 'features/metadata.csv'),
):
    training_frame.to_csv(training_path)

In [394]:
# sanity check: (number of complete cpd-kd pairs, number of feature columns)
X_std.shape

(178920, 16)

In [401]:
# preview the first labels only, rather than rendering the whole label table
y.head(10)

Unnamed: 0,label
0,-1
1,-1
2,-1
3,-1
4,-1
5,-1
6,-1
7,-1
8,-1
9,-1
