# Leave-One-Out Cross Validation

We'll perform LOOCV using our modified LINCS Random Forest implementation and test the model's accuracy on each compound after training on the remaining compounds.


In [43]:
import pandas as pd
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier
from support_functions import log_progress
import scipy
import time
from joblib import Parallel, delayed

## Load training and validation data

In [2]:
# Number of CSV shards per table; the loading cells below iterate b over
# range(num_batches) and substitute b into each '..._sub_df_{}.csv' / '..._subdf_{}.csv' filename.
num_batches = 20

In [4]:
# Construct metadata table
# Read every shard first and concatenate once. DataFrame.from_csv and
# DataFrame.append are gone from current pandas, and appending inside the loop
# re-copied the whole growing table on every iteration.
pair_frames = []

for b in log_progress(range(num_batches)):
    df_filename = 'checkpoint_files/pair_set_4_subdf_{}.csv'.format(b)
    # index_col=0 reproduces the old from_csv default (first CSV column is the index)
    pair_frames.append(pd.read_csv(df_filename, index_col=0))

if pair_frames:
    # ignore_index=True replaces the former reset_index(drop=True)
    cpd_kd_pair_df = pd.concat(pair_frames, ignore_index=True)
else:
    # no shards requested: keep the empty table with the expected columns
    cpd_kd_pair_df = pd.DataFrame(columns=['cpd', 'kd', 'label'])

In [9]:
# Summarise the class balance of the pair table: number of labelled pairs,
# distinct compounds and distinct knockdown genes for each label value.
label_counts = cpd_kd_pair_df.label.value_counts()
true_pairs = cpd_kd_pair_df[cpd_kd_pair_df.label == 1]
false_pairs = cpd_kd_pair_df[cpd_kd_pair_df.label == 0]

print('Profile of training data: \n')
print('No. True interactions:\t', label_counts[1])
print('No. True cpds:\t\t', len(true_pairs.cpd.unique()))
print('No. True genes:\t\t', len(true_pairs.kd.unique()))
print('\nNo. False interactions:\t', label_counts[0])
print('No. False cpds:\t\t', len(false_pairs.cpd.unique()))
print('No. False genes:\t', len(false_pairs.kd.unique()))

Profile of training data: 

No. True interactions:	 592
No. True cpds:		 182
No. True genes:		 250

No. False interactions:	 2688751
No. False cpds:		 933
No. False genes:	 3233


In [10]:
# Direct Correlation Data
# Load all shards, then concatenate once (pd.read_csv / pd.concat replace the
# removed DataFrame.from_csv / DataFrame.append and avoid quadratic copying).
# Column order follows shard 0, as before.
dir_corr_frames = []

for b in log_progress(range(num_batches)):
    df_filename = 'features/top_7_dir_corr_sub_df_{}.csv'.format(b)
    dir_corr_frames.append(pd.read_csv(df_filename, index_col=0))

# ignore_index=True replaces the former reset_index(drop=True)
dir_corr_df = pd.concat(dir_corr_frames, ignore_index=True)

In [11]:
# Indirect Correlation Data
# The max / min / avg tables are sharded identically, so load them with one
# loop instead of three copy-pasted blocks, and concatenate each table once
# (pd.read_csv / pd.concat replace the removed from_csv / append APIs).
indir_stats = ('max', 'min', 'avg')
indir_frames = {stat: [] for stat in indir_stats}

for b in log_progress(range(num_batches)):
    for stat in indir_stats:
        df_filename = 'features/top_7_indir_{}_corr_sub_df_{}.csv'.format(stat, b)
        indir_frames[stat].append(pd.read_csv(df_filename, index_col=0))

# ignore_index=True gives these tables the same 0..N-1 row index as dir_corr_df;
# previously they kept the per-shard index, which repeats across shards.
indir_max_corr_df = pd.concat(indir_frames['max'], ignore_index=True)
indir_min_corr_df = pd.concat(indir_frames['min'], ignore_index=True)
indir_avg_corr_df = pd.concat(indir_frames['avg'], ignore_index=True)

In [12]:
# Compile features into dataframe
# Layout: four consecutive columns per cell line, in the order
# <cell>_dir, <cell>_max, <cell>_min, <cell>_avg. LincsRandomForestClassifier
# relies on this grouping.
cell_lines = dir_corr_df.columns
feature_blocks = []

for cell_line in log_progress(cell_lines):
    # .values drops the index, so the four tables are aligned purely by row position
    feature_blocks.append(pd.DataFrame({
        '{}_dir'.format(cell_line): dir_corr_df[cell_line].values,
        '{}_max'.format(cell_line): indir_max_corr_df[cell_line].values,
        '{}_min'.format(cell_line): indir_min_corr_df[cell_line].values,
        '{}_avg'.format(cell_line): indir_avg_corr_df[cell_line].values,
    }))

# Concatenate once; growing X_df inside the loop copied every earlier column
# again on each iteration.
X_df = pd.concat(feature_blocks, axis=1) if feature_blocks else pd.DataFrame()

## Cross validation

In [35]:
# Compounds that appear in at least one positively labelled (label == 1) pair;
# these are the compounds held out one at a time during cross validation.
is_positive_pair = cpd_kd_pair_df.label == 1
pos_cpds = cpd_kd_pair_df.loc[is_positive_pair, 'cpd'].unique()

# just for debugging
test_pos_cpds = ['BRD-K43389675']

In [83]:
# Leave-one-compound-out cross validation: for every positive compound, train on
# all pairs of the other compounds and rank the held-out compound's candidate
# targets by predicted probability.
# NOTE: LincsRandomForestClassifier is defined in a later cell of this notebook;
# that cell must be executed before this one.
loo_results = []
loocv_result_df = pd.DataFrame()

for cpd in log_progress(pos_cpds):

    # split train/test, testing 1 cpd at a time
    is_test = cpd_kd_pair_df.cpd == cpd
    test_idx = cpd_kd_pair_df[is_test].index
    train_idx = cpd_kd_pair_df[~is_test].index
    X_test = X_df.loc[test_idx].values
    X_train = X_df.loc[train_idx].values
    y_train = cpd_kd_pair_df.loc[train_idx].label.values.astype(int)

    # train the model
    # max_features="sqrt" is what "auto" meant for classifiers; "auto" itself
    # is rejected by scikit-learn >= 1.3.
    LRF = LincsRandomForestClassifier(n_cells_per_forest=3,
                                      n_estimators_per_forest=100,
                                      max_depth=12,
                                      max_features="sqrt",
                                      class_weight="balanced_subsample",
                                      random_state=1)
    LRF.fit(X_train, y_train)

    # predict probabilities for test cpd's potential targets
    test_proba_ = LRF.predict_proba_(X_test)

    # rank potential targets by predicted probability (rank 1 = most likely)
    test_result_df = cpd_kd_pair_df.loc[test_idx].copy()
    test_result_df['proba'] = test_proba_[:, 1]
    test_result_df['rank'] = test_result_df.proba.rank(ascending=False)
    test_result_df['n_potential_targets'] = len(test_result_df)

    # record rank of true target(s) and which cell lines had data
    true_target_result = test_result_df[test_result_df.label == 1]
    true_target_cell_lines = ~dir_corr_df.loc[true_target_result.index].isnull()
    loo_results.append(pd.concat([true_target_result, true_target_cell_lines], axis=1))

    # record results; rewritten after every compound so that an interrupted
    # run still leaves the results gathered so far on disk
    loocv_result_df = pd.concat(loo_results)
    loocv_result_df.to_csv('results/loocv_results_min3_tree100_depth12_balanced.csv')

In [66]:
class LincsRandomForestClassifier(object):
    """Ensemble of random forests, one forest per combination of cell lines.

    The feature matrix is assumed to be grouped by cell line, with
    ``n_features_per_cell`` consecutive columns per cell line (4 by default).
    Missing measurements are encoded as NaN. ``fit`` trains one
    ``RandomForestClassifier`` for every combination of
    ``n_cells_per_forest`` cell lines, using only the samples that have
    complete data for that combination. Prediction pools the hard votes of
    every tree in every forest whose cell lines are all available for the
    sample.

    Parameters
    ----------
    n_cells_per_forest : int
        Number of cell lines whose features each forest is trained on.
    n_estimators_per_forest : int
        Number of trees in each forest.
    max_depth, class_weight, random_state
        Passed through to every ``RandomForestClassifier``.
    max_features
        Passed through to every ``RandomForestClassifier``; the legacy value
        ``"auto"`` is translated to ``"sqrt"``.
    n_features_per_cell : int
        Number of consecutive feature columns belonging to one cell line.
    """

    def __init__(self, n_cells_per_forest,
                 n_estimators_per_forest=10,
                 max_depth=None,
                 max_features="auto",
                 class_weight="balanced_subsample",
                 random_state=1,
                 n_features_per_cell=4):
        if n_features_per_cell < 1:
            raise ValueError("n_features_per_cell must be >= 1")
        self.n_cells_per_forest = n_cells_per_forest
        self.n_estimators_per_forest = n_estimators_per_forest
        self.max_depth = max_depth
        self.max_features = max_features
        self.class_weight = class_weight
        self.random_state = random_state
        self.n_features_per_cell = n_features_per_cell

    def _cell_columns(self, cell_subset):
        '''Column indices of all features belonging to the given cell lines.'''
        width = self.n_features_per_cell
        return np.array([width * cell + np.arange(width) for cell in cell_subset],
                        dtype=int).reshape(-1)

    def fit(self, X, y):
        '''
        Train several random forests, each one on a different
        subset of cells. Store forests in a dictionary called
        self.forests, keyed by the tuple of cell indices.

        X is a 2-D array (samples x features, NaN = missing) and y the
        matching 1-D label array. Returns self.
        '''
        X = np.asarray(X)
        y = np.asarray(y)

        # make sure we have enough data to work with
        min_num_cells = self.get_min_num_cells(X)
        assert min_num_cells >= self.n_cells_per_forest, "Too much missing data for n_cells_per_forest = %s. (Some samples only tested in %d cells)" % \
                                                         (self.n_cells_per_forest, min_num_cells)

        # generate cell subsets for training
        total_num_cells = X.shape[1] // self.n_features_per_cell
        cell_subsets = itertools.combinations(range(total_num_cells), self.n_cells_per_forest)

        # "auto" meant "sqrt" for classifiers and is rejected by scikit-learn >= 1.3
        max_features = "sqrt" if self.max_features == "auto" else self.max_features

        # initialize dictionary to hold the forests
        self.forests = {}

        # train forest on each subset
        for cell_subset in log_progress(cell_subsets, every=1):
            # plain-int tuple so predict_proba can look the forest up again
            cell_subset = tuple(int(cell) for cell in cell_subset)

            # keep only samples that have complete data for this cell subset
            cell_subset_data = X[:, self._cell_columns(cell_subset)]
            bad_sample_idx = np.isnan(cell_subset_data).any(axis=1)
            good_samples = cell_subset_data[~bad_sample_idx]
            good_labels = y[~bad_sample_idx]

            # train and store a RF classifier on this training subset
            forest = RandomForestClassifier(criterion='gini',
                                            n_estimators=self.n_estimators_per_forest,
                                            max_depth=self.max_depth,
                                            max_features=max_features,
                                            class_weight=self.class_weight,
                                            random_state=self.random_state,
                                            n_jobs=-1)
            forest.fit(good_samples, good_labels)
            self.forests[cell_subset] = forest

        return self

    def get_min_num_cells(self, X):
        '''
        Calculate the minimum number of cells any sample has data for:
        the per-row count of non-NaN features divided by
        n_features_per_cell, minimised over rows.
        '''
        X_not_missing = ~np.isnan(X)
        num_cells_not_missing = np.count_nonzero(X_not_missing, axis=1) / self.n_features_per_cell
        min_num_cells = np.min(num_cells_not_missing)
        return min_num_cells

    def predict_proba(self, X):
        '''
        Return [P(class 0), P(class 1)] for ONE sample (a 1-D feature vector).

        The probability of class 1 is the fraction of trees voting 1, pooled
        over every forest whose cell lines all have data for this sample.
        Returns [nan, nan] when no forest is applicable, i.e. the sample has
        complete data for fewer than n_cells_per_forest cell lines.
        '''
        X = np.asarray(X, dtype=float)
        width = self.n_features_per_cell

        # A cell line is usable only if ALL of its features are present; this
        # mirrors the complete-data filter applied to training samples in fit.
        n_cells = X.shape[0] // width
        per_cell = X[:n_cells * width].reshape(n_cells, width)
        good_cells = np.flatnonzero(~np.isnan(per_cell).any(axis=1)).tolist()

        # select appropriate forests and collect the vote of every tree
        votes = []
        for cell_subset in itertools.combinations(good_cells, self.n_cells_per_forest):
            cell_subset_data = X[self._cell_columns(cell_subset)].reshape(1, -1)
            forest = self.forests[cell_subset]
            # NOTE(review): the votes are summed as numbers, which presumably
            # relies on the training labels being 0/1 - confirm.
            votes.extend(tree.predict(cell_subset_data)[0] for tree in forest.estimators_)

        if not votes:
            # no applicable forest: make the undefined result explicit instead
            # of dividing by zero
            return np.array([np.nan, np.nan])

        # majority vote of all the trees in all the forests
        proba = float(np.sum(votes)) / len(votes)
        return np.array([1. - proba, proba])

    def predict(self, X):
        '''
        Return the predicted class label for ONE sample (a 1-D feature vector).
        '''
        class_probabilities = self.predict_proba(X)
        return np.argmax(class_probabilities)

    def predict_proba_(self, X):
        '''
        Class probabilities for a 2-D X (one sample per row).
        Returns an array of shape (len(X), 2).
        '''
        proba_ = [self.predict_proba(X[i]) for i in range(len(X))]
        # reshape keeps the two-column shape even when X has no rows
        return np.array(proba_).reshape(-1, 2)

    def predict_(self, X):
        '''
        Predicted class labels for a 2-D X (one sample per row).
        '''
        predicted_classes = np.array([ self.predict(x) for x in X ])
        return predicted_classes

In [84]:
# A failed attempt at implementing a parallelized 'predict_proba_' method for the LRF class

# def predict_proba_(inputs):
#     model, X = inputs
#     model.predict_proba_(X)

# def LRF_predict_proba_parallel_(model, data):
#     '''
#     Predict probabilities for a multidimensional X
#     '''
#     # calculate the chunk size as an integer
#     import multiprocessing
#     num_processes = multiprocessing.cpu_count()
#     chunk_size = int(len(data)/num_processes)

#     # break the dataframe up into chunks
#     X_chunks = [ data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
#     chunks = [ (model,chunk) for chunk in X_chunks ]

#     # filter each section of the pairs dataframe in parallel
#     results = Parallel(n_jobs=num_processes, backend="multiprocessing")(
#              map(delayed(predict_proba_), chunks))
    
    
# #     pool = multiprocessing.Pool(processes=num_processes)
# #     result = pool.map(model.predict_proba_, X_chunks)
# #     pool.close()

#     flat_result = functools.reduce(lambda x,y: x+y, results)
#     return flat_result