# Analysis Code


In this notebook we analyse the results from the experiments in three ways:


*   Gather per experiment performance metrics
*   Create a mean prediction minus target dataset 
*   Aggregate the performance metrics across the dataset-algorithm combinations

We analyse both the **test set** --- which is disjoint from, but created from the same dataset as, the training set --- and an **independent test set** --- which was produced separately.




In [0]:
# Mount Google Drive (forcing a remount) so the paths below are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import pandas as pd 
import numpy as np

from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error
# NOTE(review): later cells call scipy.stats.pearsonr; presumably the stats submodule
# is made available by this import (or by sklearn's own imports) -- confirm.
import scipy

# Directory prefix from which the per-run '<name>_Predictions_<i>.csv' files are read.
file_loc = '/content/drive/My Drive/Yeast_Growth_Project/predictions/'
# Directory prefix from which target data is read and to which result CSVs are written.
data_loc = '/content/drive/My Drive/Yeast_Growth_Project/data/'

In [0]:
# Test-set targets: read test_growth_target.csv, keep the second column while
# skipping the first data row, and store the values as a (227, 1) column vector.
target = (
    pd.read_csv(data_loc + 'test_growth_target.csv')
    .iloc[1:, 1]
    .to_numpy()
    .reshape(227, 1)
)

# Gathering Error Statistics on the Test Set

Here we gather the set of error statistics for a set of experiments

In [0]:

def get_stats_for_each_run(file_name, data_1, data_2,  integration_type, method_type, number):
  """Compute error metrics for every individual run of one experiment.

  Reads ``file_loc + file_name + '_Predictions_<i>.csv'`` for each ``i`` in
  ``range(number)`` and scores it against the module-level ``target`` array.

  Args:
    file_name: prefix of the prediction files; also stored in the 'file_name' column.
    data_1: label stored in the 'data_1' column.
    data_2: label stored in the 'data_2' column.
    integration_type: label stored in the 'integration_type' column.
    method_type: label stored in the 'method_type' column.
    number: how many runs (prediction files) to read.

  Returns:
    A DataFrame with one row per run and the columns 'file_name',
    'integration_type', 'method_type', 'data_1', 'data_2', 'mae', 'mdae',
    'rmse' and 'pcc' (Pearson correlation coefficient).
  """
  # Derive the expected number of test points from the target instead of
  # hard-coding it, so the function keeps working if the test set changes size.
  n_rows = target.shape[0]

  maes = np.zeros(number)
  mdaes = np.zeros(number)
  rmses = np.zeros(number)
  pccs = np.zeros(number)

  for run in range(number):
    raw = pd.read_csv(file_loc + file_name + '_Predictions_' + str(run) + '.csv')
    if raw.shape == (n_rows, 2):
      # The R code models have a different format in predictions:
      # the values sit in the second column.
      pred = raw.iloc[:, 1].to_numpy().reshape(n_rows, 1)
    elif raw.shape == (n_rows + 1, 2):
      # Same two-column layout, but with one extra leading row to drop.
      pred = raw.iloc[1:, 1].to_numpy().reshape(n_rows, 1)
    else:
      pred = raw.to_numpy().reshape(n_rows, 1)

    # sklearn's signature is (y_true, y_pred); these metrics are symmetric,
    # so the values are the same as with the arguments swapped.
    maes[run] = mean_absolute_error(target, pred)
    mdaes[run] = median_absolute_error(target, pred)
    rmses[run] = np.sqrt(mean_squared_error(target, pred))
    pccs[run] = scipy.stats.pearsonr(pred[:, 0], target[:, 0])[0]

  return pd.DataFrame({'file_name' : [file_name] * number,
                       'integration_type' : [integration_type] * number,
                       'method_type' : [method_type] * number,
                       'data_1' : [data_1] * number,
                       'data_2' : [data_2] * number,
                       'mae' : maes,
                       'mdae' : mdaes,
                       'rmse' : rmses,
                       'pcc' : pccs})

Calculate the mean and standard deviation of a list of stats (the standard deviation is what we report as the spread of each metric)

In [0]:
def mean_and_conf(stat):
  """Return ``(mean, standard deviation)`` of ``stat``.

  The second value is ``np.std`` with its default settings, i.e. the
  population standard deviation, not a confidence interval.
  """
  centre = np.mean(stat)
  spread = np.std(stat)
  return centre, spread
    


Summarises the set of results from an experiment as a single set of metrics (the mean and standard deviation of each metric across runs)

In [0]:

def summarise_results(file_name,  data_1, data_2, integration_type, method_type, number):
  """Summarise all runs of one experiment as mean/std of each error metric.

  Reads ``file_loc + file_name + '_Predictions_<i>.csv'`` for each ``i`` in
  ``range(number)``, scores every run against the module-level ``target``
  array and aggregates the per-run metrics with ``mean_and_conf``.

  Args:
    file_name: prefix of the prediction files; also stored in the 'file_name' column.
    data_1: label stored in the 'data_1' column.
    data_2: label stored in the 'data_2' column.
    integration_type: label stored in the 'integration_type' column.
    method_type: label stored in the 'method_type' column.
    number: how many runs (prediction files) to read; must be at least 1.

  Returns:
    A single-row DataFrame with the label columns plus '<metric>_mean' and
    '<metric>_std' for mae, mdae, rmse and pcc.

  Raises:
    ValueError: if ``number`` is smaller than 1 (there is nothing to average).
  """
  if number < 1:
    raise ValueError('number must be at least 1 to summarise an experiment')

  # Derive the expected number of test points from the target instead of
  # hard-coding it, so the function keeps working if the test set changes size.
  n_rows = target.shape[0]

  maes = np.zeros(number)
  mdaes = np.zeros(number)
  rmses = np.zeros(number)
  pccs = np.zeros(number)

  for run in range(number):
    raw = pd.read_csv(file_loc + file_name + '_Predictions_' + str(run) + '.csv')
    if raw.shape == (n_rows, 2):
      # The R code models have a different format in predictions:
      # the values sit in the second column.
      pred = raw.iloc[:, 1].to_numpy().reshape(n_rows, 1)
    elif raw.shape == (n_rows + 1, 2):
      # Same two-column layout, but with one extra leading row to drop.
      pred = raw.iloc[1:, 1].to_numpy().reshape(n_rows, 1)
    else:
      pred = raw.to_numpy().reshape(n_rows, 1)

    # sklearn's signature is (y_true, y_pred); these metrics are symmetric,
    # so the values are the same as with the arguments swapped.
    maes[run] = mean_absolute_error(target, pred)
    mdaes[run] = median_absolute_error(target, pred)
    rmses[run] = np.sqrt(mean_squared_error(target, pred))
    pccs[run] = scipy.stats.pearsonr(pred[:, 0], target[:, 0])[0]

  # Aggregate once, after every run has been scored. Doing this inside the
  # loop would repeatedly average arrays that are still partly zero-filled.
  mae_m, mae_std = mean_and_conf(maes)
  mdae_m, mdae_std = mean_and_conf(mdaes)
  rmse_m, rmse_std = mean_and_conf(rmses)
  pcc_m, pcc_std = mean_and_conf(pccs)

  return pd.DataFrame({'file_name' : file_name,
                       'integration_type' : integration_type,
                       'method_type' : method_type,
                       'data_1' : data_1,
                       'data_2' : data_2,
                       'mae_mean' : mae_m,
                       'mae_std' : mae_std,
                       'mdae_mean' : mdae_m,
                       'mdae_std' : mdae_std,
                       'rmse_mean' : rmse_m,
                       'rmse_std' : rmse_std,
                       'pcc_mean' : pcc_m,
                       'pcc_std' : pcc_std}, index=[0])

Calculates the mean prediction for each test point in a set of experiments

In [0]:

  

def mean_prediction(file_name, number):
  """Average the predictions of all runs of one experiment, per test point.

  Reads ``file_loc + file_name + '_Predictions_<i>.csv'`` for each ``i`` in
  ``range(number)`` and returns the element-wise mean of the predictions.

  Args:
    file_name: prefix of the prediction files.
    number: how many runs (prediction files) to average over.

  Returns:
    A NumPy array of shape ``(target.shape[0], 1)`` holding the mean
    prediction for each test point.
  """
  # Same row count as the module-level target the predictions are compared to.
  n_rows = target.shape[0]

  predictions = np.zeros((n_rows, 1))
  for run in range(number):
    raw = pd.read_csv(file_loc + file_name + '_Predictions_' + str(run) + '.csv')
    if raw.shape == (n_rows, 2):
      # The R code models have a different format in predictions:
      # the values sit in the second column.
      pred = raw.iloc[:, 1].to_numpy().reshape(n_rows, 1)
    elif raw.shape == (n_rows + 1, 2):
      # Same two-column layout, but with one extra leading row to drop.
      pred = raw.iloc[1:, 1].to_numpy().reshape(n_rows, 1)
    else:
      # Force a column vector, as the other loaders in this notebook do, so an
      # unexpected layout raises instead of silently broadcasting into the sum.
      pred = raw.to_numpy().reshape(n_rows, 1)
    predictions = predictions + pred
  predictions /= number

  return predictions


In [0]:
# Number of repeated runs (prediction files) read per experiment.
# NOTE(review): the output file names below say "100_experiments" -- confirm which is intended.
experiments_ran = 98

# One entry per experiment: [file_name, data_1, data_2, integration_type, method_type].
# Each experiment is listed once; repeated 'fluxes' and 'concate_Flu_GE'
# entries were removed because they duplicated rows in the exported results.
# NOTE(review): 'bagged_metabolic_expression_fluxes_rf_' is labelled 'Early' while
# 'bagged_expression_fluxes_rf' is 'Late', and 'SGL' / 'sgl' differ in case -- confirm.
names = [['sgl_svm', 'sgl', 'Na', 'Early', 'svm'],
         ['iRF_svm', 'iRF', 'Na', 'Early', 'svm'],
         ['genetic_rf', 'NSGA-II', 'Na', 'Early', 'rf'],
         ['genetic_svm', 'NSGA-II', 'Na', 'Early', 'svm'],
         ['expression_svm', 'expression', 'Na', 'None', 'svm'],
         ['bagged_expression_fluxes_rf', 'expression', 'fluxes', 'Late', 'rf'],
         ['expression_rf', 'expression', 'Na', 'None', 'rf'],
         ['fluxes_rf', 'fluxes', 'Na', 'None', 'rf'],
         ['iRF_rf', 'iRF', 'Na', 'Early', 'rf'],
         ['sgl_rf', 'sgl', 'Na', 'Early', 'rf'],
         ['metabolic_expression_rf', 'metabolic_expression', 'Na', 'None', 'rf'],
         ['fluxes_svm', 'fluxes', 'Na', 'None', 'svm'],
         ['metabolic_expression_svm', 'metabolic_expression', 'Na', 'None', 'svm'],
         ['metabolic_expression', 'metabolic_expression', 'Na', 'None', 'dl'],
         ['expression', 'expression', 'Na', 'None', 'dl'],
         ['fluxes', 'fluxes', 'Na', 'None', 'dl'],
         ['concat_expression_fluxes_svm', 'expression', 'fluxes', 'Early', 'svm'],
         ['concate_Flu_GE', 'expression', 'fluxes', 'Early', 'dl'],
         ['iRF', 'iRF', 'Na', 'Early', 'dl'],
         ['multi_model_metabolic_expression', 'metabolic_expression', 'fluxes', 'Intermediate', 'dl'],
         ['multi_model_full_expression', 'expression', 'fluxes', 'Intermediate', 'dl'],
         ['SGL', 'SGL', 'Na', 'Early', 'dl'],
         ['NSGA-II', 'NSGA-II', 'Na', 'Early', 'dl'],
         ['metabolic_expression_bemkl', 'metabolic_expression', 'fluxes', 'Intermediate', 'svm'],
         ['expression_bemkl', 'expression', 'fluxes', 'Intermediate', 'svm'],
         ['concat_expression_fluxes_rf', 'expression', 'fluxes', 'Early', 'rf'],
         ['bagged_metabolic_expression_fluxes_rf_', 'metabolic_expression', 'fluxes', 'Early', 'rf']]

# Collect the pieces in plain Python containers and build each DataFrame once,
# instead of re-concatenating a growing frame on every iteration.
summary_frames = []
per_run_frames = []
mean_errors = {}

for file_name, data_1, data_2, integration_type, method_type in names:
  summary_frames.append(
      summarise_results(file_name, data_1, data_2, integration_type, method_type, experiments_ran))
  per_run_frames.append(
      get_stats_for_each_run(file_name, data_1, data_2, integration_type, method_type, experiments_ran))
  # Mean prediction minus target, one column per experiment.
  mean_errors[file_name] = (mean_prediction(file_name, experiments_ran) - target)[:, 0]

# The newest result used to be prepended on each iteration, so the exported rows
# ran in reverse order of `names`; reverse here to keep that row order.
results_averaged = pd.concat(summary_frames[::-1], ignore_index=True)
results_all = pd.concat(per_run_frames[::-1], ignore_index=True)
predictions_vs_actual = pd.DataFrame(mean_errors)

print(results_averaged)

results_all.to_csv(data_loc + 'full_results_singles_100_experiments.csv', index = False)
results_averaged.to_csv(data_loc + 'full_results_100_experiments.csv', index = False)
predictions_vs_actual.to_csv(data_loc + 'predictions_minus_target.csv', index = False)

# Gathering Error Statistics on the Independent Test Set

In [0]:
# Location of the independent test set
# (data_loc already ends in '/', so this path contains a harmless double slash).
independent_target = pd.read_csv(data_loc + '/independent_target.csv', header = None)
independent_loc = file_loc + 'Independent/'

# Which indices relate to the double and single knockouts.
# Values greater than 0 in is_double_gene_knockout.csv mark double knockouts.
# NOTE(review): assumes that file has a single column -- confirm.
double_knockouts = pd.read_csv(data_loc +  'is_double_gene_knockout.csv', header = None) > 0
double_knockouts_index = np.where(double_knockouts)[0]
# Use `~` (logical not) to invert the boolean mask; unary minus on booleans is
# rejected by NumPy and only works through pandas special-casing.
single_knockouts_index = np.where(~double_knockouts)[0]


Here we take in a set of results on the independent dataset and produce a table of metrics on the results

In [0]:

def summarise_independent(data, algorithm, imputation_type,file_name,  double_only = False, single_only = False, header = True):
    """Score one prediction file against the independent test set.

    Reads ``independent_loc + file_name`` and compares it with the module-level
    ``independent_target``, optionally restricted to the rows listed in
    ``double_knockouts_index`` or ``single_knockouts_index``.

    Args:
      data: label stored in the 'data' column.
      algorithm: label stored in the 'algorithm' column.
      imputation_type: label stored in the 'imputation_method' column.
      file_name: name of the prediction file inside ``independent_loc``.
      double_only: score only the double-knockout rows. Takes precedence over
        ``single_only`` when both are set.
      single_only: score only the single-knockout rows.
      header: whether the prediction file has a header row.

    Returns:
      A single-row DataFrame with the columns 'data', 'algorithm',
      'imputation_method', 'genes' ('double', 'single' or 'all'), 'mae',
      'mdae', 'rmse' and 'pcc' (Pearson correlation coefficient).
    """
    # Without a header row, tell pandas not to consume the first line as one.
    read_options = {} if header else {'header': None}
    pred = pd.read_csv(independent_loc + file_name, **read_options)

    if pred.shape[1] == 2: # The R code models have a different format in predictions
        pred = pred.iloc[:, 1].to_frame()

    if double_only:
        rows, data_gene_type = double_knockouts_index, 'double'
    elif single_only:
        rows, data_gene_type = single_knockouts_index, 'single'
    else:
        rows, data_gene_type = None, 'all'

    if rows is None:
        target = independent_target
    else:
        pred = pred.iloc[rows]
        target = independent_target.iloc[rows, :]

    mae = mean_absolute_error(pred, target)
    mdae = median_absolute_error(pred, target)
    rmse = np.sqrt(mean_squared_error(pred, target))
    # pearsonr needs 1-D inputs, so flatten both sides.
    pcc = scipy.stats.pearsonr(pred.to_numpy().squeeze(), target.squeeze())[0]

    return pd.DataFrame({'data' : data,
                         'algorithm' : algorithm,
                         'imputation_method' : imputation_type,
                         'genes' : data_gene_type,
                         'mae' : mae,
                         'mdae' : mdae,
                         'rmse' : rmse,
                         'pcc' : pcc}, index=[0])


Creating a summary csv to be exported

In [0]:


# One entry per prediction file: (data, algorithm, imputation_method, file_name, has_header).
independent_experiments = [
    ('expression', 'svm', 'lr', 'expression_only_svm_independent_lr.csv', True),
    ('expression', 'svm', 'm', 'expression_only_svm_independent_m.csv', True),
    ('expression', 'dl', 'lr', 'independent_lr_dl_expression_only', False),
    ('expression', 'dl', 'm', 'independent_m_dl_expression_only', False),
    ('expression + fluxes', 'dl', 'lr', 'independent_lr_multi_modal_expression_fluxes', False),
    ('expression + fluxes', 'dl', 'm', 'independent_m_multi_modal_expression_fluxes', False),
]

# Every file is scored three times: all rows, double knockouts only, single knockouts only.
knockout_subsets = [{}, {'double_only': True}, {'single_only': True}]

independent_frames = [
    summarise_independent(data, algorithm, imputation, file_name, header=has_header, **subset)
    for data, algorithm, imputation, file_name, has_header in independent_experiments
    for subset in knockout_subsets
]

# Each new result used to be prepended to the table, so the exported rows ran in
# reverse call order; reverse here to keep that row order in the CSV.
independent_results = pd.concat(independent_frames[::-1], ignore_index = True)

independent_results.to_csv(data_loc + 'independent_results.csv')