In [1]:
import sys
sys.path.insert(0, '../../')

from data import load_data_gse135820 as gse135820, load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873, load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

from pipeline import SMLA

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix

from constants import N_FOLDS, RANDOM_STATE
from util import join_values

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os


def _clear_output_tree(base_path):
    """Delete every file and sub-directory below ``base_path``, keeping ``base_path`` itself.

    The tree is walked bottom-up so that each directory has already been emptied
    by the time it is removed.
    """
    for root, subdirs, files in os.walk(base_path, topdown=False):
        for item in files:
            file_path = os.path.join(root, item)
            if os.path.isfile(file_path):
                os.remove(file_path)
        for subdir in subdirs:
            os.rmdir(os.path.join(root, subdir))


def experiment_pipeline(predictor):
    """Cross-validate the SMLA pipeline with one predictor on every dataset.

    For each of the GSE94873, GSE68465 and GSE135820 datasets the samples are
    split into ``N_FOLDS`` folds stratified on the outcome. On every fold an
    ``SMLA`` instance is fitted on the training part, pickled, and used to
    predict both the training and the validation part. AUC and log-loss are
    computed on both parts; the classification metrics are computed on the
    validation part using a threshold tuned on the training predictions.

    Side effects
    ------------
    * Everything below ``output/smla/<predictor>`` is deleted before the run.
    * ``output/smla/<predictor>/<dataset>/trained_model_<fold>.pkl`` receives
      the fitted pipeline of each fold.
    * ``output/smla/<predictor>/<dataset>/inference/{train,valid}_<fold>.csv``
      receive the outcome together with the predictions (``y_hat``).
    * ``output/smla/<predictor>/<predictor>_metrics.csv`` receives the returned
      metrics table.

    Parameters
    ----------
    predictor : str
        Predictor name forwarded to ``SMLA``; it also names the output
        directory. ``'lightgbm'`` additionally gets a fixed binary objective.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, fold) with the columns ``dataset``,
        ``experiment``, ``train_auc``, ``valid_auc``, ``predictor``,
        ``train_loss``, ``valid_loss``, ``execution_time`` (seconds),
        ``threshold`` and one column per key returned by
        ``classification_metrics``.
    """
    result = {column: [] for column in ['dataset', 'experiment', 'train_auc', 'valid_auc', 'predictor',
                                        'train_loss', 'valid_loss', 'execution_time', 'threshold']}

    # drop old files
    base_path = os.path.join('output', 'smla', predictor)
    _clear_output_tree(base_path)

    # Each dataset id is paired with the loader that returns
    # (clinical markers, genes, outcome) for it.
    datasets = [('GSE94873', gse94873), ('GSE68465', gse68465), ('GSE135820', gse135820)]

    for dataset_name, load_dataset in datasets:

        print('=============================================================================')
        print('Dataset {}'.format(dataset_name))
        print('=============================================================================\n')

        path = os.path.join(base_path, dataset_name)

        if not os.path.exists(path):
            os.makedirs(path)

        clinical_markers, genes, outcome = load_dataset()

        # K-fold CV splits stratified on the outcome only. The zeros array is a
        # placeholder for X: only the outcome is passed as the stratification target.
        kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
        split = kfold.split(np.zeros(outcome.shape[0]), outcome)

        for experiment, (train_index, valid_index) in enumerate(split):

            print('*************************************************************************')
            print('Experiment {} of {}'.format(experiment + 1, N_FOLDS))
            print('*************************************************************************\n')

            initial_time = time.time()

            #######################################################################################################
            # Split train & valid
            #######################################################################################################

            # The outcome is taken from the first column of the outcome frame.
            clinical_outcome_train = outcome.iloc[train_index, 0]
            clinical_outcome_valid = outcome.iloc[valid_index, 0]

            clinical_markers_train = clinical_markers.iloc[train_index, :]
            clinical_markers_valid = clinical_markers.iloc[valid_index, :]

            genes_train = genes.iloc[train_index, :]
            genes_valid = genes.iloc[valid_index, :]

            #######################################################################################################
            # Train (always from scratch: previously pickled models were deleted above)
            #######################################################################################################

            filename = os.path.join(path, 'trained_model_{}.pkl'.format(experiment))

            if predictor == 'lightgbm':
                model_default_params = {
                    'objective': 'binary'
                }
            else:
                model_default_params = None

            optimizer_default_params = {'fixed_parameters': model_default_params}

            smla = SMLA(
                    predictor=predictor,
                    optimizer_default_params=optimizer_default_params,
                    model_default_params=model_default_params,
                    random_state=RANDOM_STATE,
                    use_gpu=True,
                    verbose=-1,
                    output_path=path,
                    experiment_number=experiment,
                    number_of_experiments=N_FOLDS
            )

            # fit model based on SMLA pipeline
            smla.fit(clinical_markers=clinical_markers_train,
                     genes=genes_train,
                     outcome=clinical_outcome_train,
                     clinical_marker_selection_threshold=0.05,
                     genes_marker_selection_threshold=0.05)

            with open(filename, 'wb') as file:
                pkl.dump(smla, file)

            y_hat_train = smla.predict(clinical_markers=clinical_markers_train,
                                       genes=genes_train)

            y_hat_valid = smla.predict(clinical_markers=clinical_markers_valid,
                                       genes=genes_valid)

            #################################################################################################
            # Analysing Performance
            #################################################################################################

            # Computing AUC
            train_auc = roc_auc_score(clinical_outcome_train, y_hat_train)
            valid_auc = roc_auc_score(clinical_outcome_valid, y_hat_valid)

            # Computing logLoss
            train_loss = log_loss(clinical_outcome_train, y_hat_train)
            valid_loss = log_loss(clinical_outcome_valid, y_hat_valid)

            # The decision threshold is tuned on the training predictions only;
            # when no threshold is returned, fall back to the mean training outcome.
            opt_threshold = optimize_threshold(clinical_outcome_train, y_hat_train)

            if opt_threshold is None:
                opt_threshold = np.mean(clinical_outcome_train)

            # compute confusion matrix on the validation fold
            y_pred_valid = [int(y >= opt_threshold) for y in y_hat_valid]
            tn, fp, fn, tp = confusion_matrix(clinical_outcome_valid, y_pred_valid).ravel()

            classification_results = classification_metrics(tn, fp, fn, tp)

            # add results to data frame (dict for now); metric columns are
            # created the first time their key is seen
            for k in classification_results:
                if k not in result:
                    result[k] = []
                result[k].append(classification_results[k])

            result['experiment'].append(experiment)
            result['predictor'].append(predictor)
            result['train_auc'].append(train_auc)
            result['valid_auc'].append(valid_auc)
            result['train_loss'].append(train_loss)
            result['valid_loss'].append(valid_loss)
            result['execution_time'].append(time.time() - initial_time)
            result['threshold'].append(opt_threshold)
            result['dataset'].append(dataset_name)

            print('* Selected genes: {}'.format(len(smla.selected_genes[0])))
            print('* Selected clinical markers: {}\n'.format(len(smla.selected_clinical[0])))

            print('* Train AUC: {}'.format(train_auc))
            print('* Valid AUC: {}\n'.format(valid_auc))

            print('* Execution time: {:10.2f} minutes\n'.format((time.time() - initial_time) / 60.))

            # Exporting inference

            inference_path = os.path.join(path, 'inference')

            if not os.path.exists(inference_path):
                os.makedirs(inference_path)

            clinical_outcome_train = pd.DataFrame(clinical_outcome_train)
            clinical_outcome_train['y_hat'] = y_hat_train
            clinical_outcome_train.to_csv(os.path.join(inference_path, 'train_{}.csv'.format(experiment)), index=True, sep=',')

            clinical_outcome_valid = pd.DataFrame(clinical_outcome_valid)
            clinical_outcome_valid['y_hat'] = y_hat_valid
            clinical_outcome_valid.to_csv(os.path.join(inference_path, 'valid_{}.csv'.format(experiment)), index=True, sep=',')

    result = pd.DataFrame(result)

    # The row index is written too, so the file starts with an unnamed index column.
    result.to_csv(os.path.join(base_path, '{}_metrics.csv'.format(predictor)))

    return result




Using TensorFlow backend.


# KNN

In [None]:
# Full KNN run over all datasets and folds. This first deletes everything
# under output/smla/knn and then retrains every fold from scratch.
knn_result = experiment_pipeline('knn')

In [2]:
# Per-dataset mean of each numeric metric over the CV folds. numeric_only=True
# skips the string 'predictor' column, which pandas >= 2.0 refuses to average.
pd.read_csv('output/smla/knn/knn_metrics.csv').groupby('dataset').mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GSE135820,12,2,1.0,0.707894,9.992007e-16,1.916901,251.542027,0.01,0.55555,0.264648,0.765035,0.506966
GSE68465,2,2,0.903253,0.578289,0.2798406,2.872321,166.196109,0.21,0.509193,0.468359,0.666667,0.385633
GSE94873,7,2,0.972241,0.672948,0.08986841,2.098194,111.101582,0.09,0.530528,0.490866,0.848365,0.273608


# MLP

In [None]:
# Full MLP run over all datasets and folds. This first deletes everything
# under output/smla/mlp and then retrains every fold from scratch.
mlp_result = experiment_pipeline('mlp')

In [3]:
# Per-dataset mean of each numeric metric over the CV folds. numeric_only=True
# skips the string 'predictor' column, which pandas >= 2.0 refuses to average.
pd.read_csv('output/smla/mlp/mlp_metrics.csv').groupby('dataset').mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GSE135820,12,2,0.566771,0.564183,0.570786,0.571501,79.878343,0.380264,0.469919,0.212599,0.657343,0.426403
GSE68465,2,2,0.555539,0.523443,0.68545,0.69004,85.330967,0.45267,0.515935,0.52255,0.54359,0.495265
GSE94873,7,2,0.557187,0.518576,0.685194,0.689687,43.019105,0.464769,0.50708,0.456331,0.568125,0.458038


# SVM

In [None]:
# Full SVM run over all datasets and folds. This first deletes everything
# under output/smla/svm and then retrains every fold from scratch.
svm_result = experiment_pipeline('svm')

In [4]:
# Per-dataset mean of each numeric metric over the CV folds. numeric_only=True
# skips the string 'predictor' column, which pandas >= 2.0 refuses to average.
pd.read_csv('output/smla/svm/svm_metrics.csv').groupby('dataset').mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GSE135820,12,2,0.637425,0.606111,0.49102,0.491941,68.237016,0.226555,0.298521,0.226327,0.921678,0.153971
GSE68465,2,2,0.678903,0.59208,0.668747,0.681453,96.809731,0.447719,0.474898,0.46605,0.892308,0.144
GSE94873,7,2,0.612051,0.60892,0.686613,0.696926,43.460898,0.472311,0.472048,0.460723,0.950769,0.085


# LightGBM

In [None]:
# Full LightGBM run over all datasets and folds. This first deletes everything
# under output/smla/lightgbm and then retrains every fold from scratch.
lgb_result = experiment_pipeline('lightgbm')

In [5]:
# Per-dataset mean of each numeric metric over the CV folds. numeric_only=True
# skips the string 'predictor' column, which pandas >= 2.0 refuses to average.
pd.read_csv('output/smla/lightgbm/lightgbm_metrics.csv').groupby('dataset').mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GSE135820,12,2,0.913337,0.769595,0.306304,0.413946,85.763131,0.161811,0.755141,0.403265,0.611189,0.788532
GSE68465,7,2,0.970958,0.614493,0.2275,0.814158,98.076477,0.442486,0.592722,0.555161,0.507692,0.659265
GSE94873,2,2,0.906724,0.679942,0.410384,0.68841,44.759493,0.442952,0.631979,0.579699,0.626779,0.635759
