In [1]:
import sys
sys.path.insert(0, '../../')

from data import load_data_gse135820 as gse135820, load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873, load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

from pipeline import SMLA

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix


from constants import N_FOLDS, RANDOM_STATE
from util import join_values

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os


def _export_inference(outcome, y_hat, path):
    """Save the observed outcome next to its predicted score as a CSV file.

    Args:
        outcome: pandas Series with the observed outcome of each sample.
        y_hat: predictions for the same samples, in the same order.
        path: destination CSV path.
    """
    # Build a fresh two-column frame instead of writing into the Series:
    # `series['y_hat'] = values` addresses a single row label, not a column,
    # and would also modify a selection taken from the outcome table.
    frame = outcome.to_frame()
    frame['y_hat'] = np.ravel(y_hat)
    frame.to_csv(path, index=True, sep=',')


def experiment_pipeline(predictor):
    """Run the stratified k-fold SMLA benchmark over the five GSE datasets.

    For every dataset and every fold, an ``SMLA`` pipeline is fitted on the
    training split, pickled to disk, and scored on both splits (ROC AUC,
    log-loss, and the metrics returned by ``classification_metrics`` at a
    threshold tuned on the training predictions). Per-sample predictions are
    exported as CSV files under ``output/smla/<predictor>/<dataset>/inference``.

    Args:
        predictor: name of the predictor forwarded to ``SMLA`` (for example
            ``'lightgbm'`` or ``'knn'``); also used to build the output paths.

    Returns:
        pandas.DataFrame with one row per (dataset, fold) holding the metrics.
        The same table is written to ``<predictor>_metrics.csv``.
    """
    result = {c: [] for c in ['dataset', 'experiment', 'train_auc', 'valid_auc', 'predictor',
                              'train_loss', 'valid_loss', 'execution_time', 'threshold']}

    dataset_ids = ['GSE135820', 'GSE136400', 'GSE94873', 'GSE96058', 'GSE68465']
    loaders = [gse135820, gse136400, gse94873, gse96058, gse68465]

    for dataset, load in zip(dataset_ids, loaders):

        print('=============================================================================')
        print('Dataset {}'.format(dataset))
        print('=============================================================================\n')

        base_path = os.path.join('output/smla/', predictor, dataset)
        os.makedirs(os.path.join(base_path, 'inference'), exist_ok=True)

        # Loader returns clinical markers, gene data and outcome, in that order.
        c, g, o = load()

        # N_FOLDS-fold CV splits, shuffled and stratified by the outcome only.
        kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
        split = kfold.split(np.zeros(o.shape[0]), o)

        for experiment, (train_index, valid_index) in enumerate(split):

            print('*************************************************************************')
            print('Experiment {} of {}'.format(experiment + 1, N_FOLDS))
            print('*************************************************************************\n')

            initial_time = time.time()

            ###################################################################
            # Split train & valid
            ###################################################################

            clinical_outcome_train = o.iloc[train_index, 0]
            clinical_outcome_valid = o.iloc[valid_index, 0]

            clinical_markers_train = c.iloc[train_index, :]
            clinical_markers_valid = c.iloc[valid_index, :]

            genes_train = g.iloc[train_index, :]
            genes_valid = g.iloc[valid_index, :]

            # Only LightGBM receives fixed parameters. A fresh dict is built
            # for every fold so nothing can leak from one fit to the next.
            if predictor == 'lightgbm':
                model_default_params = {'objective': 'binary'}
            else:
                model_default_params = None

            optimizer_default_params = {'fixed_parameters': model_default_params}

            smla = SMLA(
                predictor=predictor,
                optimizer_default_params=optimizer_default_params,
                model_default_params=model_default_params,
                random_state=RANDOM_STATE,
                use_gpu=True,
                test_size=.2,
                verbose=-1,
                output_path=base_path
            )

            # Fit the model with the SMLA pipeline.
            smla.fit(clinical_markers=clinical_markers_train,
                     genes=genes_train,
                     outcome=clinical_outcome_train,
                     clinical_marker_selection_threshold=0.05,
                     genes_marker_selection_threshold=0.05)

            with open('{}/trained_model_{}.pkl'.format(base_path, experiment), 'wb') as file:
                pkl.dump(smla, file)

            y_hat_train = smla.predict(clinical_markers=clinical_markers_train,
                                       genes=genes_train)

            y_hat_valid = smla.predict(clinical_markers=clinical_markers_valid,
                                       genes=genes_valid)

            ###################################################################
            # Analysing Performance
            ###################################################################

            train_auc = roc_auc_score(clinical_outcome_train, y_hat_train)
            valid_auc = roc_auc_score(clinical_outcome_valid, y_hat_valid)

            train_loss = log_loss(clinical_outcome_train, y_hat_train)
            valid_loss = log_loss(clinical_outcome_valid, y_hat_valid)

            # The decision threshold is tuned on the training predictions; when
            # the optimiser yields nothing, fall back to the training mean of
            # the outcome.
            opt_threshold = optimize_threshold(clinical_outcome_train, y_hat_train)

            if opt_threshold is None:
                opt_threshold = np.mean(clinical_outcome_train)

            # Binarise the validation scores and unpack the confusion matrix.
            y_pred_valid = (np.ravel(y_hat_valid) >= opt_threshold).astype(int)
            tn, fp, fn, tp = confusion_matrix(clinical_outcome_valid, y_pred_valid).ravel()

            classification_results = classification_metrics(tn, fp, fn, tp)

            # Metric columns are created lazily the first time they show up.
            for k in classification_results:
                if k not in result:
                    result[k] = []
                result[k].append(classification_results[k])

            result['experiment'].append(experiment)
            result['predictor'].append(predictor)
            result['train_auc'].append(train_auc)
            result['valid_auc'].append(valid_auc)
            result['train_loss'].append(train_loss)
            result['valid_loss'].append(valid_loss)
            result['execution_time'].append(time.time() - initial_time)
            result['threshold'].append(opt_threshold)
            result['dataset'].append(dataset)

            print('* Selected genes: {}'.format(len(smla.selected_genes[0])))
            print('* Selected clinical markers: {}\n'.format(len(smla.selected_clinical[0])))

            print('* Train AUC: {}'.format(train_auc))
            print('* Valid AUC: {}\n'.format(valid_auc))

            print('* Execution time: {:10.2f} minutes\n'.format((time.time() - initial_time) / 60.))

            # Exporting inference
            _export_inference(clinical_outcome_train, y_hat_train,
                              '{}/inference/train_{}.csv'.format(base_path, experiment))
            _export_inference(clinical_outcome_valid, y_hat_valid,
                              '{}/inference/valid_{}.csv'.format(base_path, experiment))

    result = pd.DataFrame(result)

    # NOTE(review): `base_path` still points at the last dataset processed, so
    # the combined metrics of all datasets land inside that dataset's folder.
    # Kept as-is so existing readers of this file keep working; consider
    # moving it to `output/smla/<predictor>/`.
    result.to_csv(os.path.join(base_path, '{}_metrics.csv'.format(predictor)))

    return result




Using TensorFlow backend.


In [2]:
# Run the full cross-validation benchmark with the LightGBM predictor.
r = experiment_pipeline('lightgbm')

# Per-dataset average of the fold metrics. `numeric_only=True` skips the string
# `predictor` column explicitly; pandas >= 2.0 raises a TypeError otherwise.
r.groupby('dataset').mean(numeric_only=True)

Dataset GSE135820

*************************************************************************
Experiment 1 of 5
*************************************************************************

* Selected genes: 300
* Selected clinical markers: 4

* Train AUC: 1.0
* Valid AUC: 0.8245854631592071

* Execution time:       0.54 minutes

*************************************************************************
Experiment 2 of 5
*************************************************************************

* Selected genes: 300
* Selected clinical markers: 4

* Train AUC: 1.0
* Valid AUC: 0.7651845723158527

* Execution time:       0.69 minutes

*************************************************************************
Experiment 3 of 5
*************************************************************************

* Selected genes: 300
* Selected clinical markers: 5

* Train AUC: 1.0
* Valid AUC: 0.7777878523421473

* Execution time:       0.57 minutes

******************************************************

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GSE135820,2,1.0,0.785422,0.028513,0.446175,34.102823,0.130512,0.798052,0.469163,0.570629,0.850796
GSE136400,2,1.0,0.724835,0.011422,0.727231,41.492467,0.037148,0.543559,0.514059,0.961429,0.155596
GSE68465,2,1.0,0.612377,0.007304,1.007178,36.676425,0.024405,0.491062,0.461035,0.902564,0.166367
GSE94873,2,1.0,0.704005,0.015755,0.80202,15.259104,0.054947,0.562531,0.506657,0.903702,0.286392
GSE96058,2,1.0,0.896156,0.030533,0.430083,67.875472,0.182529,0.798186,0.728007,0.953079,0.643207


In [None]:
# Run the same cross-validation benchmark with the k-nearest-neighbours predictor.
knn_result = experiment_pipeline('knn')

# Per-dataset average of the fold metrics. `numeric_only=True` skips the string
# `predictor` column explicitly; pandas >= 2.0 raises a TypeError otherwise.
knn_result.groupby('dataset').mean(numeric_only=True)