In [1]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

import sys
sys.path.insert(0, '../../')

import logging
logging.getLogger("root").setLevel(logging.ERROR)

from data import load_data_gse135820 as gse135820
from data import load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873 
from data import load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

from pipeline import MuLT

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix


from constants import N_FOLDS, RANDOM_STATE
from util import join_values

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os

# creating analyser object to compute classification metrics
# and group them by training/validation dataset
# and by experiment id
# analyser = Analyser()

# removing files and folders
BASE_PATH = os.path.join('output', 'mult')

for root, subdirs, files in os.walk(BASE_PATH, topdown=False):
    for item in files:
        path = os.path.join(root, item)
        if os.path.isfile(path):
            os.remove(path)
    for s in subdirs:
        os.rmdir(os.path.join(root, s))

#
result = {c: [] for c in ['dataset', 'experiment', 'train_auc', 'valid_auc', 
                          'train_loss', 'valid_loss', 'execution_time', 'threshold']}

dataset_id = ['GSE94873', 'GSE68465', 'GSE135820']

# Set to False to reload the models pickled by a previous run instead of
# re-fitting them. NOTE: the clean-up loop at the top of this cell deletes
# every file under output/mult, so the reload branch can only find its
# pickles if that clean-up is skipped.
RETRAIN_MODELS = True

# Run the cross-validated MuLT experiment on every dataset. `dataset_id` and
# the loader list below are parallel: loader k belongs to dataset_id[k].
for i, func in enumerate([gse94873, gse68465, gse135820]):

    print('=============================================================================')
    print('Dataset {}'.format(dataset_id[i]))
    print('=============================================================================\n')

    # Per-dataset output folder. It deliberately gets its own name: rebinding
    # the global BASE_PATH here would leave it pointing at the last dataset's
    # folder after the loop, and the combined metrics file (written under
    # BASE_PATH once the loop is done) would land inside that folder instead
    # of the output root.
    dataset_path = os.path.join('output', 'mult', dataset_id[i])
    inference_path = os.path.join(dataset_path, 'inference')

    # Also creates dataset_path; no error if the folders already exist.
    os.makedirs(inference_path, exist_ok=True)

    # The loader returns three objects used below as clinical markers (c),
    # gene expression (g) and outcome (o). All three are sliced with .iloc
    # using the same row indices, so they are presumably row-aligned pandas
    # DataFrames -- TODO confirm against the loaders.
    c, g, o = func()

    # N_FOLDS-fold CV splits, shuffled and stratified by outcome.
    # The feature matrix passed to split() is a placeholder: only its length
    # and the labels in `o` are used for stratification.
    kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    split = kfold.split(np.zeros(o.shape[0]), o)

    for experiment, (train_index, valid_index) in enumerate(split):

        initial_time = time.time()

        print('*************************************************************************')
        print('Experiment {} of {}'.format(experiment + 1, N_FOLDS))
        print('*************************************************************************\n')

        #######################################################################################################
        # Split train & valid
        #######################################################################################################

        # First column of the outcome frame is the response to predict.
        response_train = o.iloc[train_index, 0]
        response_valid = o.iloc[valid_index, 0]

        clinical_train = c.iloc[train_index, :]
        clinical_valid = c.iloc[valid_index, :]

        genes_train = g.iloc[train_index, :]
        genes_valid = g.iloc[valid_index, :]

        #######################################################################################################
        # MuLT fitting
        #######################################################################################################

        filename = os.path.join(dataset_path, 'trained_model_{}.pkl'.format(experiment))

        if RETRAIN_MODELS:

            mult = MuLT(experiment_number=experiment,
                        number_of_experiments=N_FOLDS,
                        output_path=dataset_path,
                        random_state=RANDOM_STATE)

            mult.fit(clinical=clinical_train, genes=genes_train, outcome=response_train,

                lgb_fixed_parameters = {
                    'objective': 'binary',
                    'metric': 'binary_logloss'
                },

                optimization_n_call=25,
                optimization_n_folds=2,
                optimization_early_stopping_rounds=1,

                clinical_marker_selection_threshold=0.05,
                gene_selection_threshold=0.05,

                dae_decay_rate=1e-8,
                dae_learning_rate=1e-2,
                dae_steps=20000,
                dae_early_stopping_rounds=2000,
                dae_keep_probability=.75,

                minor_class_augmentation=False,

                lgb_early_stopping_rounds=100,

                predictor_n_folds=2)

            # Persist the fitted pipeline so the fold can be re-evaluated
            # later without re-training.
            with open(filename, 'wb') as file:
                pkl.dump(mult, file)
        else:

            # SECURITY: unpickling executes arbitrary code stored in the file.
            # Only load pickles that this notebook produced itself.
            with open(filename, 'rb') as file:
                mult = pkl.load(file)

        #######################################################################################################
        # MuLT inference
        #######################################################################################################

        y_hat_train = mult.predict(clinical=clinical_train, genes=genes_train)
        y_hat_valid = mult.predict(clinical=clinical_valid, genes=genes_valid)

        #################################################################################################
        # Analysing Performance
        #################################################################################################

        # Computing AUC
        train_auc = roc_auc_score(response_train, y_hat_train)
        valid_auc = roc_auc_score(response_valid, y_hat_valid)

        # Computing logLoss
        train_loss = log_loss(response_train, y_hat_train)
        valid_loss = log_loss(response_valid, y_hat_valid)

        # Decision threshold is tuned on the training fold only, then applied
        # unchanged to the validation fold.
        opt_threshold = optimize_threshold(response_train, y_hat_train)

        # Fall back to the mean training response when no threshold is
        # returned (for 0/1 labels this is the positive-class rate).
        if opt_threshold is None:
            opt_threshold = np.mean(response_train)

        # Confusion matrix of the thresholded validation predictions.
        # NOTE: unpacking into four values requires a 2x2 matrix, i.e. both
        # classes must occur among the validation labels/predictions.
        tn, fp, fn, tp = confusion_matrix(response_valid, [int(y >= opt_threshold) for y in y_hat_valid]).ravel()

        classification_results = classification_metrics(tn, fp, fn, tp)

        # Measure the fold's wall-clock time once, so the value stored in the
        # results and the value printed below are the same number.
        elapsed_seconds = time.time() - initial_time

        # add results to data frame (dict for now); metric columns are
        # created the first time they are seen
        for k in classification_results:
            if k not in result:
                result[k] = []
            result[k].append(classification_results[k])

        result['experiment'].append(experiment)
        result['train_auc'].append(train_auc)
        result['valid_auc'].append(valid_auc)
        result['train_loss'].append(train_loss)
        result['valid_loss'].append(valid_loss)
        result['execution_time'].append(elapsed_seconds)
        result['threshold'].append(opt_threshold)
        result['dataset'].append(dataset_id[i])

        print('* Selected genes: {}'.format(len(mult.selected_genes[0])))
        print('* Selected clinical markers: {}\n'.format(len(mult.selected_clinical[0])))

        print('* Train AUC: {}'.format(train_auc))
        print('* Valid AUC: {}\n'.format(valid_auc))

        print('* Execution time: {:10.2f} minutes\n'.format(elapsed_seconds / 60.))

        # Exporting inference: true response next to the predicted score,
        # one CSV per fold and per partition, keeping the sample index.
        response_train = pd.DataFrame(response_train)
        response_train['y_hat'] = y_hat_train
        response_train.to_csv(
            os.path.join(inference_path, 'train_{}.csv'.format(experiment)),
            index=True, sep=',')

        response_valid = pd.DataFrame(response_valid)
        response_valid['y_hat'] = y_hat_valid
        response_valid.to_csv(
            os.path.join(inference_path, 'valid_{}.csv'.format(experiment)),
            index=True, sep=',')

# Turn the accumulated per-fold metrics into a DataFrame and persist them
# as a CSV file located under BASE_PATH.
result = pd.DataFrame(data=result)

metrics_path = os.path.join(BASE_PATH, 'mult_metrics.csv')
result.to_csv(metrics_path, index=False, sep=',')

# Preview of the first rows (rendered as the cell output).
result.head(n=10)

Using TensorFlow backend.


Dataset GSE94873

*************************************************************************
Experiment 1 of 5
*************************************************************************

* Selected genes: 40
* Selected clinical markers: 0

* Train AUC: 0.7905562489293492
* Valid AUC: 0.6777884615384616

* Execution time:       2.15 minutes

*************************************************************************
Experiment 2 of 5
*************************************************************************

* Selected genes: 38
* Selected clinical markers: 0

* Train AUC: 0.7775554902968456
* Valid AUC: 0.7147115384615385

* Execution time:       2.97 minutes

*************************************************************************
Experiment 3 of 5
*************************************************************************

* Selected genes: 50
* Selected clinical markers: 0

* Train AUC: 0.8182316805616499
* Valid AUC: 0.667578125

* Execution time:       2.88 minutes

********************

Unnamed: 0,dataset,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
0,GSE94873,0,0.790556,0.677788,0.606973,0.646554,128.859792,0.492273,0.613793,0.595745,0.430769,0.7625
1,GSE94873,1,0.777555,0.714712,0.618596,0.638587,178.301267,0.434268,0.634483,0.573171,0.723077,0.5625
2,GSE94873,2,0.818232,0.667578,0.590109,0.647076,172.915923,0.48313,0.659722,0.641509,0.53125,0.7625
3,GSE94873,3,0.795874,0.686116,0.590167,0.632941,184.929518,0.467344,0.685315,0.617284,0.78125,0.607595
4,GSE94873,4,0.772193,0.671578,0.620399,0.656226,181.724378,0.477175,0.573427,0.528302,0.4375,0.683544
5,GSE68465,0,0.923858,0.674872,0.474862,0.652251,157.878167,0.42783,0.640449,0.64,0.410256,0.82
6,GSE68465,1,0.966094,0.608718,0.363679,0.715786,166.953747,0.429065,0.617978,0.55102,0.692308,0.56
7,GSE68465,2,0.843693,0.635269,0.531377,0.666102,230.58222,0.419179,0.568182,0.509091,0.717949,0.44898
8,GSE68465,3,0.97601,0.699634,0.390967,0.625995,185.711811,0.392212,0.647727,0.590909,0.666667,0.632653
9,GSE68465,4,0.948815,0.610152,0.449363,0.703397,190.086465,0.381544,0.602273,0.555556,0.512821,0.673469


In [2]:
# Write the metrics table to the top-level MuLT output folder.
result.to_csv('output/mult/mult_metrics.csv', index=False, sep=',')

# Preview of the first rows (rendered as the cell output).
result.head(n=10)

Unnamed: 0,dataset,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
0,GSE94873,0,0.790556,0.677788,0.606973,0.646554,128.859792,0.492273,0.613793,0.595745,0.430769,0.7625
1,GSE94873,1,0.777555,0.714712,0.618596,0.638587,178.301267,0.434268,0.634483,0.573171,0.723077,0.5625
2,GSE94873,2,0.818232,0.667578,0.590109,0.647076,172.915923,0.48313,0.659722,0.641509,0.53125,0.7625
3,GSE94873,3,0.795874,0.686116,0.590167,0.632941,184.929518,0.467344,0.685315,0.617284,0.78125,0.607595
4,GSE94873,4,0.772193,0.671578,0.620399,0.656226,181.724378,0.477175,0.573427,0.528302,0.4375,0.683544
5,GSE68465,0,0.923858,0.674872,0.474862,0.652251,157.878167,0.42783,0.640449,0.64,0.410256,0.82
6,GSE68465,1,0.966094,0.608718,0.363679,0.715786,166.953747,0.429065,0.617978,0.55102,0.692308,0.56
7,GSE68465,2,0.843693,0.635269,0.531377,0.666102,230.58222,0.419179,0.568182,0.509091,0.717949,0.44898
8,GSE68465,3,0.97601,0.699634,0.390967,0.625995,185.711811,0.392212,0.647727,0.590909,0.666667,0.632653
9,GSE68465,4,0.948815,0.610152,0.449363,0.703397,190.086465,0.381544,0.602273,0.555556,0.512821,0.673469


In [3]:
# Average every metric column over the folds of each dataset.
# Wrapping in pd.DataFrame keeps the cell working whether `result` is still
# the raw dict of lists or has already been converted.
per_dataset = pd.DataFrame(data=result).groupby(by='dataset')
per_dataset.mean()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GSE135820,2,0.929174,0.774052,0.318104,0.405399,678.553097,0.216009,0.770934,0.421686,0.586014,0.813826
GSE68465,2,0.931694,0.645729,0.44205,0.672706,186.242482,0.409966,0.615322,0.569315,0.6,0.62702
GSE94873,2,0.790882,0.683554,0.605249,0.644277,169.346176,0.470838,0.633348,0.591202,0.580769,0.675728
