In [1]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

import sys
sys.path.insert(0, '../../')

import logging
# Silence everything below ERROR on the *root* logger. Calling getLogger()
# with no name always returns the root logger; getLogger("root") only does so
# on Python >= 3.9 (on older interpreters it returns an unrelated logger that
# merely happens to be named "root", so the level change has no effect).
logging.getLogger().setLevel(logging.ERROR)

from data import load_data_gse135820 as gse135820
from data import load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873 
from data import load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

from pipeline import MuLT

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix


from constants import N_FOLDS, RANDOM_STATE
from util import join_values

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os

# Per-fold metrics accumulator: one list per column, with one value appended
# per (dataset, fold). Columns produced by `classification_metrics` are not
# known up front and are created lazily the first time they are seen.
result = {c: [] for c in ['dataset', 'experiment', 'train_auc', 'valid_auc', 
                          'train_loss', 'valid_loss', 'execution_time', 'threshold']}

# Dataset labels, in the same order as the loader functions iterated below.
dataset_id = ['GSE135820', 'GSE136400', 'GSE94873', 'GSE96058', 'GSE68465']

for i, func in enumerate([gse135820, gse136400, gse94873, gse96058, gse68465]):
    
    print('=============================================================================')
    print('Dataset {}'.format(dataset_id[i]))
    print('=============================================================================\n')
    
    # Trained models are written to BASE_PATH, per-fold predictions to
    # BASE_PATH/inference.
    BASE_PATH = os.path.join('output/mult/', dataset_id[i])
    path = os.path.join(BASE_PATH, 'inference')
    
    # exist_ok=True creates the directory tree only when missing, without the
    # separate exists() check (and the race between check and creation).
    os.makedirs(path, exist_ok=True)
    
    # The loader returns three row-aligned tables: they are sliced below with
    # the same positional indices as clinical (c), genes (g) and outcome (o).
    c, g, o = func()

    # N_FOLDS-fold cross-validation, stratified on the outcome table `o`.
    # The feature argument is a placeholder: only its length is needed here.
    kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    split = kfold.split(np.zeros(o.shape[0]), o)

    for experiment, (train_index, valid_index) in enumerate(split):
        
        # Start of the wall-clock measurement for this fold.
        initial_time = time.time()
        
        print('*************************************************************************')
        print('Experiment {} of {}'.format(experiment + 1, N_FOLDS))
        print('*************************************************************************\n')

        # ------------------------------------------------------------------
        # Split train & valid
        # ------------------------------------------------------------------

        # The first column of `o` is used as the response.
        response_train = o.iloc[train_index, 0]
        response_valid = o.iloc[valid_index, 0]

        clinical_train = c.iloc[train_index, :]
        clinical_valid = c.iloc[valid_index, :]

        genes_train = g.iloc[train_index, :]
        genes_valid = g.iloc[valid_index, :]

        # ------------------------------------------------------------------
        # MuLT fitting
        # ------------------------------------------------------------------

        mult = MuLT(experiment_number=experiment, 
                    number_of_experiments=N_FOLDS, 
                    output_path=BASE_PATH, 
                    random_state=RANDOM_STATE)

        mult.fit(clinical=clinical_train, genes=genes_train, outcome=response_train, 

            lgb_fixed_parameters = {
                'metric': 'binary_logloss',
                'n_estimators': 100,
                'objective': 'binary',
                'is_unbalance': False, 
                'extra_trees': True,
                'max_depth': 4,
                'learning_rate': 0.1,
                'min_split_gain': 0.0001,
                'min_child_weight': 0.0001},

            optimization_n_call=50,
            optimization_n_folds=2,
            optimization_early_stopping_rounds=1,

            clinical_marker_selection_threshold=0.05,
            gene_selection_threshold=0.05,

            dae_decay_rate=1.0,
            dae_learning_rate=1e-3,
            dae_steps=100000,
            dae_early_stopping_rounds=1000,

            lgb_early_stopping_rounds=1,

            predictor_n_folds=3)

        # Persist the fitted pipeline for this fold next to its other outputs.
        with open('{}/trained_model_{}.pkl'.format(BASE_PATH, experiment), 'wb') as file:
            pkl.dump(mult, file)

        # ------------------------------------------------------------------
        # MuLT inference
        # ------------------------------------------------------------------

        y_hat_train = mult.predict(clinical=clinical_train, genes=genes_train)
        y_hat_valid = mult.predict(clinical=clinical_valid, genes=genes_valid)

        # ------------------------------------------------------------------
        # Analysing performance
        # ------------------------------------------------------------------

        # Threshold-free metrics on the raw scores.
        train_auc = roc_auc_score(response_train, y_hat_train)
        valid_auc = roc_auc_score(response_valid, y_hat_valid)

        train_loss = log_loss(response_train, y_hat_train)
        valid_loss = log_loss(response_valid, y_hat_valid)

        # The decision threshold is tuned on the training fold only; when the
        # optimiser returns nothing, fall back to the mean training response.
        opt_threshold = optimize_threshold(response_train, y_hat_train)

        if opt_threshold is None:
            opt_threshold = np.mean(response_train)

        # Binarise the validation scores as 0/1 and build the confusion matrix.
        # `labels=[0, 1]` pins the matrix to 2x2 in (tn, fp, fn, tp) order, so
        # the four-way unpacking never depends on which labels are inferred.
        y_pred_valid = (np.asarray(y_hat_valid).ravel() >= opt_threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(response_valid, y_pred_valid, labels=[0, 1]).ravel()

        classification_results = classification_metrics(tn, fp, fn, tp)

        # Measure once so the stored and the printed durations agree.
        elapsed_seconds = time.time() - initial_time

        # Append this fold's row to the accumulator.
        for k in classification_results:
            result.setdefault(k, []).append(classification_results[k])

        result['experiment'].append(experiment)
        result['train_auc'].append(train_auc)
        result['valid_auc'].append(valid_auc)
        result['train_loss'].append(train_loss)
        result['valid_loss'].append(valid_loss)
        result['execution_time'].append(elapsed_seconds)
        result['threshold'].append(opt_threshold)
        result['dataset'].append(dataset_id[i])
        
        print('* Selected genes: {}'.format(len(mult.selected_genes[0])))
        print('* Selected clinical markers: {}\n'.format(len(mult.selected_clinical[0])))
        
        print('* Train AUC: {}'.format(train_auc))
        print('* Valid AUC: {}\n'.format(valid_auc))
        
        print('* Execution time: {:10.2f} minutes\n'.format(elapsed_seconds / 60.))
        
        # Export the response and predicted score side by side for both
        # partitions. New names are used so that `response_train` and
        # `response_valid` are not rebound from Series to DataFrame.
        inference_train = pd.DataFrame(response_train)
        inference_train['y_hat'] = y_hat_train
        inference_train.to_csv('{}/inference/train_{}.csv'.format(BASE_PATH, experiment), index=True, sep=',')

        inference_valid = pd.DataFrame(response_valid)
        inference_valid['y_hat'] = y_hat_valid
        inference_valid.to_csv('{}/inference/valid_{}.csv'.format(BASE_PATH, experiment), index=True, sep=',')

Using TensorFlow backend.


Dataset GSE135820

*************************************************************************
Experiment 1 of 10
*************************************************************************

early stopping after 1000 iterations without improvements with 2504 steps: best metric value 3031622656.0
* Selected genes: 300
* Selected clinical markers: 4

* Train AUC: 0.8068694419745224
* Valid AUC: 0.7950827040632865

* Execution time:      11.13 minutes

*************************************************************************
Experiment 2 of 10
*************************************************************************

early stopping after 1000 iterations without improvements with 1096 steps: best metric value 3081777920.0
* Selected genes: 300
* Selected clinical markers: 4

* Train AUC: 0.8018407989764993
* Valid AUC: 0.7646979503775619

* Execution time:      13.59 minutes

*************************************************************************
Experiment 3 of 10
**************************

In [2]:
# Average every fold-level column collected in `result` within each dataset.
pd.DataFrame.from_dict(result).groupby(by='dataset').mean()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GSE135820,4.5,0.805648,0.760898,0.388405,0.415363,1006.443836,0.182155,0.721451,0.364899,0.638928,0.740539
GSE136400,4.5,0.7496,0.676643,0.605276,0.680115,360.190974,0.440669,0.632237,0.608024,0.665028,0.601635
GSE68465,4.5,0.823251,0.611409,0.520134,0.717316,166.466648,0.453866,0.549813,0.492826,0.578684,0.527333
GSE94873,4.5,0.752822,0.680451,0.599636,0.674056,158.307547,0.527201,0.624988,0.580091,0.600379,0.645449
GSE96058,4.5,0.902204,0.875196,0.402186,0.43925,1084.638058,0.546441,0.805236,0.783108,0.845167,0.765284


In [3]:
# Write the full per-fold metrics table (one row per dataset/fold) to disk,
# comma-separated and without the positional index column.
pd.DataFrame.from_dict(result).to_csv('output/mult/metrics.csv', sep=',', index=False)