In [None]:
import sys
sys.path.insert(0, '../../')

from data import load_data_gse135820 as gse135820, load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873, load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

from pipeline import MuLT
# NOTE(review): SMLA is instantiated below but was never imported; it is
# assumed to be exported by `pipeline` alongside MuLT — confirm.
from pipeline import SMLA

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix


from constants import N_FOLDS, RANDOM_STATE
from util import join_values

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os

# Benchmark each single-algorithm predictor ('mlp', 'svm', 'lightgbm') with
# stratified cross-validation on every dataset listed in `dataset_id`.
# One row of metrics is collected in `result` per (dataset, fold, predictor).

# Column name -> list of values, turned into a DataFrame once all runs are
# done. Further columns are appended on the fly for whatever keys
# classification_metrics returns.
result = {c: [] for c in ['dataset', 'experiment', 'predictor', 'train_auc', 'valid_auc',
                          'train_loss', 'valid_loss', 'execution_time', 'threshold']}

dataset_id = ['GSE136400', 'GSE94873', 'GSE135820', 'GSE96058', 'GSE68465']

# Fixed (non-optimised) LightGBM parameters. A fresh copy is handed to every
# run so that one run cannot leak in-place modifications into the next.
LIGHTGBM_FIXED_PARAMS = {
    'metric': 'binary_logloss',
    'n_estimators': 100,
    'objective': 'binary',
    'is_unbalance': False,
    'extra_trees': True,
    'max_depth': 4,
    'learning_rate': 0.1,
    'min_split_gain': 0.0001,
    'min_child_weight': 0.0001}

for dataset_name, func in zip(dataset_id, [gse136400, gse94873, gse135820, gse96058, gse68465]):

    BASE_PATH = os.path.join('output/smla/', dataset_name)
    path = os.path.join(BASE_PATH, 'inference')

    # exist_ok avoids the check-then-create race and makes re-runs harmless
    os.makedirs(path, exist_ok=True)

    # c: clinical markers, g: gene expression, o: clinical outcome
    # (roles as used in the train/valid split below)
    c, g, o = func()

    # Creating N_FOLDS-fold CV splits stratified by outcome
    kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    split = kfold.split(np.zeros(o.shape[0]), o)

    for experiment, (train_index, valid_index) in enumerate(split):

        #######################################################################################################
        # Split train & valid
        #######################################################################################################

        clinical_outcome_train = o.iloc[train_index, 0]
        clinical_outcome_valid = o.iloc[valid_index, 0]

        clinical_markers_train = c.iloc[train_index, :]
        clinical_markers_valid = c.iloc[valid_index, :]

        genes_train = g.iloc[train_index, :]
        genes_valid = g.iloc[valid_index, :]

        # NOTE(review): the loaders used here return no treatment table, yet
        # fit/predict take a treatments argument. The names were previously
        # undefined (NameError); None is passed as a placeholder — confirm
        # that SMLA accepts it.
        treatments_train = None
        treatments_valid = None

        # create an independent TS predictor for each ML algorithm
        for predictor in ['mlp', 'svm', 'lightgbm']:

            # execution_time is measured per predictor run
            initial_time = time.time()

            is_lightgbm = predictor == 'lightgbm'
            model_default_params = dict(LIGHTGBM_FIXED_PARAMS) if is_lightgbm else None

            optimizer_default_params = {
                'n_folds': 2,
                'n_calls': 50,
                'fixed_parameters': model_default_params,
                'random_state': RANDOM_STATE,
                'verbose': -1
            }

            # Must be set after the dict is (re)built: writing it beforehand
            # raised NameError on the first run and was discarded afterwards.
            if is_lightgbm:
                optimizer_default_params['early_stopping_rounds'] = 1

            snma = SMLA(
                    predictor=predictor,
                    optimizer_default_params=optimizer_default_params,
                    model_default_params=model_default_params,
                    random_state=RANDOM_STATE,
                    use_gpu=True,
                    test_size=.2,
                    verbose=-1)

            # fit model based on the SMLA pipeline
            snma.fit(clinical_markers_train, genes_train, treatments_train, clinical_outcome_train,
                     clinical_marker_selection_threshold=0.05, genes_marker_selection_threshold=0.0005)

            # predictions on the training fold, kept only for comparison
            y_hat_train = snma.predict(clinical_markers_train, genes_train, treatments_train)

            # predictions on the validation fold, used for the main results
            y_hat_valid = snma.predict(clinical_markers_valid, genes_valid, treatments_valid)

            #################################################################################################
            # Analysing Performance
            #################################################################################################

            # Computing AUC
            train_auc = roc_auc_score(clinical_outcome_train, y_hat_train)
            valid_auc = roc_auc_score(clinical_outcome_valid, y_hat_valid)

            # Computing logLoss
            train_loss = log_loss(clinical_outcome_train, y_hat_train)
            valid_loss = log_loss(clinical_outcome_valid, y_hat_valid)

            # Decision threshold is tuned on the training fold only; fall back
            # to the training mean outcome when no threshold is returned.
            opt_threshold = optimize_threshold(clinical_outcome_train, y_hat_train)

            if opt_threshold is None:
                opt_threshold = np.mean(clinical_outcome_train)

            # Binarise the validation scores. labels=[0, 1] pins the matrix to
            # 2x2 so the four-way unpack cannot fail when a fold (or its
            # predictions) contains a single class.
            y_pred_valid = (np.ravel(y_hat_valid) >= opt_threshold).astype(int)
            tn, fp, fn, tp = confusion_matrix(
                clinical_outcome_valid, y_pred_valid, labels=[0, 1]).ravel()

            classification_results = classification_metrics(tn, fp, fn, tp)

            # add results to data frame (dict for now)
            for k in classification_results:
                result.setdefault(k, []).append(classification_results[k])

            result['dataset'].append(dataset_name)
            result['experiment'].append(experiment)
            result['predictor'].append(predictor)
            result['train_auc'].append(train_auc)
            result['valid_auc'].append(valid_auc)
            result['train_loss'].append(train_loss)
            result['valid_loss'].append(valid_loss)
            result['execution_time'].append(time.time() - initial_time)
            result['threshold'].append(opt_threshold)

            log_message = 'Experiment #{}: Train AUC: {} Valid AUC: {}'.format(
                experiment, train_auc, valid_auc)

            print(log_message)

        print('')




Using TensorFlow backend.


0




In [None]:
# Average every numeric metric per dataset. numeric_only=True keeps any
# string column (e.g. a predictor name) out of the mean, which recent
# pandas versions would otherwise reject with a TypeError.
pd.DataFrame(result).groupby('dataset').mean(numeric_only=True)

In [None]:
# Persist the collected metrics next to the other outputs of this notebook.
# The path previously pointed at 'output/mult/', a directory this notebook
# never creates and that would hold the MuLT experiment's own metrics file.
# NOTE(review): confirm no downstream step reads SMLA metrics from
# 'output/mult/metrics.csv'.
metrics_dir = os.path.join('output', 'smla')
os.makedirs(metrics_dir, exist_ok=True)
pd.DataFrame(result).to_csv(os.path.join(metrics_dir, 'metrics.csv'), index=False, sep=',')