In [None]:
import sys
sys.path.insert(0, '../../')

from data import load_data_gse135820 as gse135820, load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873, load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

from pipeline import MuLT

from sklearn.model_selection import StratifiedKFold
from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix


from constants import N_FOLDS, RANDOM_STATE
from util import join_values

import lightgbm as lgb
import pickle as pkl
import pandas as pd
import numpy as np
import time
import os

# Column-oriented accumulator for the per-fold metrics: each key maps to its
# own list and one value is appended per (dataset, fold) pair. Further
# classification-metric columns are added lazily by the training loop, and
# the whole dict is converted to a pandas DataFrame at the end.
result = {
    column: []
    for column in (
        'dataset', 'experiment', 'train_auc', 'valid_auc',
        'train_loss', 'valid_loss', 'execution_time', 'threshold',
    )
}

# Dataset identifiers, listed in the same order as the loader functions
# that the training loop iterates over.
dataset_id = ['GSE136400', 'GSE94873', 'GSE135820', 'GSE96058', 'GSE68465']

# Cross-validate the MuLT pipeline on every dataset. For each fold the model
# is fitted on the training split, pickled, scored on both splits, and its
# metrics are appended to `result`; per-sample predictions are written to CSV.
for dataset_name, func in zip(
        dataset_id, [gse136400, gse94873, gse135820, gse96058, gse68465]):

    BASE_PATH = os.path.join('output/mult/', dataset_name)
    path = os.path.join(BASE_PATH, 'inference')

    # exist_ok=True replaces the racy "check, then create" pattern and makes
    # re-running the cell a no-op for directories that already exist.
    os.makedirs(path, exist_ok=True)

    # The loader returns three row-aligned tables which are passed to MuLT
    # below as `clinical` (c), `genes` (g) and `outcome` (o).
    c, g, o = func()

    # N_FOLDS-fold CV splits, stratified by the outcome only. The feature
    # matrix is irrelevant for stratification, hence the zero placeholder.
    kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    split = kfold.split(np.zeros(o.shape[0]), o)

    for experiment, (train_index, valid_index) in enumerate(split):

        initial_time = time.time()

        print('{}\n\n'.format(experiment))

        #######################################################################################################
        # Split train & valid
        #######################################################################################################

        # Only the first outcome column is used as the response.
        response_train = o.iloc[train_index, 0]
        response_valid = o.iloc[valid_index, 0]

        clinical_train = c.iloc[train_index, :]
        clinical_valid = c.iloc[valid_index, :]

        genes_train = g.iloc[train_index, :]
        genes_valid = g.iloc[valid_index, :]

        #######################################################################################################
        # MuLT fitting
        #######################################################################################################

        mult = MuLT(experiment_number=experiment,
                    number_of_experiments=N_FOLDS,
                    output_path=BASE_PATH,
                    random_state=RANDOM_STATE)

        # NOTE: the parameter dict is intentionally rebuilt for every fold so
        # that any in-place modification by `fit` cannot leak across folds.
        mult.fit(clinical=clinical_train, genes=genes_train, outcome=response_train,

            lgb_fixed_parameters={
                'metric': 'binary_logloss',
                'n_estimators': 100,
                'objective': 'binary',
                'is_unbalance': False,
                'extra_trees': True,
                'max_depth': 4,
                'learning_rate': 0.1,
                'min_split_gain': 0.0001,
                'min_child_weight': 0.0001},

            optimization_n_call=50,
            optimization_n_folds=2,
            optimization_early_stopping_rounds=1,

            clinical_marker_selection_threshold=0.05,
            gene_selection_threshold=0.05,

            dae_decay_rate=1.0,
            dae_learning_rate=1e-3,
            dae_steps=100000,
            dae_early_stopping_rounds=1000,

            lgb_early_stopping_rounds=1,

            predictor_n_folds=3)

        # Persist the fitted pipeline of this fold next to its other outputs.
        with open('{}/trained_model_{}.pkl'.format(BASE_PATH, experiment), 'wb') as file:
            pkl.dump(mult, file)

        #######################################################################################################
        # MuLT inference
        #######################################################################################################

        y_hat_train = mult.predict(clinical=clinical_train, genes=genes_train)
        y_hat_valid = mult.predict(clinical=clinical_valid, genes=genes_valid)

        #################################################################################################
        # Analysing Performance
        #################################################################################################

        # Computing AUC
        train_auc = roc_auc_score(response_train, y_hat_train)
        valid_auc = roc_auc_score(response_valid, y_hat_valid)

        # Computing logLoss
        train_loss = log_loss(response_train, y_hat_train)
        valid_loss = log_loss(response_valid, y_hat_valid)

        # The decision threshold is tuned on the training split only; when the
        # optimiser yields nothing, fall back to the mean of the training
        # response.
        opt_threshold = optimize_threshold(response_train, y_hat_train)

        if opt_threshold is None:
            opt_threshold = np.mean(response_train)

        # Binarise the validation scores in one vectorised step. The label
        # order is pinned so that ravel() always yields (tn, fp, fn, tp).
        # NOTE(review): assumes the response is encoded as 0/1 - confirm.
        predicted_valid = (np.asarray(y_hat_valid) >= opt_threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(
            response_valid, predicted_valid, labels=[0, 1]).ravel()

        classification_results = classification_metrics(tn, fp, fn, tp)

        # Metric columns not declared up front are created on first use.
        for k in classification_results:
            if k not in result:
                result[k] = []
            result[k].append(classification_results[k])

        result['experiment'].append(experiment)
        result['train_auc'].append(train_auc)
        result['valid_auc'].append(valid_auc)
        result['train_loss'].append(train_loss)
        result['valid_loss'].append(valid_loss)
        result['execution_time'].append(time.time() - initial_time)
        result['threshold'].append(opt_threshold)
        result['dataset'].append(dataset_name)

        print('Experiment {} with {} genes and {} clinical markers'.format(
              experiment, len(mult.selected_genes[0]), len(mult.selected_clinical[0])))

        print('Train: {}'.format(train_auc))

        print('Valid: {}'.format(valid_auc))

        print("\n========================================================================================\n")

        # Exporting inference: true response plus predicted score, one row
        # per sample, keeping the original index as the first CSV column.
        response_train = pd.DataFrame(response_train)
        response_train['y_hat'] = y_hat_train
        response_train.to_csv('{}/inference/train_{}.csv'.format(BASE_PATH, experiment), index=True, sep=',')

        response_valid = pd.DataFrame(response_valid)
        response_valid['y_hat'] = y_hat_valid
        response_valid.to_csv('{}/inference/valid_{}.csv'.format(BASE_PATH, experiment), index=True, sep=',')




Using TensorFlow backend.


0







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.






early stopping after 1000 iterations without improvements with 2153 steps: best metric value 10639569920.0

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\Venezian\git\multiple-myeloma\experiments\exp-geo\output\mult\GSE136400\dae\data_augmentation_adadelta_000\graph\data_augmentation_adadelta_000
INFO:tensorflow:Restoring parameters from C:\Users\Venezian\git\multiple-myelo

In [None]:
# Per-dataset mean of every recorded metric, averaged over the CV folds.
pd.DataFrame(data=result).groupby(by='dataset').mean()

In [None]:
# Persist the raw per-fold metrics of all datasets as a single CSV file.
pd.DataFrame(data=result).to_csv(path_or_buf='output/mult/metrics.csv', sep=',', index=False)