In [1]:
from data import load_data

# loading clinical, gene expressions, treatment, 
# and clincial outcome data, ignoring fish markers 
# those are not used in experiments performed below
clinical_markers, _, genes, treatments, clinical_outcome = load_data()

In [11]:
from pipeline import SMLA

from util import join_values

from constants import RANDOM_STATE, N_FOLDS

from sklearn.model_selection import StratifiedKFold

from evaluation import optimize_threshold, classification_metrics
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix

import time

# creating analyser object to compute and group 
# classification matrics grouped by training and validation
# dataset and by experiment id
# analyser = Analyser()

# create a stratification flag by combining treatment and clinical outcome
# in order to avoid bias in the models generated by umbalanced treatments
# or clinical outcome
stratification_values = join_values([treatments, clinical_outcome])

# split data in 10-fold stratified by 
# treatment and treatment sensitivity outcome
kfold = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

#
#
result = {c: [] for c in ['experiment', 'predictor', 'train_auc', 'valid_auc', 
                          'train_loss', 'valid_loss', 'execution_time', 'threshold']}

for experiment, (train_index, valid_index) in enumerate(kfold.split(_, stratification_values)):
    
    # selecting clinical markers
    # alpha defined as the probability of a marker be randonly choose
    # selected_clinical_markers = select_markers(
    #    clinical_markers.iloc[train_index], 
    #    alpha=1./clinical_markers.shape[1], beta=0.75)
    
    # selecting gene expressions
    # alpha defined as the probability of a marker be randonly choose
    # selected_gene_expressions = select_markers(
    #    gene_expressions.iloc[train_index], 
    #    alpha=1./gene_expressions.shape[1], beta=0.75)
    
    # joining markers
    # x = clinical_markers[selected_clinical_markers].join(
    #    gene_expressions[selected_gene_expressions])
       
    #######################################################################################################
    # Split train & valid
    #######################################################################################################
    
    clinical_outcome_train = clinical_outcome.iloc[train_index, 0]
    clinical_outcome_valid = clinical_outcome.iloc[valid_index, 0]

    clinical_markers_train = clinical_markers.iloc[train_index, :]
    clinical_markers_valid = clinical_markers.iloc[valid_index, :]

    treatments_train = treatments.iloc[train_index, :]
    treatments_valid = treatments.iloc[valid_index, :]
    
    genes_train = genes.iloc[train_index, :]
    genes_valid = genes.iloc[valid_index, :]
    
    # create an independent TS predictor for each ML algorithm
    for predictor in ['mlp', 'svm', 'lightgbm']:
        
        initial_time = time.time()
        
        if predictor == 'lightgbm':
            
            model_default_params = {
                'metric': 'binary_logloss',
                'n_estimators': 100,
                'objective': 'binary',
                'is_unbalance': False, 
                'extra_trees': True,
                'max_depth': 4,
                'learning_rate': 0.1,
                'min_split_gain': 0.0001,
                'min_child_weight': 0.0001}
            
            optimizer_default_params['early_stopping_rounds'] = 1
            
        else:
            model_default_params = None
            
        optimizer_default_params = {
            'n_folds': 2, 
            'n_calls': 50,
            'fixed_parameters': model_default_params, 
            'random_state': RANDOM_STATE,
            'verbose': -1
        }
        
        snma = SMLA(
                predictor=predictor,
                optimizer_default_params=optimizer_default_params,
                model_default_params=model_default_params,
                random_state=RANDOM_STATE,
                use_gpu=True,
                test_size=.2,
                verbose=-1)
        
        # fit model based on SMNA pipeline
        snma.fit(clinical_markers_train, genes_train, treatments_train, clinical_outcome_train,
                clinical_marker_selection_threshold=0.05, genes_marker_selection_threshold=0.0005)
        
        # predict for trained dataset, 
        # just to compare results
        y_hat_train = snma.predict(clinical_markers_train, genes_train, treatments_train)
        
        # predict for valid dataset, 
        # used to compute main results
        y_hat_valid = snma.predict(clinical_markers_valid, genes_valid, treatments_valid)
        
        # compute classification metrics for training dataset
        # each experiment is named "exp_#_train"
        # analyser.compute_classification_metrics(
        #    y_train, y_hat_train, experiment_id=experiment,  experiment_group='train')
        
        # compute classification metrics for validation dataset
        # each experiment is named "exp_#_valid"
        # analyser.compute_classification_metrics(
        #    y_valid, y_hat_valid, experiment_id=experiment, experiment_group='valid')
        
        #################################################################################################
        # Analysing Performance
        #################################################################################################   
        
        # Computing AUC
        train_auc = roc_auc_score(clinical_outcome_train, y_hat_train)
        valid_auc = roc_auc_score(clinical_outcome_valid, y_hat_valid)
        
        # Computing logLoss
        train_loss = log_loss(clinical_outcome_train, y_hat_train)
        valid_loss = log_loss(clinical_outcome_valid, y_hat_valid)
        
        # Compute optimized threshold
        opt_threshold = optimize_threshold(clinical_outcome_train, y_hat_train)

        if opt_threshold is None:
            opt_threshold = np.mean(clinical_outcome_train)
        
        # compute confusion matrix
        tn, fp, fn, tp = confusion_matrix(clinical_outcome_valid, [int(y >= opt_threshold) for y in y_hat_valid]).ravel()

        classification_results = classification_metrics(tn, fp, fn, tp)
        
        # add results to data frame (dict for now)
        for k in classification_results:
            if k not in result:
                result[k] = []
            result[k].append(classification_results[k])
        
        result['experiment'].append(experiment)
        result['predictor'].append(predictor)
        result['train_auc'].append(train_auc)
        result['valid_auc'].append(valid_auc)
        result['train_loss'].append(train_loss)
        result['valid_loss'].append(valid_loss)
        result['execution_time'].append(time.time() - initial_time)
        result['threshold'].append(opt_threshold)
        
        log_message = 'Experiment #{}: '.format(experiment) + 'Train AUC: {}'.format(train_auc) + ' '
        log_message += 'Valid AUC: {}'.format(valid_auc)
        
        print(log_message)
    
    print('')

6
16
Experiment #0: Train AUC: 0.7751803751803751 Valid AUC: 0.6227180527383367
6
16
Experiment #0: Train AUC: 0.6152695789059426 Valid AUC: 0.4766734279918864
6
16
Experiment #0: Train AUC: 0.8499278499278501 Valid AUC: 0.5578093306288032

6
9
Experiment #1: Train AUC: 0.7647495755517827 Valid AUC: 0.6989843028624192
6
9
Experiment #1: Train AUC: 0.6683865662139219 Valid AUC: 0.6722068328716528
6
9
Experiment #1: Train AUC: 0.8183361629881154 Valid AUC: 0.7211449676823638

5
14
Experiment #2: Train AUC: 0.77513529286927 Valid AUC: 0.58264081255771
5
14
Experiment #2: Train AUC: 0.7419885398981325 Valid AUC: 0.6195752539242844
5
14
Experiment #2: Train AUC: 0.8820564516129031 Valid AUC: 0.6075715604801478

7
6
Experiment #3: Train AUC: 0.7333543746136952 Valid AUC: 0.6567460317460317
7
6
Experiment #3: Train AUC: 0.5309635591325732 Valid AUC: 0.621031746031746
7
6
Experiment #3: Train AUC: 0.7760550229481464 Valid AUC: 0.5952380952380952

7
18
Experiment #4: Train AUC: 0.40655432186261

In [14]:
import pandas as pd

result = pd.DataFrame(result)

result.to_csv('output/smla/metrics.csv', sep=',', index=True)

result.head()

Unnamed: 0,experiment,predictor,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
0,0,mlp,0.77518,0.622718,0.467782,0.534754,151.593535,0.201045,0.6,0.290323,0.529412,0.62069
1,0,svm,0.61527,0.476673,0.572516,0.573564,882.657005,0.349508,0.506667,0.236842,0.529412,0.5
2,0,lightgbm,0.849928,0.557809,0.410791,0.562996,79.287451,0.200566,0.533333,0.21875,0.411765,0.568966
3,1,mlp,0.76475,0.698984,0.461688,0.629941,148.227856,0.270025,0.736842,0.48,0.631579,0.77193
4,1,svm,0.668387,0.672207,0.570251,0.581209,1548.510465,0.352554,0.592105,0.363636,0.842105,0.508772


In [15]:
result.groupby('predictor').mean()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
lightgbm,4.5,0.837346,0.601559,0.424133,0.55717,79.594029,0.242873,0.618168,0.314768,0.514628,0.649977
mlp,4.5,0.739441,0.615441,0.479872,0.582913,152.639376,0.271372,0.608977,0.331092,0.564993,0.621992
svm,4.5,0.632798,0.555702,0.568667,0.573545,900.073843,0.324864,0.515232,0.265539,0.576526,0.492072


In [16]:
result.groupby('predictor').std()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
lightgbm,3.02765,0.037659,0.055779,0.034424,0.03548,4.195382,0.029468,0.059616,0.064156,0.098912,0.070057
mlp,3.02765,0.119502,0.049586,0.067388,0.044931,28.907271,0.081054,0.104166,0.075931,0.124791,0.158728
svm,3.02765,0.073372,0.082405,0.014092,0.010869,666.112513,0.075224,0.126962,0.053821,0.240184,0.226675
