# Loading Clinical Data

In [1]:
from data import load_data

clinical, _, genes, treatments, outcome = load_data()

clinical.head()

Unnamed: 0_level_0,cmmc,ecog_ps,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet,wbc_x10_10_9_l,...,m_protein,first_line_transplant,cell_markers_cd117,cell_markers_cd13,cell_markers_cd138,cell_markers_cd38,race_asian,race_black_african_american,race_other,race_white
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF1021,,1.0,0.0,4.9,0.0,88.4,1.0,2.4,216.0,5.2,...,3.05,1,0,1,0,0,0,0,0,1
MMRF1024,,1.0,11.0,6.0,0.0,123.76,2.0,2.3,188.0,4.3,...,2.6,0,1,0,0,0,0,0,0,1
MMRF1029,,1.0,0.0,8.4,0.0,106.08,1.0,2.6,219.0,4.0,...,1.8,0,1,0,0,0,0,0,0,1
MMRF1030,,1.0,15.4,9.6,0.0,55.692,1.0,2.5,215.0,4.7,...,3.55,1,1,0,0,0,0,0,0,1
MMRF1031,,,18.3,10.1,0.0,81.328,1.0,10.29,385.0,12.4,...,1.52,0,1,0,0,0,0,0,0,1


In [4]:
from evaluation import optimize_threshold, classification_metrics
from constants import N_FOLDS, RANDOM_STATE

from correlation import select_genes

from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix
from sklearn.model_selection import StratifiedKFold

from optimization import LightGBMOptimizer

from lightgbm import LGBMModel

import pandas as pd
import pickle
import time
import os

result = {c: [] for c in ['experiment', 'marker', 'train_auc', 'valid_auc', 
                          'train_loss', 'valid_loss', 'execution_time', 'threshold']}

selected_genes_mem = {}

for t in clinical.columns:
    
    print('*********************************************************************************')
    print(t)
    print('*********************************************************************************\n')
    
    y = outcome.dropna()
    
    x = clinical[[t]].dropna().join(treatments, how='inner')
    
    x = x.join(genes).join(y)[x.columns]
    y = x.join(y)[y.columns[0]]
    
    opt_kf = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    
    if y.sum() < 10 or (1 - y).sum() < 10:
        print('Ignoring {} marker once it has less than five treatment senstive patients associted.')
    
    for experiment, (opt_train, opt_valid) in enumerate(opt_kf.split(x, y)):
        
        initial_time = time.time()
        
        #################################################################################################
        # Train and Valid Split
        #################################################################################################
        
        genes_train = genes.iloc[opt_train, :]
        genes_valid = genes.iloc[opt_valid, :]
        
        x_train, y_train = x.iloc[opt_train, :], y.iloc[opt_train]
        x_valid, y_valid = x.iloc[opt_valid, :], y.iloc[opt_valid]
        
        #################################################################################################
        # Gene Expression Selection
        #################################################################################################        
        
        if experiment not in selected_genes_mem:
            selected_genes_mem[experiment] = select_genes(genes_train, y_train, threshold=0.002)
            
        selected_genes = selected_genes_mem[experiment][0]
        
        genes_train = genes_train[selected_genes]
        genes_valid = genes_valid[selected_genes]
                                  
        x_train = x_train.join(genes_train)
        x_valid = x_valid.join(genes_valid)
        
        #################################################################################################
        # Hyper parameters optimization
        #################################################################################################        
        
        training_default_params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'n_estimators': 100,
            'objective': 'binary',
            'is_unbalance': False, 
            'extra_trees': True,
            'max_depth': 4,
            'learning_rate': 0.1,
            'min_split_gain': 0.0001,
            'min_child_weight': 0.0001,
            'random_state': RANDOM_STATE}
        
        optimizer_params = {
            'n_folds': 2, 
            'n_calls': 50, 
            'shuffle': True, 
            'early_stopping_rounds': 1,
            'fixed_parameters': training_default_params, 
            'random_state': RANDOM_STATE, 
            'use_gpu': False}
        
        optimizer = LightGBMOptimizer(**optimizer_params)
        
        params = optimizer.optimize(x_train, y_train)

        params = {**params, **training_default_params}
        
        skf = StratifiedKFold(3, shuffle=True, random_state=RANDOM_STATE)

        models = []
        
        for train_index, valid_index in skf.split(x_train, y_train):

            xx_train, yy_train = x_train.values[train_index, :], y_train.values[train_index]
            xx_valid, yy_valid = x_train.values[valid_index, :], y_train.values[valid_index]

            gbm = LGBMModel(**params)

            gbm.fit(xx_train, 
                    yy_train,
                    eval_set=[(xx_valid, yy_valid)],
                    early_stopping_rounds=1,
                    verbose=False)
            
            models.append(gbm)
        
        #################################################################################################
        # Predicting
        #################################################################################################        
        
        y_hat_train, y_hat_valid = None, None
        
        for model in models:
            
            y_ = model.predict(x_train)
            y_hat_train = y_ if y_hat_train is None else y_ + y_hat_train
            
            y_ = model.predict(x_valid)
            y_hat_valid = y_ if y_hat_valid is None else y_ + y_hat_valid
        
        y_hat_train /= len(models)
        y_hat_valid /= len(models)
        
        #################################################################################################
        # Analysing Performance
        #################################################################################################   
        
        # Computing AUC
        train_auc = roc_auc_score(y_train, y_hat_train)
        valid_auc = roc_auc_score(y_valid, y_hat_valid)
        
        # Computing logLoss
        train_loss = log_loss(y_train, y_hat_train)
        valid_loss = log_loss(y_valid, y_hat_valid)
        
        # Compute optimized threshold
        opt_threshold = optimize_threshold(y_train, y_hat_train)

        if opt_threshold is None:
            opt_threshold = np.mean(y_train)
        
        # compute confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= opt_threshold) for y in y_]).ravel()

        classification_results = classification_metrics(tn, fp, fn, tp)
        
        # add results to data frame (dict for now)
        for k in classification_results:
            if k not in result:
                result[k] = []
            result[k].append(classification_results[k])
        
        result['experiment'].append(experiment)
        result['marker'].append(t)
        result['train_auc'].append(train_auc)
        result['valid_auc'].append(valid_auc)
        result['train_loss'].append(train_loss)
        result['valid_loss'].append(valid_loss)
        result['execution_time'].append(time.time() - initial_time)
        result['threshold'].append(opt_threshold)
        
        log_message = 'Experiment #{}: '.format(experiment) + 'Train AUC: {}'.format(train_auc) + ' '
        log_message += 'Valid AUC: {}'.format(valid_auc)
        
        print(log_message)
    
    print('')
   
result = pd.DataFrame(result)

result.to_csv('output/gene/metrics_clinical_plus_genes.csv', sep=',', index=False)

result.head()

*********************************************************************************
cmmc
*********************************************************************************

21
Experiment #0: Train AUC: 0.6797619047619048 Valid AUC: 0.5120689655172413
86
Experiment #1: Train AUC: 0.6576610090676588 Valid AUC: 0.5607142857142857
11
Experiment #2: Train AUC: 0.6262729597767961 Valid AUC: 0.6250000000000001
10
Experiment #3: Train AUC: 0.6417809811671704 Valid AUC: 0.48571428571428577
15
Experiment #4: Train AUC: 0.6215763775866077 Valid AUC: 0.842857142857143
11
Experiment #5: Train AUC: 0.6985936207372001 Valid AUC: 0.46031746031746024
54
Experiment #6: Train AUC: 0.6328706682599504 Valid AUC: 0.7579365079365079
53
Experiment #7: Train AUC: 0.6926188068756319 Valid AUC: 0.6746031746031746
25
Experiment #8: Train AUC: 0.6568388638661642 Valid AUC: 0.3928571428571428
9
Experiment #9: Train AUC: 0.6183472745656771 Valid AUC: 0.5793650793650794

*************************************************

Unnamed: 0,experiment,marker,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
0,0,cmmc,0.679762,0.512069,0.552265,0.563707,71.859493,0.251536,0.74359,0.5,0.3,0.896552
1,1,cmmc,0.657661,0.560714,0.53574,0.565656,92.890441,0.244738,0.421053,0.25,0.6,0.357143
2,2,cmmc,0.626273,0.625,0.543947,0.563552,76.188855,0.231946,0.736842,0.5,0.2,0.928571
3,3,cmmc,0.641781,0.485714,0.536871,0.588261,74.27151,0.235926,0.342105,0.258065,0.8,0.178571
4,4,cmmc,0.621576,0.842857,0.547982,0.520705,74.615788,0.25737,0.842105,0.75,0.6,0.928571


In [5]:
result.groupby('marker').mean()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
marker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
absolute_neutrophil,4.5,0.593784,0.527529,0.537796,0.550709,20.933341,0.230981,0.511247,0.222569,0.517279,0.507407
age,4.5,0.622075,0.520597,0.540114,0.547508,20.079324,0.227009,0.463969,0.316036,0.65098,0.407045
albumin,4.5,0.61578,0.546028,0.536311,0.541153,20.131784,0.22917,0.496284,0.199543,0.522794,0.489125
beta_2_microglobulin,4.5,0.634959,0.547124,0.5484,0.558359,20.511469,0.250055,0.625941,0.299628,0.24,0.753575
bun,4.5,0.626557,0.58818,0.550245,0.557743,21.58091,0.24479,0.530357,0.328059,0.464286,0.552381
calcium,4.5,0.600729,0.534988,0.537713,0.54563,20.524534,0.233176,0.522777,0.315967,0.494853,0.530606
cell_markers_cd117,4.5,0.636916,0.531116,0.539991,0.54758,20.118619,0.231492,0.480936,0.221855,0.520915,0.468994
cell_markers_cd13,4.5,0.624337,0.52409,0.539636,0.547727,21.348232,0.228077,0.438237,0.229106,0.671242,0.366039
cell_markers_cd138,4.5,0.62334,0.517348,0.540528,0.548738,19.778927,0.229331,0.464569,0.320412,0.635948,0.411558
cell_markers_cd38,4.5,0.6439,0.553737,0.53918,0.547954,22.014152,0.231432,0.526169,0.295936,0.535294,0.524773


In [6]:
result.groupby('marker').std()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
marker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
absolute_neutrophil,3.02765,0.027905,0.101802,0.004322,0.014611,0.729976,0.011375,0.161938,0.094364,0.397996,0.32006
age,3.02765,0.036311,0.077285,0.003681,0.011495,0.569441,0.018508,0.161021,0.246535,0.376348,0.322543
albumin,3.02765,0.041645,0.060984,0.002941,0.006423,0.583315,0.006978,0.174989,0.088676,0.393219,0.342082
beta_2_microglobulin,3.02765,0.050005,0.060574,0.004628,0.009303,0.413893,0.012604,0.158782,0.287526,0.317708,0.311414
bun,3.02765,0.014776,0.074725,0.005835,0.007005,1.562134,0.007802,0.183667,0.267955,0.388686,0.365907
calcium,3.02765,0.048502,0.058147,0.004563,0.00976,0.540268,0.008535,0.162034,0.243389,0.369343,0.319194
cell_markers_cd117,3.02765,0.033913,0.075258,0.003521,0.010029,1.058616,0.014238,0.110162,0.061686,0.290751,0.21733
cell_markers_cd13,3.02765,0.045676,0.084318,0.005064,0.011702,2.151804,0.017251,0.121831,0.066321,0.338036,0.253889
cell_markers_cd138,3.02765,0.046148,0.071308,0.004487,0.009295,0.822859,0.01341,0.180881,0.246396,0.416197,0.362597
cell_markers_cd38,3.02765,0.049203,0.093804,0.004227,0.0103,1.738226,0.018393,0.153982,0.265519,0.392598,0.318819


In [7]:
result.groupby('marker').mean().mean()

experiment         4.500000
train_auc          0.623704
valid_auc          0.552851
train_loss         0.541628
valid_loss         0.549519
execution_time    22.553977
threshold          0.237821
accuracy           0.526745
precision          0.305458
sensitivity        0.539986
specificity        0.523206
dtype: float64