# Import Required Libraries

In [1]:
from sklearn.model_selection import StratifiedKFold
from model_selection import ClusteredStratifiedKFold

from sklearn.preprocessing import MinMaxScaler, RobustScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from scipy.stats import ks_2samp

from model import OptimizedKMeans
from model import GeneticProfiling
from model import GeneticClustering
from model import DenoisingAutoencoder
from model import Dense
from model import ConvDense

from correlation import select_genes, select_genes_mic
from util import to_data_frame
from itertools import compress
from datetime import datetime

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os

# Loading Data

In [7]:
clinical = pd.read_csv('data/clinical_brfl.tsv', sep='\t', index_col='ID')

genefpkm = pd.read_csv('data/gene_count.tsv', sep='\t', index_col='ID')

selected_index = clinical.join(genefpkm, how='inner').index

clinical = clinical.loc[selected_index,:]

clinical['response_best_response_first_line'] = clinical['response_best_response_first_line'].astype(int)

therapy_class = clinical['therapy_first_line_class']
del clinical['therapy_first_line_class']

therapies = clinical['therapy_first_line']
del clinical['therapy_first_line']

genefpkm = genefpkm.loc[selected_index,:]

for g in genefpkm.loc[:, genefpkm.sum() == 0].columns:
    del genefpkm[g]

# Hyperparameters Optimization

In [27]:
from sklearn.model_selection import cross_val_score
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

space  = [
    Real(1e-6, 1e-1, 'log-uniform', name='learning_rate'),
    Integer(7, 4095, name='num_leaves'),
    Integer(2, 100, name='max_depth'),
    Integer(1, 4000, name='scale_pos_weight'),
    Real(0.01, 1.5, 'log-uniform', name='min_child_weight'),
    Real(0.4, 1, 'uniform', name='colsample_bytree'),
    Real(0.001, 100, 'log-uniform', name='min_split_gain'),
    Integer(1, 50, name='min_child_samples')]

def optimize(x, y, space, n_calls=50):
    
    @use_named_args(space)
    def objective(num_leaves, max_depth, scale_pos_weight, min_child_weight, colsample_bytree, learning_rate,
                  min_split_gain, min_child_samples):

        scores = []
        
        kf = StratifiedKFold(3, shuffle=True, random_state=23) 

        params = {
            'learning_rate': learning_rate,
            'num_leaves': int(num_leaves),
            'max_depth': int(max_depth),
            'scale_pos_weight': int(scale_pos_weight),
            'min_child_weight': min_child_weight,
            'colsample_bytree': colsample_bytree,
            'min_split_gain': min_split_gain,
            'min_child_samples': int(min_child_samples),
  
            'n_estimators': 10000,
            'subsample_for_bin': 2,
            'objective':'binary',
            'metric':'auc',
            'eval_metric':'auc',
            'is_unbalance':False,
            'nthread':24,          
            'verbose': -1}
        
        for train_index, valid_index in kf.split(x, y):
            
            lgb_train = lgb.Dataset(x[train_index,:], y[train_index, 0])
            lgb_valid = lgb.Dataset(x[valid_index,:], y[valid_index, 0])
            
            gbm = lgb.train(params, lgb_train, num_boost_round=1000,  
                            valid_sets=lgb_valid, early_stopping_rounds=100, verbose_eval=False) 

            y_hat = gbm.predict(x[valid_index,:], num_iteration=gbm.best_iteration, verbose_eval=False)

            auc = roc_auc_score(y[valid_index,0], y_hat)
            
            scores.append(auc)

        return -(np.mean(scores))
    
    return gp_minimize(objective, space, n_calls=n_calls, random_state=22, verbose=False, n_jobs=-1)

# Training Process

In [37]:
from collections import Counter
from scipy.special import erfinv
from sklearn.preprocessing import StandardScaler
from optimization import bayesOpt
import time

result, detailed_result = {'column': [], 'auc': []}, {}

y = clinical.values[:, [0]].astype(int)

for column in clinical.iloc[:, 1:]:
    
    print(column)
    
    detailed_result[column] = {'y': [], 'y_hat': []}
    
    values = clinical[column]
    
    current_column = None
    
    if values.dtype == 'object':
        
        values = pd.get_dummies(values)
        
        n_values = values.shape[1]
        
        values.columns = [column + '_' + str(c).lower().replace(' ', '_') for c in values.columns]
        
        current_column = values  
    else:
        current_column = pd.DataFrame(values)
        
    x = current_column.join(pd.get_dummies(therapies), how='inner').values

    kfold = StratifiedKFold(3, random_state=28)

    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        #
        # Split train & valid
        #
        x_train, y_train = x[train_index,:], y[train_index, :]
        x_valid, y_valid = x[valid_index,:], y[valid_index, :]

        #
        #
        #
        opt = optimize(x_train, y_train, space, n_calls=10).x

        params = {
            'learning_rate': opt[0],
            'num_leaves': opt[1],
            'max_depth': opt[2],
            'scale_pos_weight': opt[3],
            'min_child_weight': opt[4],
            'colsample_bytree': opt[5],
            'min_split_gain': opt[6],
            'min_child_samples': opt[7],

            'n_estimators': 10000,
            'subsample_for_bin': 2,
            'objective':'binary',
            'metric':'auc',
            'eval_metric':'auc',
            'is_unbalance':False,
            'nthread':24,          
            'verbose': -1}

        lgb_train = lgb.Dataset(x_train, list(y_train.reshape((-1,))))
        lgb_valid = lgb.Dataset(x_valid, list(y_valid.reshape((-1,))))

        gbm = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=1000, 
                        early_stopping_rounds=100, verbose_eval=False)

        y_check = gbm.predict(x_train, num_iteration=gbm.best_iteration, verbose_eval=False)
        y_ = gbm.predict(x_valid, num_iteration=gbm.best_iteration, verbose_eval=False)

        detailed_result[column]['y'] += list(y_valid.reshape((-1)))
        
        detailed_result[column]['y_hat'] += list(y_)

ecog_ps
cell_markers
percent_aneuploid
percent_plama_cells_bone_marrow
percent_plama_cells_peripherical_blood
creatinine
iss
absolute_neutrophil
platelet
wbc_x10_10_9_l
bun
glucose
total_protein
albumin
beta_2_microglobulin
calcium
hemoglobin
ldh
age
family_cancer
gender
race
hyperdiploid_flag
13q14
13q34
17p13
1q21
11p15
15q15
19q13
20q13
21q22
3q21
5q31
7q22
9q33
t_11_14_ccnd1
t_12_14_ccnd2
t_14_16_maf
t_14_20_mafb
t_4_14_whsc1
t_6_14_ccnd3
t_8_14_mafa
t_8_14_myc
lga
lgg
lgl_lambda
lgm
m_protein
first_line_transplant


In [38]:
for c in detailed_result.keys():
    df = pd.DataFrame(detailed_result[c])
    print(c, roc_auc_score(df['y'], df['y_hat']))

ecog_ps 0.5
cell_markers 0.5
percent_aneuploid 0.5
percent_plama_cells_bone_marrow 0.5
percent_plama_cells_peripherical_blood 0.5
creatinine 0.5
iss 0.5
absolute_neutrophil 0.5
platelet 0.5
wbc_x10_10_9_l 0.5
bun 0.5
glucose 0.5
total_protein 0.5
albumin 0.5
beta_2_microglobulin 0.5
calcium 0.5
hemoglobin 0.5
ldh 0.5
age 0.5
family_cancer 0.5
gender 0.5
race 0.5
hyperdiploid_flag 0.5
13q14 0.5
13q34 0.5
17p13 0.5
1q21 0.5
11p15 0.5
15q15 0.5
19q13 0.5
20q13 0.5
21q22 0.5
3q21 0.5
5q31 0.5
7q22 0.5
9q33 0.5
t_11_14_ccnd1 0.5
t_12_14_ccnd2 0.5
t_14_16_maf 0.5
t_14_20_mafb 0.5
t_4_14_whsc1 0.5
t_6_14_ccnd3 0.5
t_8_14_mafa 0.5
t_8_14_myc 0.5
lga 0.5
lgg 0.5
lgl_lambda 0.5
lgm 0.5
m_protein 0.5
first_line_transplant 0.5


In [None]:
from collections import Counter
from scipy.special import erfinv
from sklearn.preprocessing import StandardScaler
from optimization import bayesOpt
import time

result, detailed_result = {'column': [], 'auc': []}, {}

y = clinical.values[:, [0]].astype(int)

for column in clinical.iloc[:, 1:]:
    
    print(column)
    
    detailed_result[column] = {'y': [], 'y_hat': []}
    
    values = clinical[column]
    
    current_column = None
    
    if values.dtype == 'object':
        
        values = pd.get_dummies(values)
        
        n_values = values.shape[1]
        
        values.columns = [column + '_' + str(c).lower().replace(' ', '_') for c in values.columns]
        
        current_column = values  
    else:
        current_column = pd.DataFrame(values)
        
    x = current_column.join(pd.get_dummies(therapies), how='inner')

    kfold = StratifiedKFold(3, random_state=28)

    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        genefpkm_scaler_robust = RobustScaler()
        genefpkm_scaler_minmax = MinMaxScaler()
    
        genefpkm_train = genefpkm.iloc[train_index, :]
        genefpkm_train = pd.DataFrame(np.maximum(0, np.minimum(1, 
                        genefpkm_scaler_minmax.fit_transform(
                            genefpkm_scaler_robust.fit_transform(genefpkm_train.values)))), 
                                columns=genefpkm_train.columns, index=genefpkm_train.index).replace(
                                    [np.inf, -np.inf], np.nan).fillna(0)
    
        genefpkm_valid = genefpkm.iloc[valid_index, :]
        genefpkm_valid = pd.DataFrame(np.maximum(0, np.minimum(1, 
                        genefpkm_scaler_minmax.transform(
                            genefpkm_scaler_robust.transform(genefpkm_valid.values)))), 
                                  columns=genefpkm_valid.columns, index=genefpkm_valid.index).replace(
                                    [np.inf, -np.inf], np.nan).fillna(0)
        
        selected_genes = sorted(list(select_genes(genefpkm_train, response_train[:,0], threshold=0.02)))
        
        genefpkm_train = genefpkm_train.loc[:,selected_genes]
        genefpkm_valid = genefpkm_valid.loc[:,selected_genes]
        #
        # Split train & valid
        #
        x_train, y_train = x.iloc[train_index,:].join(genefpkm_train, how='inner'), y[train_index, :]
        x_valid, y_valid = x.iloc[valid_index,:].join(genefpkm_valid, how='inner'), y[valid_index, :]

        #
        #
        #
        opt = optimize(x_train, y_train, space, n_calls=10).x

        params = {
            'learning_rate': opt[0],
            'num_leaves': opt[1],
            'max_depth': opt[2],
            'scale_pos_weight': opt[3],
            'min_child_weight': opt[4],
            'colsample_bytree': opt[5],
            'min_split_gain': opt[6],
            'min_child_samples': opt[7],

            'n_estimators': 10000,
            'subsample_for_bin': 2,
            'objective':'binary',
            'metric':'auc',
            'eval_metric':'auc',
            'is_unbalance':False,
            'nthread':24,          
            'verbose': -1}

        lgb_train = lgb.Dataset(x_train, list(y_train.reshape((-1,))))
        lgb_valid = lgb.Dataset(x_valid, list(y_valid.reshape((-1,))))

        gbm = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=1000, 
                        early_stopping_rounds=100, verbose_eval=False)

        y_check = gbm.predict(x_train, num_iteration=gbm.best_iteration, verbose_eval=False)
        y_ = gbm.predict(x_valid, num_iteration=gbm.best_iteration, verbose_eval=False)

        detailed_result[column]['y'] += list(y_valid.reshape((-1)))
        
        detailed_result[column]['y_hat'] += list(np.maximum(0, np.minimum(1, (y_ - min(y_check)) / max(y_check) - min(y_check))))