# Import Required Libraries

In [1]:
from sklearn.model_selection import StratifiedKFold
from model_selection import ClusteredStratifiedKFold

from sklearn.preprocessing import MinMaxScaler, RobustScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from scipy.stats import ks_2samp

from model import OptimizedKMeans
from model import GeneticProfiling
from model import GeneticClustering
from model import DenoisingAutoencoder
from model import Dense
from model import ConvDense

from correlation import select_genes, select_genes_mic
from util import to_data_frame
from itertools import compress
from datetime import datetime

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os

# Loading Data

In [37]:
# Clinical covariates (including the response column), one row per patient ID.
clinical = pd.read_csv('data/clinical_dtdp.tsv', sep='\t', index_col='ID')

# Gene expression values, one row per patient ID and one column per gene.
# NOTE(review): the variable is called `genefpkm` but the file is `gene_count.tsv`;
# confirm whether these are raw counts or FPKM values.
genefpkm = pd.read_csv('data/gene_count.tsv', sep='\t', index_col='ID')

# Keep only the patients present in both tables, in the row order of the inner join.
selected_index = clinical.join(genefpkm, how='inner').index

clinical = clinical.loc[selected_index, :]
genefpkm = genefpkm.loc[selected_index, :]

# The therapy class is kept aside and removed from the feature table.
therapy_class = clinical['therapy_first_line_class']
clinical = clinical.drop(columns=['therapy_first_line_class'])

# The previous version built a temporary 'response_best_response_first_line' column
# (an integer cast of the response) and deleted it straight away. The only lasting
# effect was that this column is absent afterwards, which is what is kept here.
clinical = clinical.drop(columns=['response_best_response_first_line'], errors='ignore')

# Drop genes whose expression sums to zero over the selected patients, using a
# single boolean mask instead of deleting the columns one at a time.
genefpkm = genefpkm.loc[:, genefpkm.sum() != 0]

# Transforming Qualitative Variables into Dummy Ones

In [3]:
# Replace every text (object dtype) column of `clinical` by indicator columns.
# The column labels are snapshotted first because `clinical` is rebuilt in the loop.
for column in list(clinical.columns):

    if clinical[column].dtype != 'object':
        continue

    indicators = pd.get_dummies(clinical[column])

    # Indicator columns are named '<column>_<level>', lower-cased, spaces -> underscores.
    indicators.columns = [column + '_' + str(level).lower().replace(' ', '_')
                          for level in indicators.columns]

    # A two-level variable is fully described by one indicator: keep only the first.
    if indicators.shape[1] == 2:
        indicators = indicators.iloc[:, [0]]

    # The indicators are appended after the remaining columns.
    clinical = clinical.drop(columns=[column]).join(indicators, how='inner')

# Remaining missing values are replaced by 0.
clinical = clinical.fillna(0)

clinical.head(8)

Unnamed: 0_level_0,response_days_to_disease_progression,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet,wbc_x10_10_9_l,bun,...,t_6_14_ccnd3_detected,t_8_14_mafa_detected,t_8_14_myc_detected,therapy_first_line_bor,therapy_first_line_bor-cyc-dex,therapy_first_line_bor-dex,therapy_first_line_bor-len-dex,therapy_first_line_len,therapy_first_line_len-dex,first_line_transplant_no
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF1030,1.0,15.4,9.6,0.0,55.692,1,2.5,215.0,4.7,0.0,...,0,0,1,0,0,0,1,0,0,0
MMRF1031,1.0,18.3,10.1,0.0,81.328,1,10.29,385.0,12.4,4.284,...,0,0,0,0,0,0,1,0,0,1
MMRF1032,1.0,20.7,11.1,0.0,70.72,2,1.3,166.0,2.5,5.355,...,0,0,0,0,1,0,0,0,0,1
MMRF1033,0.0,18.5,12.0,0.0,79.56,1,3.99,307.0,7.4,4.998,...,0,0,0,0,0,0,0,0,1,1
MMRF1038,1.0,29.0,22.0,0.0,97.24,3,5.89,310.0,10.5,7.854,...,0,0,0,0,0,0,0,0,1,1
MMRF1048,1.0,0.0,9.6,0.6,60.112,1,2.1,215.0,3.6,4.998,...,0,0,0,0,0,0,1,0,0,0
MMRF1068,0.0,19.0,18.0,0.0,67.184,1,2.5,251.0,4.4,6.069,...,0,0,1,0,0,0,1,0,0,0
MMRF1077,1.0,25.0,10.3,0.03,0.0,2,2.53,198.0,4.9,0.0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Preview: first 8 rows (patients) and first 8 columns (genes) of the expression table.
genefpkm.iloc[:8].iloc[:, :8]

Unnamed: 0_level_0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF1030,83,0,1235,422,58,42,21,1714
MMRF1031,2,0,1127,432,190,48,117,2527
MMRF1032,24,0,748,214,62,33,65,571
MMRF1033,18,0,827,478,46,211,5,962
MMRF1038,187,0,871,355,50,38,2,1008
MMRF1048,81,0,1623,430,146,62,65,1024
MMRF1068,0,0,870,329,50,2,1,635
MMRF1077,0,0,521,444,62,12,50,535


# Removing Bias from Therapy

In [5]:
# Indicator columns describing the first-line therapy.
sele = [c for c in clinical.columns if 'therapy_first_line' in c]

# Therapies whose indicator column sums to less than 5 are considered too rare to keep.
therapy_counts = clinical[sele].sum()
rare_therapies = list(therapy_counts[therapy_counts < 5].index)

# Patients who received one of those rare therapies.
to_delete = []

for a in rare_therapies:
    to_delete.extend(clinical.index[(clinical[a] == 1).values])

# Remove the rare-therapy columns, then the affected patients from both tables.
clinical = clinical.drop(columns=rare_therapies)

clinical = clinical.loc[~clinical.index.isin(to_delete), :]

genefpkm = genefpkm.loc[~genefpkm.index.isin(to_delete), :]

# Splits

In [6]:
# Features used only to build the folds: every clinical column except the first
# (the response) joined with the gene table, min-max scaled over all patients.
initial_scaler = MinMaxScaler()

x = initial_scaler.fit_transform(clinical.iloc[:, 1:].join(genefpkm, how='inner'))

# The response is the first column of `clinical`.
y = clinical.values[:, 0]

kfold = ClusteredStratifiedKFold(5, random_state=22)

# Materialise the folds. If `split` returns a one-shot generator (as scikit-learn
# splitters do), a bare `kfold.split(x, y)` is exhausted after the first pass and
# re-running the training cell would silently iterate over zero folds.
splits = list(kfold.split(x, y))

In [7]:
from sklearn.model_selection import cross_val_score
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

# Search space for the LightGBM hyper-parameter optimisation.
# The order of the dimensions matters: the training loop reads the optimiser's
# result positionally (opt[0] ... opt[6]).
space = [
    Real(1e-6, 1e-1, prior='log-uniform', name='learning_rate'),
    Integer(7, 4095, name='num_leaves'),
    Integer(2, 100, name='max_depth'),
    Integer(1, 4000, name='scale_pos_weight'),
    Real(0.01, 1.5, prior='log-uniform', name='min_child_weight'),
    Real(1 / 100, 400 / 10000, prior='uniform', name='subsample'),
    Real(0.4, 1, prior='uniform', name='colsample_bytree'),
]

def optimize(x, y, space):
    """Search LightGBM hyper-parameters with scikit-optimize's `gp_minimize`.

    Parameters
    ----------
    x : array-like
        Feature matrix handed to `lgb.Dataset`.
    y : 2-D array
        Labels; only the first column (`y[:, 0]`) is used.
    space : list of named skopt dimensions
        The dimension names must match the keyword arguments of the inner
        `objective` (they are bound by name through `use_named_args`).

    Returns
    -------
    The object returned by `gp_minimize`; callers read its `.x` attribute.
    """

    @use_named_args(space)
    def objective(num_leaves, max_depth, scale_pos_weight, min_child_weight, subsample, colsample_bytree, learning_rate):
        # Negative mean AUC of a 5-fold `lgb.cv` run, so that minimising it maximises AUC.

        # NOTE(review): the LightGBM parameter docs list `subsample` as an alias of
        # `bagging_fraction` and `subsample_freq` as an alias of `bagging_freq`; both
        # members of each pair are set below with different values - confirm which
        # one LightGBM actually uses. `n_estimators` may likewise compete with the
        # `num_boost_round` argument given to `lgb.cv`.
        params = {
            'num_leaves': int(num_leaves),
            'max_depth': int(max_depth),
            'scale_pos_weight': scale_pos_weight,
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'learning_rate': learning_rate,

            'objective':'binary',
            'metric':'auc',
            'eval_metric':'auc',
            'bagging_freq':1,
            'bagging_fraction': 0.99,
            'min_split_gain':0.1,
            'min_child_samples': 1,
            'subsample_freq':3,
            'subsample_for_bin':5,
            'n_estimators':100000,
            'is_unbalance':False,
            'nthread':24,
            'verbose': -1}

        # A fresh Dataset per evaluation, as before. The unused `scores` list and
        # the unused StratifiedKFold(5, random_state=23) were removed: `lgb.cv`
        # creates its own folds, and recent scikit-learn raises ValueError when
        # `random_state` is given without `shuffle=True`.
        dataset = lgb.Dataset(x, y[:,0])

        aucs = lgb.cv(params, dataset, nfold=5, num_boost_round=1000, early_stopping_rounds=100, verbose_eval=False)

        # The last entry of the recorded history is used as the score.
        return -aucs['auc-mean'][-1]

    # NOTE(review): with only 10 calls the search may consist solely of the
    # optimiser's initial random points - check skopt's default before relying on it.
    return gp_minimize(objective, space, n_calls=10, random_state=22, verbose=False, n_jobs=-1)

In [49]:
# NOTE(review): `ClusteredSelection` is not imported in any cell visible in this
# notebook, so this cell fails on a fresh kernel - confirm which module provides it.
# `selected_genes` is also reassigned inside the training loop, so the value computed
# here only feeds the count displayed below.
gene_selector = ClusteredSelection(n_splits=30)

scaler = MinMaxScaler()

# The table is transposed before scaling, so each row handed to the selector is a gene.
genes_scaled = scaler.fit_transform(genefpkm.T.values)

selected_genes = genefpkm.columns[gene_selector.select(genes_scaled)]

len(selected_genes)

908

# Training Process

In [None]:
from collections import Counter
from scipy.special import erfinv
from sklearn.preprocessing import StandardScaler
from optimization import bayesOpt

result = None


def _scale_frame(frame, scaler, fit=False):
    """Return `frame` scaled by `scaler`, as a DataFrame carrying the same labels.

    When `fit` is true the scaler is fitted on `frame` first; otherwise the already
    fitted scaler is applied. Columns holding an infinite or missing value after
    scaling are dropped.
    """
    scaled = scaler.fit_transform(frame.values) if fit else scaler.transform(frame.values)
    scaled = pd.DataFrame(scaled, columns=frame.columns, index=frame.index)
    return scaled.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how='any')


def _load_or_fit(path, make_model, frame):
    """Unpickle the model cached at `path`, or build it, fit it on `frame` and cache it."""
    if os.path.isfile(path):
        # Unpickling executes code: only load caches written by this notebook.
        with open(path, 'rb') as file:
            return pickle.load(file)

    model = make_model()
    model.fit(frame)

    with open(path, 'wb') as file:
        pickle.dump(model, file)

    return model


def _relative_error(frame, reconstruction):
    """Element-wise |frame - reconstruction| / frame, with 'ENS...' columns renamed 'ERR...'."""
    # NOTE(review): `frame` is min-max scaled, so it can contain zeros and this
    # ratio can be inf/NaN - confirm the downstream model copes with such values.
    error = pd.DataFrame(np.abs(frame - reconstruction) / frame, index=frame.index)
    error.columns = ['ERR' + ens.replace('ENS', '') for ens in frame.columns]
    return error


# Every per-fold artefact below is cached in this directory.
# NOTE(review): the caches are keyed by fold number only; if the data or the splits
# change, stale files must be deleted by hand.
os.makedirs('output/dtdp', exist_ok=True)

for i, (train_index, valid_index) in enumerate(splits):

    print('Fold #{}'.format(i + 1))

    #
    # Split train & valid (scalers are fitted on the training fold only)
    #
    response_train = clinical.iloc[train_index, [0]].values
    response_valid = clinical.iloc[valid_index, [0]].values

    clinical_scaler = MinMaxScaler()
    clinical_train = _scale_frame(clinical.iloc[train_index, 1:], clinical_scaler, fit=True)
    clinical_valid = _scale_frame(clinical.iloc[valid_index, 1:], clinical_scaler)

    genefpkm_scaler = MinMaxScaler()
    genefpkm_train = _scale_frame(genefpkm.iloc[train_index, :], genefpkm_scaler, fit=True)
    genefpkm_valid = _scale_frame(genefpkm.iloc[valid_index, :], genefpkm_scaler)

    #
    # Select gene expressions
    #
    print('Selecting gene expressions')

    genes_cache = 'output/dtdp/selected_genes_fold_{}.pkl'.format(i)
    feats_cache = 'output/dtdp/selected_feats_fold_{}.pkl'.format(i)

    # Both cache files are required; if either is missing the selection is recomputed.
    if os.path.isfile(genes_cache) and os.path.isfile(feats_cache):

        with open(genes_cache, 'rb') as file:
            selected_genes = sorted(list(pickle.load(file)))

        with open(feats_cache, 'rb') as file:
            selected_feats = sorted(list(pickle.load(file)))

    else:

        selected_genes = sorted(list(select_genes(genefpkm_train, response_train[:,0], threshold=0.05)))

        selected_feats = sorted(list(select_genes(clinical_train, response_train[:,0], threshold=0.05)))

        with open(genes_cache, 'wb') as file:
            pickle.dump(selected_genes, file)

        with open(feats_cache, 'wb') as file:
            pickle.dump(selected_feats, file)

    print('Selecting {} gene expressions'.format(len(selected_genes)))

    genefpkm_train = genefpkm_train.loc[:,selected_genes]
    clinical_train = clinical_train.loc[:,selected_feats]

    genefpkm_valid = genefpkm_valid.loc[:,selected_genes]
    clinical_valid = clinical_valid.loc[:,selected_feats]

    #
    # Genetic Profiling ('PV' features appended to the clinical table)
    #
    print('Computing genetic profling')

    genetic_profiling = _load_or_fit(
        'output/dtdp/kmeans_genetic_profiling_fold_{}.pkl'.format(i),
        lambda: GeneticProfiling(random_state=10),
        genefpkm_train)

    profiling_train = to_data_frame(genetic_profiling.transform(genefpkm_train), prefix='PV', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, profiling_train], axis=1)

    profiling_valid = to_data_frame(genetic_profiling.transform(genefpkm_valid), prefix='PV', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, profiling_valid], axis=1)

    #
    # Gene Clustering ('GC' features appended to the clinical table)
    #
    print('Computing genetic clustering')

    genetic_clustering = _load_or_fit(
        'output/dtdp/kmeans_genetic_clustering_fold_{}.pkl'.format(i),
        lambda: GeneticClustering(random_state=10, verbose=0, early_stopping_rounds=10),
        genefpkm_train)

    gene_cluster_train = to_data_frame(genetic_clustering.transform(genefpkm_train), prefix='GC', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, gene_cluster_train], axis=1)

    gene_cluster_valid = to_data_frame(genetic_clustering.transform(genefpkm_valid), prefix='GC', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, gene_cluster_valid], axis=1)

    #
    # Denoising Autoencoder (its reconstruction error becomes the 'ERR' features)
    #
    print('Denoising autoencoder')

    dae_name = 'data_augmentation_adam_fold_{}'.format(i)
    dae_checkpoint = 'output/dtdp/deep_models/{0}/graph/{0}'.format(dae_name)

    dae = DenoisingAutoencoder(model_name=dae_name, summaries_dir='output/dtdp/deep_models/', verbose=1)

    # Train only when no saved graph exists for this fold; the model is then loaded from disk.
    if not os.path.exists(dae_checkpoint + '.meta'):

        n_genes = genefpkm_train.shape[1]

        dae.build(n_inputs=n_genes,
                  encoder_units=(int(n_genes * .7), int(n_genes * .6), int(n_genes * .5)),
                  decoder_units=(int(n_genes * .6), int(n_genes * .7)),
                  encoder_activation_function='relu', decoder_activation_function='identity', l2_scale=0.01)

        dae.fit(genefpkm_train.values, steps=2500, optimizer='adam', loss='mse', learning_rate=1e-2)

    dae.load(dae_checkpoint)

    error_train = _relative_error(genefpkm_train, dae.predict(genefpkm_train.values))
    error_valid = _relative_error(genefpkm_valid, dae.predict(genefpkm_valid.values))

    dae.close()

    del dae

    #
    # Join all features
    #
    # The validation matrix must carry the same feature blocks as the training
    # matrix. It previously omitted `error_valid`, so the model was trained on
    # more columns than it was asked to predict on.
    x_train = clinical_train.join(genefpkm_train, how='inner').join(error_train, how='inner').values
    x_valid = clinical_valid.join(genefpkm_valid, how='inner').join(error_valid, how='inner').values

    assert x_train.shape[1] == x_valid.shape[1], 'train/valid feature count mismatch'

    # (A disabled Dense / ConvDense stacking experiment used to sit here as a
    # triple-quoted string; it was never executed and has been removed.)

    #
    # Hyper-parameter optimisation (cached per fold)
    #
    print('Optimizing')

    file_name = 'output/dtdp/optimization_lgbm_fold_{}.pkl'.format(i)

    if os.path.exists(file_name):
        with open(file_name, 'rb') as file:
            opt = pickle.load(file)
    else:
        opt = optimize(x_train, response_train, space).x

        with open(file_name, 'wb') as file:
            pickle.dump(opt, file)

    # `opt` is positional and follows the order of the dimensions in `space`.
    params = {
        'learning_rate': opt[0],
        'num_leaves': opt[1],
        'max_depth': opt[2],
        'scale_pos_weight': opt[3],
        'min_child_weight': opt[4],
        'subsample': opt[5],
        'colsample_bytree': opt[6],

        'objective':'binary',
        'metric':'auc',
        'eval_metric':'auc',
        'bagging_freq':1,
        'bagging_fraction': 0.99,
        'min_split_gain':0.1,
        'min_child_samples': 1,
        'subsample_freq':3,
        'subsample_for_bin':5,
        'n_estimators':100000,
        'is_unbalance':False,
        'nthread':24,
        'verbose': -1}

    #
    # Train on the training fold, score on the validation fold
    #
    print('Training')

    lgb_train = lgb.Dataset(x_train, list(response_train.reshape((-1,))))

    # NOTE(review): `n_estimators` in `params` may take precedence over
    # `num_boost_round` here - confirm how many rounds are actually trained.
    gbm = lgb.train(params, lgb_train, num_boost_round=1000)

    # `verbose_eval` is not a prediction option and is no longer passed.
    y_ = gbm.predict(x_valid, num_iteration=gbm.best_iteration)

    auc = roc_auc_score(response_valid, y_)

    print(i + 1, auc)

    print('')


In [None]:
selected_feats