# Import Required Libraries

In [1]:
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from scipy.stats import ks_2samp

from model import OptimizedKMeans
from model import GeneticProfiling
from model import GeneticClustering
from model import DenoisingAutoencoder

from correlation import select_genes
from util import to_data_frame
from itertools import compress
from datetime import datetime

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os

# Loading Data

In [2]:
# Both tables are tab-separated and indexed by their 'ID' column.
clinical = pd.read_csv('data/clinical_brfl.tsv', sep='\t', index_col='ID')
genefpkm = pd.read_csv('data/gene_fpkm.tsv', sep='\t', index_col='ID')

# Row labels that survive an inner join of the two tables, i.e. IDs present in both.
selected_index = clinical.join(genefpkm, how='inner').index

# Restrict both tables to the shared IDs, in the same label order.
genefpkm = genefpkm.loc[selected_index, :]
clinical = clinical.loc[selected_index, :]

# Store the response column as integers.
clinical = clinical.astype({'response_best_response_first_line': int})

# Defining General Classification Params

In [3]:
# Shared LightGBM-style hyper-parameters (key order is kept as originally declared).
params = dict(
    # Task definition
    boosting_type='gbdt',
    objective='binary',
    num_class=1,
    # NOTE(review): confirm 'logloss' is an accepted metric name for the API that
    # consumes this dict (LightGBM's native name is 'binary_logloss').
    metric='logloss',
    # Tree growth
    learning_rate=0.01,
    num_leaves=31,
    max_depth=4,
    min_child_samples=20,
    max_bin=255,
    # Row / column sampling
    # NOTE(review): per the LightGBM docs bagging is disabled while subsample_freq
    # is 0, so subsample=0.8 has no effect as written -- confirm this is intended.
    subsample=0.8,
    subsample_freq=0,
    colsample_bytree=0.3,
    min_child_weight=5,
    subsample_for_bin=200000,
    # Regularisation
    min_split_gain=0,
    reg_alpha=0,
    reg_lambda=0,
    # Runtime
    nthread=6,
    verbose=0,
)

# Transforming Qualitative Variables into Dummy Ones

In [4]:
# Replace every qualitative (object-dtype) clinical column by one indicator column
# per category. Non-object columns keep their position; the indicator columns are
# appended at the end, in the order the object columns appear.

# Snapshot the columns to encode up front so the frame is not modified while it is
# being iterated over.
object_columns = [column for column in clinical.columns if clinical[column].dtype == 'object']

for column in object_columns:

    # Cast to uint8 explicitly: pandas >= 2.0 returns bool indicators, which would
    # turn `clinical.values` into an object array; older versions already
    # produced uint8, so this keeps the historical result.
    dummies = pd.get_dummies(clinical[column]).astype('uint8')

    # Name each indicator "<column>_<category>", lower-cased, spaces -> underscores.
    dummies.columns = [column + '_' + str(c).lower().replace(' ', '_') for c in dummies.columns]

    # drop() returns a new frame instead of deleting the column in place.
    clinical = clinical.drop(column, axis=1).join(dummies, how='inner')

# Remaining missing values are replaced by 0.
clinical = clinical.fillna(0)

clinical.iloc[:8,:8]

Unnamed: 0_level_0,response_best_response_first_line,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF1029,0,0.0,8.4,0.0,106.08,1,2.6,219.0
MMRF1030,1,15.4,9.6,0.0,55.692,1,2.5,215.0
MMRF1031,0,18.3,10.1,0.0,81.328,1,10.29,385.0
MMRF1032,0,20.7,11.1,0.0,70.72,2,1.3,166.0
MMRF1033,0,18.5,12.0,0.0,79.56,1,3.99,307.0
MMRF1037,0,20.7,17.0,0.0,70.72,1,3.2,361.0
MMRF1038,0,29.0,22.0,0.0,97.24,3,5.89,310.0
MMRF1048,0,0.0,9.6,0.6,60.112,1,2.1,215.0


# Training Process

In [None]:
from collections import Counter

# `shuffle` keeps its default (False), so the folds are deterministic without a seed.
# A random_state combined with shuffle=False never had any effect and is rejected
# with a ValueError by recent scikit-learn releases, hence it is not passed.
kfold = StratifiedKFold(n_splits=10)

result = None

# First column is the response, the remaining columns are the clinical features.
x, y = clinical.values[:, 1:], clinical.values[:, 0]


def _load_or_compute(path, compute):
    """Return the object pickled at `path`; if the file does not exist, call
    `compute()`, pickle its result to `path` and return that result."""
    if os.path.isfile(path):
        # NOTE(review): pickle.load can execute arbitrary code -- only point this at
        # cache files written by this notebook.
        with open(path, 'rb') as file:
            return pickle.load(file)

    computed = compute()

    with open(path, 'wb') as file:
        pickle.dump(computed, file)

    return computed


def _fit(estimator, data):
    """Fit `estimator` on `data` and return the estimator itself."""
    estimator.fit(data)
    return estimator


# The per-fold caches are written under output/; create it so a fresh run can write them.
os.makedirs('output', exist_ok=True)

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    print('Fold #{}'.format(i + 1))

    #
    # Split train & valid
    #
    response_train = clinical.iloc[train_index, 0]
    response_valid = clinical.iloc[valid_index, 0]

    clinical_train = clinical.iloc[train_index, 1:]
    clinical_valid = clinical.iloc[valid_index, 1:]

    genefpkm_train = genefpkm.iloc[train_index, :]
    genefpkm_valid = genefpkm.iloc[valid_index, :]

    #
    # Select gene expressions (computed on the training split only, cached per fold)
    #
    print('Selecting gene expressions')

    selected_genes = _load_or_compute(
        'output/selected_genes_fold_{}.pkl'.format(i),
        lambda: select_genes(genefpkm_train, response_train))

    genefpkm_train = genefpkm_train[selected_genes]
    genefpkm_valid = genefpkm_valid[selected_genes]

    #
    # Genetic Profiling (fitted on the training split, cached per fold)
    #
    print('Computing genetic profling')

    genetic_profiling = _load_or_compute(
        'output/kmeans_genetic_profiling_fold_{}.pkl'.format(i),
        lambda: _fit(GeneticProfiling(random_state=10), genefpkm_train))

    profiling_train = to_data_frame(genetic_profiling.transform(genefpkm_train), prefix='PV', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, profiling_train], axis=1)

    profiling_valid = to_data_frame(genetic_profiling.transform(genefpkm_valid), prefix='PV', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, profiling_valid], axis=1)

    #
    # Genetic Clustering (fitted on the training split, cached per fold)
    #
    print('Computing genetic clustering')

    genetic_clustering = _load_or_compute(
        'output/kmeans_genetic_clustering_fold_{}.pkl'.format(i),
        lambda: _fit(GeneticClustering(random_state=10, verbose=0, early_stopping_rounds=10), genefpkm_train))

    gene_cluster_train = to_data_frame(genetic_clustering.transform(genefpkm_train), prefix='GC', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, gene_cluster_train], axis=1)

    gene_cluster_valid = to_data_frame(genetic_clustering.transform(genefpkm_valid), prefix='GC', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, gene_cluster_valid], axis=1)

    #
    # Dense matrices of the selected gene expressions, missing values set to 0
    #
    x_train = genefpkm_train.fillna(0).values
    x_valid = genefpkm_valid.fillna(0).values

    #
    # Denoising Autoencoder
    #
    print('Denoising autoencoder')

    model_name = 'data_augmentation_adam_fold_{}'.format(i)
    checkpoint = 'output/deep_models/{0}/graph/{0}'.format(model_name)

    dae = DenoisingAutoencoder(model_name=model_name, summaries_dir='output/deep_models/', verbose=1)

    denoising_scaler = MinMaxScaler()

    # NOTE(review): only x_train is scaled; x_valid is left unscaled and is not used
    # further in this cell -- apply denoising_scaler.transform before feeding it to dae.
    x_train = denoising_scaler.fit_transform(x_train)

    # Train only when no saved graph exists for this fold; the model is then
    # loaded from disk in both cases.
    if not os.path.exists(checkpoint + '.meta'):

        n_inputs = x_train.shape[1]

        dae.build(n_inputs=n_inputs,
                  encoder_units=(int(n_inputs * .9), int(n_inputs * .8), int(n_inputs * .7)),
                  decoder_units=(int(n_inputs * .8), int(n_inputs * .9)),
                  encoder_activation_function='relu', decoder_activation_function='identity', l2_scale=0.01)

        dae.fit(x_train, steps=10000, optimizer='adam', loss='mse', learning_rate=1e-2)

    dae.load(checkpoint)

    #
    # Quick visual check: first scaled input row against its reconstruction
    #
    print(x_train[:1,:5])
    print(dae.predict(x_train)[:1,:5])

    print('')

Fold #1
Selecting gene expressions
Computing genetic profling
Computing genetic clustering
Denoising autoencoder
INFO:tensorflow:Restoring parameters from output/deep_models/data_augmentation_adam_fold_0/graph/data_augmentation_adam_fold_0
[[0.15893324 0.3002057  0.19514278 0.2573063  0.18912288]]
[[ 0.05198449  0.07265677 -0.05117178  0.04844414 -0.05764659]]

Fold #2
Selecting gene expressions
Computing genetic profling
Computing genetic clustering
Denoising autoencoder
early stopping after 1001 iterations without improvements: best metri value 0.07888059318065643
INFO:tensorflow:Restoring parameters from output/deep_models/data_augmentation_adam_fold_1/graph/data_augmentation_adam_fold_1
[[0.00060812 0.33186943 0.44220612 0.57802692 0.30839268]]
[[-0.0552588  -0.00379574  0.07688395 -0.0635755  -0.08078298]]

Fold #3
Selecting gene expressions
Computing genetic profling
Computing genetic clustering
Denoising autoencoder
early stopping after 1001 iterations without improvements: best