# Import Required Libraries

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from scipy.stats import ks_2samp
from model import OptimizedKMeans
from model import GeneticProfiling, GeneticClustering
from model import DenoisingAutoencoder
from correlation import select_genes
from util import to_data_frame
from itertools import compress
from datetime import datetime

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os

# Loading Data

In [None]:
# Read the clinical table and the gene-expression (FPKM) table. Both files are
# tab-separated and are indexed by their 'ID' column.
clinical = pd.read_csv(
    'data/clinical_brfl.tsv',
    sep='\t',
    index_col='ID',
)
genefpkm = pd.read_csv(
    'data/gene_fpkm.tsv',
    sep='\t',
    index_col='ID',
)

# IDs present in both tables: the index of the inner join of the two frames.
selected_index = clinical.join(genefpkm, how='inner').index

# Restrict both tables to the shared IDs so that row k of one table and row k
# of the other refer to the same ID.
clinical = clinical.loc[selected_index]
genefpkm = genefpkm.loc[selected_index]

# Store the response column as integers.
clinical['response_best_response_first_line'] = (
    clinical['response_best_response_first_line'].astype(int)
)

# Defining General Classification Params

In [None]:
# General LightGBM parameters for the binary response classifier.
# The keys are the scikit-learn style aliases accepted by LightGBM.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_class': 1,
    # 'logloss' is not a metric name documented by LightGBM; the log-loss
    # metric for a binary objective is 'binary_logloss'.
    'metric': 'binary_logloss',
    'learning_rate': 0.01,
    # NOTE(review): a tree of depth 4 has at most 2**4 = 16 leaves, so
    # max_depth is the binding limit here rather than num_leaves.
    'num_leaves': 31,
    'max_depth': 4,
    'min_child_samples': 20,
    'max_bin': 255,
    # NOTE(review): per the LightGBM documentation, row subsampling (bagging)
    # is only performed when subsample_freq is non-zero, so with
    # subsample_freq == 0 the 0.8 below has no effect. Left unchanged so
    # training results are not altered; set subsample_freq >= 1 to enable it.
    'subsample': 0.8,
    'subsample_freq': 0,
    'colsample_bytree': 0.3,
    'min_child_weight': 5,
    'subsample_for_bin': 200000,
    'min_split_gain': 0,
    'reg_alpha': 0,
    'reg_lambda': 0,
    'nthread': 6,
    'verbose': 0,
}

# Transforming Qualitative Variables into Dummy Ones

In [None]:
# Replace every text (object-dtype) column of the clinical table by one
# indicator column per distinct value. Indicator columns are named
# '<original column>_<value>', with the value lower-cased and its spaces
# turned into underscores.
categorical_columns = [name for name in clinical if clinical[name].dtype == 'object']

for name in categorical_columns:

    indicators = pd.get_dummies(clinical[name])

    indicators.columns = [
        '_'.join((name, str(level).lower().replace(' ', '_')))
        for level in indicators.columns
    ]

    # Drop the original text column and append its indicator columns.
    clinical = clinical.drop(name, axis=1).join(indicators, how='inner')

# Remaining missing values are encoded as 0.
clinical = clinical.fillna(0)

# Notebook preview of the top-left corner of the encoded table.
clinical.iloc[:8, :8]

# Training Process

In [None]:
from collections import Counter

from sklearn.preprocessing import MinMaxScaler


def _load_or_compute(path, compute):
    """Return the object cached at *path*, computing and caching it if absent.

    Parameters
    ----------
    path : str
        Location of the pickle file used as cache.
    compute : callable
        Zero-argument function that builds the object when no cache exists.

    Returns
    -------
    object
        The unpickled cache content, or the freshly computed object.
    """
    if os.path.isfile(path):
        # NOTE: pickle.load executes arbitrary code embedded in the file;
        # only point this at cache files written by this notebook.
        with open(path, 'rb') as file:
            return pickle.load(file)

    computed = compute()

    # Make sure the cache directory exists so an expensive computation is not
    # lost because the result could not be written.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'wb') as file:
        pickle.dump(computed, file)

    return computed


def _fitted(model, data):
    """Fit *model* on *data* and return the model itself."""
    model.fit(data)
    return model


# random_state is omitted on purpose: without shuffle=True it has no effect on
# the splits, and recent scikit-learn versions raise a ValueError when it is
# given without shuffling. The folds are therefore the same deterministic,
# unshuffled folds as before, which keeps the per-fold cache files valid.
kfold = StratifiedKFold(n_splits=10)

result = None

# Column 0 of the clinical table is the response; the remaining columns are
# the clinical features. The response is read from its own column so it keeps
# its integer dtype instead of the dtype of the whole mixed-type matrix.
x, y = clinical.iloc[:, 1:].values, clinical.iloc[:, 0].values

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    print('Fold #{}'.format(i + 1))

    #
    # Split train & valid
    #
    response_train = clinical.iloc[train_index, 0]
    response_valid = clinical.iloc[valid_index, 0]

    clinical_train = clinical.iloc[train_index, 1:]
    clinical_valid = clinical.iloc[valid_index, 1:]

    # Positional indexing: relies on genefpkm having the same row order as
    # clinical.
    genefpkm_train = genefpkm.iloc[train_index, :]
    genefpkm_valid = genefpkm.iloc[valid_index, :]

    #
    # Select gene expressions (computed on the training fold only, cached per
    # fold; cache files are numbered from 0)
    #
    print('Selecting gene expressions')

    selected_genes = _load_or_compute(
        'output/selected_genes_fold_{}.pkl'.format(i),
        lambda: select_genes(genefpkm_train, response_train))

    genefpkm_train = genefpkm_train[selected_genes]

    genefpkm_valid = genefpkm_valid[selected_genes]

    #
    # Genetic Profiling
    #
    print('Computing genetic profling')

    genetic_profiling = _load_or_compute(
        'output/kmeans_genetic_profiling_fold_{}.pkl'.format(i),
        lambda: _fitted(GeneticProfiling(random_state=10), genefpkm_train))

    profiling_train = to_data_frame(genetic_profiling.transform(genefpkm_train), prefix='PV', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, profiling_train], axis=1)

    profiling_valid = to_data_frame(genetic_profiling.transform(genefpkm_valid), prefix='PV', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, profiling_valid], axis=1)

    #
    # Genetic Clustering
    #
    print('Computing genetic clustering')

    genetic_clustering = _load_or_compute(
        'output/kmeans_genetic_clustering_fold_{}.pkl'.format(i),
        lambda: _fitted(GeneticClustering(random_state=10, verbose=0, early_stopping_rounds=10), genefpkm_train))

    gene_cluster_train = to_data_frame(genetic_clustering.transform(genefpkm_train), prefix='GC', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, gene_cluster_train], axis=1)

    gene_cluster_valid = to_data_frame(genetic_clustering.transform(genefpkm_valid), prefix='GC', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, gene_cluster_valid], axis=1)

    #
    # Assemble the final feature matrices: clinical features, profiling and
    # clustering features, and the selected gene expressions
    #
    x_train = clinical_train.join(genefpkm_train, how='inner').fillna(0).values
    x_valid = clinical_valid.join(genefpkm_valid, how='inner').fillna(0).values

    #
    # Denoising Autoencoder
    #
    print('Denoising autoencoder')

    scaler = MinMaxScaler()

    # The scaler is fitted on the training fold only.
    # NOTE(review): x_valid is not passed through `scaler` anywhere in the
    # visible code; apply scaler.transform(x_valid) before using it with
    # anything trained on the scaled x_train.
    x_train = scaler.fit_transform(x_train)

    n_inputs = x_train.shape[1]

    dae = DenoisingAutoencoder(model_name='001_data_augmentation_adagrad_fold_{}'.format(i), summaries_dir='output/deep_models/')

    # Layer widths are fixed fractions of the input width: the encoder narrows
    # through 90%, 80% and 70%, the decoder widens through 80% and 90%.
    dae.build(n_inputs=n_inputs,
              encoder_units=(int(n_inputs * .9), int(n_inputs * .8), int(n_inputs * .7)),
              decoder_units=(int(n_inputs * .8), int(n_inputs * .9)),
              encoder_activation_function='relu', decoder_activation_function='relu')

    dae.fit(x_train, batch_size=100, steps=10000, optimizer='adagrad', learning_rate=1e-6)

    print('')
    
    