# Import Required Libraries

In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from scipy.stats import ks_2samp
from model import OptimizedKMeans
from model import GeneticProfiling, GeneticClustering
from model import DenoisingAutoencoder
from correlation import select_genes
from util import to_data_frame
from itertools import compress
from datetime import datetime

import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os

# Loading Data

In [2]:
clinical = pd.read_csv('data/clinical_brfl.tsv', sep='\t', index_col='ID')

genefpkm = pd.read_csv('data/gene_fpkm.tsv', sep='\t', index_col='ID')

selected_index = clinical.join(genefpkm, how='inner').index

clinical = clinical.loc[selected_index,:]

clinical['response_best_response_first_line'] = clinical['response_best_response_first_line'].astype(int)

genefpkm = genefpkm.loc[selected_index,:]

# Defining General Classification Params

In [3]:
params = {'boosting_type': 'gbdt', 
          'objective': 'binary',
          'num_class': 1,
          'metric': 'logloss',
          'learning_rate': 0.01, 
          'num_leaves': 31, 
          'max_depth': 4,  
          'min_child_samples': 20, 
          'max_bin': 255,  
          'subsample': 0.8, 
          'subsample_freq': 0,  
          'colsample_bytree': 0.3,  
          'min_child_weight': 5, 
          'subsample_for_bin': 200000,
          'min_split_gain': 0, 
          'reg_alpha': 0, 
          'reg_lambda': 0, 
          'nthread': 6, 
          'verbose': 0}

# Transforming Qualitative Variables into Dummy Ones

In [4]:
for column in clinical:
    
    values = clinical[column]
    
    if values.dtype == 'object':
        
        values = pd.get_dummies(values)
        
        values.columns = [column + '_' + str(c).lower().replace(' ', '_') for c in values.columns]
    
        del clinical[column]
    
        clinical = clinical.join(values, how='inner')

clinical = clinical.fillna(0)

clinical.iloc[:8,:8]

Unnamed: 0_level_0,response_best_response_first_line,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF1029,0,0.0,8.4,0.0,106.08,1,2.6,219.0
MMRF1030,1,15.4,9.6,0.0,55.692,1,2.5,215.0
MMRF1031,0,18.3,10.1,0.0,81.328,1,10.29,385.0
MMRF1032,0,20.7,11.1,0.0,70.72,2,1.3,166.0
MMRF1033,0,18.5,12.0,0.0,79.56,1,3.99,307.0
MMRF1037,0,20.7,17.0,0.0,70.72,1,3.2,361.0
MMRF1038,0,29.0,22.0,0.0,97.24,3,5.89,310.0
MMRF1048,0,0.0,9.6,0.6,60.112,1,2.1,215.0


In [7]:
from model import Model
import tensorflow as tf
import numpy as np
import os


class DenoisingAutoencoder(Model):

    def __init__(self, model_name=None, summaries_dir='../output/'):

        self.graph = tf.Graph()

        with self.graph.as_default():

            self.session = tf.Session(graph=self.graph)

            self.model_name = model_name

            self.input = None

            self.corrupted_input = None

            self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')

            self.add_summaries = summaries_dir is not None

            self.summaries_dir = summaries_dir

            self.keep_probability = tf.placeholder(tf.float32, name='keep_probability')

            self.batch_size = None
            
            self.layer_index = None

    def build(self, n_inputs, encoder_units=(128,), decoder_units=(128,), encoder_activation_function='sigmoid', decoder_activation_function='identity'):

        assert isinstance(encoder_units, tuple) and len(encoder_units) > 0, 'encoder_units should tuple with at least one element'
        
        assert isinstance(decoder_units, tuple) and len(decoder_units) > 0, 'decoder_units should tuple with at least one element'
        
        with self.graph.as_default():
            self.input = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name='input')

            with tf.name_scope('random_noise'):

                mask = tf.random_uniform(shape=tf.shape(self.input), minval=0, maxval=1, dtype=tf.float32, seed=None, name=None)

                mask = tf.where(mask <= self.keep_probability, tf.ones_like(self.input, dtype=tf.float32), tf.zeros_like(self.input, dtype=tf.float32))

                self.corrupted_input = tf.multiply(self.input, mask)

            with tf.name_scope('encoder'):

                self.encoder = self.corrupted_input
                
                for layer_index, units in enumerate(encoder_units):
                
                    name = 'last_encoder' if layer_index == len(encoder_units) - 1 else 'encoder_{}'.format(layer_index + 1)
               
                    self.encoder = tf.layers.dense(self.encoder, units, kernel_initializer=tf.truncated_normal_initializer(), 
                                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-4))
                    
                    self.encoder = self.get_activation_function(encoder_activation_function)(self.encoder, name=name)

            with tf.name_scope('decoder'):

                self.decoder = self.encoder
                
                for layer_index, units in enumerate(decoder_units):
                
                    self.decoder = tf.layers.dense(self.decoder, n_inputs, kernel_initializer=tf.truncated_normal_initializer(),
                                                  kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-4))

                    self.decoder = self.get_activation_function(encoder_activation_function)(self.decoder, name='decoder_{}'.format(layer_index + 1))
                    
                self.decoder = tf.layers.dense(self.decoder, n_inputs, kernel_initializer=tf.truncated_normal_initializer(),
                                               kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-4))

                self.decoder = self.get_activation_function(decoder_activation_function)(self.decoder, name='last_encoder')

            self.layer_index = layer_index + 1
                
            self.saver = tf.train.Saver()

            
    def fit(self, x, keep_probability=0.75, learning_rate=1e-4, steps=10000, batch_size=None, shuffle=True, optimizer='sgd', loss='mse'):
        '''
        
        '''
        assert steps > 0, 'steps should be an integer greater than zero'

        assert batch_size is None or 0 < batch_size <= x.shape[0], 'bath should be none or an integer between zero (exclusive) and number of input features (inclusive)'

        self.best_error = np.inf
        
        iterations_without_improvements = 0
        
        with self.graph.as_default():

            self.__build_optimizer(optimizer, loss)

            test_writer, test_results = None, None

            if batch_size is None:
                batch_size = x.shape[0]

            self.batch_size = batch_size

            with self.graph.as_default():

                self.session.run(tf.global_variables_initializer())

                self.session.run(tf.local_variables_initializer())
                        
                opt_metric_value = tf.placeholder(dtype=tf.float32, name='optimization_metric_ph')
        
                opt_metric_value_summary = tf.summary.scalar('mean_' + loss, opt_metric_value)

                if self.add_summaries:
                    test_writer = tf.summary.FileWriter(self.summaries_dir + '/{}'.format(self.model_name), tf.get_default_graph())

                n_rows = x.shape[0]

                index = np.array(list(range(n_rows)), dtype=np.int)

                j, logdata = 0, None
                
                
                for step in range(steps):
                    
                    logs = []
                    
                    current_block = 0

                    while current_block < n_rows:

                        if shuffle:
                            np.random.shuffle(index)

                        batch = list(range(current_block, (min(current_block + batch_size, n_rows))))

                        loss_value = self.session.run([self.optimizer, self.loss],
                                                        feed_dict={self.input: x[index[batch], :],
                                                                   self.learning_rate: learning_rate,
                                                                   self.keep_probability: keep_probability})[1]
                        
                        logs.append(loss_value)
                        
                        current_block += batch_size

                        j += 1

                    if self.add_summaries:
                        
                        summary_scalar = self.session.run(opt_metric_value_summary, feed_dict={opt_metric_value: np.mean(logs)})
                        
                        test_writer.add_summary(summary_scalar, step)

                    if step == steps - 1:
                        self.saver.save(self.session, '{0}/{1}/graph/{1}'.format(self.summaries_dir, self.model_name), global_step=step)

                    if self.best_error > np.mean(logs):
                        
                        iterations_without_improvements = 0
                        
                        self.best_error = np.mean(logs)
                        
                        self.saver.save(self.session, '{0}/{1}/graph/{1}__BESTONE__'.format(self.summaries_dir, self.model_name))
                        
                    else:
                        iterations_without_improvements += 1
                    
                    if iterations_without_improvements > 1000:
                        
                        print('early stopping after {} iterations without improvements: best metri value {}'.format(iterations_without_improvements, np.mean(logs)))
                        
                        break

    def predict(self, x):

        if self.batch_size is None:
            self.batch_size = 1000

        x_line = None

        start, end = 0, min(self.batch_size, x.shape[0])

        while start < x.shape[0]:

            with self.graph.as_default():
                x_ = self.session.run([self.decoder], feed_dict={self.input: x[start:end, :], self.keep_probability: 1.0})[0]

            if x_line is None:
                x_line = x_

            else:
                x_line = np.concatenate((x_line, x_), axis=0)

            start, end = end, min(x.shape[0], end + self.batch_size)

        return x_line

    def encode(self, x):

        if self.batch_size is None:
            self.batch_size = 1000

        x_line = None

        start, end = 0, min(self.batch_size, x.shape[0])

        with self.graph.as_default():

            while start < x.shape[0]:

                x_ = self.session.run([self.encoder], feed_dict={self.input: x[start:end, :],
                                                                 self.keep_probability: 1.0})[0]

                if x_line is None:
                    x_line = x_

                else:
                    x_line = np.concatenate((x_line, x_), axis=0)

                start, end = end, min(x.shape[0], end + self.batch_size)

        return x_line

    def transform(self, x):

        if self.batch_size is None:
            self.batch_size = 1000

        x_line = None

        start, end = 0, min(self.batch_size, x.shape[0])

        while start < x.shape[0]:

            with self.graph.as_default():
                x_ = self.session.run([tf.reduce_sum(tf.square(self.input - self.decoder), axis=1)],
                                      feed_dict={self.input: x[start:end, :], self.keep_probability: 1.0})[0]

            if x_line is None:
                x_line = x_

            else:
                x_line = np.concatenate((x_line, x_), axis=0)

            start, end = end, min(x.shape[0], end + self.batch_size)

        return x_line

    def get_error(self, x):

        if self.batch_size is None:
            self.batch_size = 1000

        x_line = None

        start, end = 0, min(self.batch_size, x.shape[0])

        while start < x.shape[0]:

            with self.graph.as_default():
                x_ = self.session.run([self.input - self.decoder],
                                      feed_dict={self.input: x[start:end, :], self.keep_probability: 1.0})[0]

            if x_line is None:
                x_line = x_

            else:
                x_line = np.concatenate((x_line, x_), axis=0)

            start, end = end, min(x.shape[0], end + self.batch_size)

        return x_line

    def fit_encode(self, x, keep_probability=0.75, learning_rate=1e-4, steps=1000, batch_size=None, shuffle=True):

        self.fit(x, keep_probability, learning_rate, steps, batch_size, shuffle)

        return self.encode(x)

    def fit_transform(self, x, keep_probability=0.75, learning_rate=1e-2, steps=1000, batch_size=None, shuffle=True):

        self.fit(x, keep_probability, learning_rate, steps, batch_size, shuffle)

        return self.transform(x)

    def load(self, model_path):

        if os.path.exists('{}.meta'.format(model_path)) and os.path.isfile('{}.meta'.format(model_path)):

            with self.graph.as_default():

                self.saver = tf.train.import_meta_graph('{}.meta'.format(model_path))

                self.saver.restore(self.session, tf.train.latest_checkpoint(os.path.dirname(model_path)))

                self.input = tf.get_default_graph().get_tensor_by_name('input:0')

                self.keep_probability = tf.get_default_graph().get_tensor_by_name('keep_probability_1:0')

                self.encoder = tf.get_default_graph().get_tensor_by_name('encoder/last_encoder:0')

                self.decoder = tf.get_default_graph().get_tensor_by_name('decoder/last_decoder:0')

    def __build_optimizer(self, optimizer, loss):

        with tf.name_scope('optimization'):

            with tf.name_scope('loss'):

                self.loss = self.get_loss(loss)(self.input, self.decoder)

                self.loss = tf.reduce_mean(self.loss)

                tf.summary.scalar('dae', self.loss)

            self.optimizer = self.get_optimizer(optimizer)(learning_rate=self.learning_rate)

            self.optimizer = self.optimizer.minimize(self.loss, name='optimizer')

        if self.add_summaries:
            #
            # Create summary tensors
            #
            self.merged = tf.summary.merge_all()

# Training Process

In [None]:
from collections import Counter

kfold = StratifiedKFold(10, random_state=13)

result = None
    
x, y = clinical.values[:, 1:], clinical.values[:, 0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):
    
    print('Fold #{}'.format(i + 1))
    
    #
    # Split train & valid
    #
    response_train = clinical.iloc[train_index, 0]
    response_valid = clinical.iloc[valid_index, 0]
    
    clinical_train = clinical.iloc[train_index, 1:]
    clinical_valid = clinical.iloc[valid_index, 1:]
    
    genefpkm_train = genefpkm.iloc[train_index, :]
    genefpkm_valid = genefpkm.iloc[valid_index, :]
    
    #
    # Select gene expressions
    #
    print('Selecting gene expressions')
    
    if os.path.isfile('output/selected_genes_fold_{}.pkl'.format(i)):
        with open('output/selected_genes_fold_{}.pkl'.format(i), 'rb') as file:
            selected_genes = pickle.load(file)
    
    else:
        
        selected_genes = select_genes(genefpkm_train, response_train)
        
        with open('output/selected_genes_fold_{}.pkl'.format(i), 'wb') as file:
            pickle.dump(selected_genes, file)
    
    genefpkm_train = genefpkm_train[selected_genes]
    
    genefpkm_valid = genefpkm_valid[selected_genes]
    
    #
    # Genetic Profiling
    #
    print('Computing genetic profling')
    
    if os.path.isfile('output/kmeans_genetic_profiling_fold_{}.pkl'.format(i)):
        
        with open('output/kmeans_genetic_profiling_fold_{}.pkl'.format(i), 'rb') as file:
            genetic_profiling = pickle.load(file)
        
    else:
        
        genetic_profiling = GeneticProfiling(random_state=10)

        genetic_profiling.fit(genefpkm_train)
        
        with open('output/kmeans_genetic_profiling_fold_{}.pkl'.format(i), 'wb') as file:
            pickle.dump(genetic_profiling, file)
        
    
    profiling_train = to_data_frame(genetic_profiling.transform(genefpkm_train), prefix='PV', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, profiling_train], axis=1)
    
    profiling_valid = to_data_frame(genetic_profiling.transform(genefpkm_valid), prefix='PV', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, profiling_valid], axis=1)
    
    #
    # Genetic Clustering
    #
    print('Computing genetic clustering')
    
    if os.path.isfile('output/kmeans_genetic_clustering_fold_{}.pkl'.format(i)):
        
        with open('output/kmeans_genetic_clustering_fold_{}.pkl'.format(i), 'rb') as file:
            genetic_clustering = pickle.load(file)
        
    else:
        
        genetic_clustering = GeneticClustering(random_state=10, verbose=0, early_stopping_rounds=10)

        genetic_clustering.fit(genefpkm_train)
        
        with open('output/kmeans_genetic_clustering_fold_{}.pkl'.format(i), 'wb') as file:
            pickle.dump(genetic_clustering, file)
    
    gene_cluster_train = to_data_frame(genetic_clustering.transform(genefpkm_train), prefix='GC', index=genefpkm_train.index)
    clinical_train = pd.concat([clinical_train, gene_cluster_train], axis=1)
    
    gene_cluster_valid = to_data_frame(genetic_clustering.transform(genefpkm_valid), prefix='GC', index=genefpkm_valid.index)
    clinical_valid = pd.concat([clinical_valid, gene_cluster_valid], axis=1)
    
    #
    #
    #
    x_train = clinical_train.join(genefpkm_train, how='inner').fillna(0).values
    x_valid = clinical_valid.join(genefpkm_valid, how='inner').fillna(0).values
    
    #
    # Denoising Autoencoder
    #
    print('Denoising autoencoder')
    
    from sklearn.preprocessing import MinMaxScaler
    
    scaler = MinMaxScaler()
    
    x_train = scaler.fit_transform(x_train)
    
    dae = DenoisingAutoencoder(model_name='001_data_augmentation_adagrad_fold_{}'.format(i), summaries_dir='output/deep_models/')
    
    dae.build(n_inputs=x_train.shape[1], 
              encoder_units=(int(x_train.shape[1] * .9), int(x_train.shape[1] * .8), int(x_train.shape[1] * .7)), 
              decoder_units=(int(x_train.shape[1] * .8), int(x_train.shape[1] * .9)), 
              encoder_activation_function='relu', decoder_activation_function='relu')

    dae.fit(x_train, batch_size=100, steps=10000, optimizer='adagrad', learning_rate=1e-6)
    
    print('')
    
    

Fold #1
Selecting gene expressions
Computing genetic profling
Computing genetic clustering
Denoising autoencoder
early stopping after 1001 iterations without improvements: best metri value 380812134449152.0

Fold #2
Selecting gene expressions
Computing genetic profling
Computing genetic clustering
Denoising autoencoder
