In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_Ensemble_OHE_Kmer_Kgap_2"
outPath = "Results"
foldName = "folds.pickle"

# epochs = 100
# batch_size = 64
shuffle = True
seed = None

ohe_input_data_folder = "Data"
ohe_training_data_file = "Training-datasets-PredNTS.txt"
ohe_independent_data_file = "independent dataset-PredNTS.txt"

enc_data_folder = "PredNTS_MathFeature_ENC"
kmer_train_data_filename = 'Training-datasets-PredNTS_kmer.csv'
kmer_indpe_data_filename = 'independent-dataset-PredNTS_kmer.csv'

kgap_max = 4
kgap_train_data_filename = 'Training-datasets-PredNTS_kgap_{}.csv'
kgap_indpe_data_filename = 'independent-dataset-PredNTS_kgap_{}.csv'

callback_monitor = 'val_accuracy'

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression

import math

In [3]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_nt(sequence, char_dict):
    
    seq_encoded = np.zeros((len(sequence),len(char_dict)))
    
    i = 0
    for single_character in sequence:
        if(single_character.upper() in char_dict.keys()):
            seq_encoded[i][char_dict[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in NT sequence: '+sequence)
    return seq_encoded

In [5]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(ohe_features, kmer_features, kgap_features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(ohe_features, labels):
        kfoldList.append({
            "X_OHE_train": ohe_features[train_index],
            "X_OHE_test": ohe_features[test_index],
            "X_Kmer_train": kmer_features[train_index],
            "X_Kmer_test": kmer_features[test_index],
            "X_Kgap_train": kgap_features[train_index],
            "X_Kgap_test": kgap_features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index]
        })
    return kfoldList

In [6]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

# Neural network models

In [7]:
ohe_epochs = 200
ohe_batch_size = 16
    
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def OHE_DLNN_CORENup(input_seq_shape = (41, 21),
                 conv_filters_per_layer_1 = 25, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
                 max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
                 lstm_decode_units = 25, ## LSTM layer parameters
                 conv_filters_per_layer_2 = 25,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
                 max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
                 dense_decode_units = 256, ## Dense layer parameters
                 prob = 0.5, learn_rate = 0.0005, 
                 loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    ######################################################################################################
    ########  SEQUENCE  ##################################################################################
    ######################################################################################################
    
    input1 = tf.keras.layers.Input(shape=input_seq_shape)

    x1 = tf.keras.layers.Conv1D(conv_filters_per_layer_1, kernel_length_1,
                                strides = conv_strides_1, kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                padding = "same")(input1)
    x1 = tf.keras.layers.Activation('relu')(x1)
    x1 = tf.keras.layers.MaxPool1D(pool_size = max_pool_width_1, strides = max_pool_stride_1)(x1)
    x1 = tf.keras.layers.Dropout(prob)(x1)
    
    x1 = tf.keras.layers.GaussianNoise(stddev=0.1)(x1)

    ## LSTM Path

    x2 = tf.keras.layers.LSTM(lstm_decode_units, return_sequences = True, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(x1)
    
    x2 = tf.keras.layers.Dropout(prob)(x2)
    
    x2 = tf.keras.layers.Flatten()(x2)
    
    x2 = tf.keras.layers.GaussianNoise(stddev=0.1)(x2)

    ## Conv Path

    x3 = tf.keras.layers.Conv1D(conv_filters_per_layer_2, kernel_length_2, strides = conv_strides_2, 
                                kernel_regularizer = tf.keras.regularizers.l2(beta), padding = 'same')(x1)
    x3 = tf.keras.layers.Activation('relu')(x3)
    x3 = tf.keras.layers.MaxPooling1D(pool_size = max_pool_width_2, strides = max_pool_stride_2)(x3)
    x3 = tf.keras.layers.Dropout(prob)(x3)
    
    x3 = tf.keras.layers.Flatten()(x3)
    
    x3 = tf.keras.layers.GaussianNoise(stddev=0.1)(x3)
    
    x4 = tf.keras.layers.Concatenate(1)([x2,x3])
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(x4)
    
    y = tf.keras.layers.GaussianNoise(stddev=0.1)(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss)

    return model

In [8]:
kmer_epochs = 200
kmer_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def Kmer_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 8, ## Dense layer parameters,
                    dense_layers = 2,
                    prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(input1)
    y = tf.keras.layers.BatchNormalization()(y)
    y = tf.keras.layers.Dropout(prob)(y)
    
    for i in range(1,dense_layers+1):
    
        y = tf.keras.layers.Dense(int(dense_decode_units/(2**i)), 
                                  kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                 )(y)
        y = tf.keras.layers.BatchNormalization()(y)
        y = tf.keras.layers.Dropout(prob)(y) 
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [9]:
kgap_epochs = 200
kgap_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def Kgap_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 128, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/4), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [10]:
OHE_DLNN_CORENup().summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 41, 21)]     0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 41, 25)       5275        ['input_1[0][0]']                
                                                                                                  
 activation (Activation)        (None, 41, 25)       0           ['conv1d[0][0]']                 
                                                                                                  
 max_pooling1d (MaxPooling1D)   (None, 13, 25)       0           ['activation[0][0]']             
                                                                                              

In [11]:
Kmer_DLNN_Classifier((8420,)).summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 8420)]            0         
                                                                 
 dense_2 (Dense)             (None, 8)                 67368     
                                                                 
 batch_normalization (BatchN  (None, 8)                32        
 ormalization)                                                   
                                                                 
 dropout_4 (Dropout)         (None, 8)                 0         
                                                                 
 dense_3 (Dense)             (None, 4)                 36        
                                                                 
 batch_normalization_1 (Batc  (None, 4)                16        
 hNormalization)                                           

In [12]:
Kgap_DLNN_Classifier((2000,)).summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 2000)]            0         
                                                                 
 dense_6 (Dense)             (None, 128)               256128    
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 32)                2080      
                                                                 
 dropout_9 (Dropout)         (None, 32)                0   

# Training data preparation

In [13]:
##################################################################################
##### read SEQUENCE training file
##################################################################################
ohe_train_file_path = os.path.join(ohe_input_data_folder, ohe_training_data_file)
ohe_train_data = pd.read_csv(ohe_train_file_path, sep='\t', header=None)
ohe_train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
ohe_train_data.head()

##################################################################################
##### Create dictionary of all characters in the NT sequence 
##################################################################################
all_char_set = set({})
for val in [set(val) for val in ohe_train_data['Sequence']]:
    all_char_set = all_char_set.union(val)
all_char_list = list(all_char_set)
all_char_list.sort()
all_char_dict = {}
for i in range(len(all_char_list)):
    all_char_dict[all_char_list[i]] = i
    
##################################################################################
##### Create OHE of sequence
##################################################################################
ohe_train_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in ohe_train_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
ohe_train_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in ohe_train_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

ohe_train_features = np.array(list(ohe_train_data['OHE_Sequence']))
labels = np.array(list(ohe_train_data['label']))
labels = labels.reshape((labels.shape[0], 1))

ohe_input_seq_shape = ohe_train_features[0].shape

############################################################################################################
############################################################################################################
##### Read KMER data
############################################################################################################
############################################################################################################

kmer_train_data_filepath = os.path.join(enc_data_folder, kmer_train_data_filename)
kmer_train_data = pd.read_csv(kmer_train_data_filepath, sep=',', header=0)
kmer_train_data = kmer_train_data.drop('label', axis=1)

kmer_train_data['label'] = pd.Series([int(val.split('_')[-2])
                                      for val in kmer_train_data['nameseq']])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

kmer_train_features = np.array(kmer_train_data.drop('label', axis=1).drop('nameseq', axis=1))
# train_labels = np.array(train_data['label'])
# train_labels = train_labels.reshape((train_labels.shape[0], 1))

kmer_input_vec_shape = kmer_train_features[0].shape

############################################################################################################
############################################################################################################
##### Read Kgap data
############################################################################################################
############################################################################################################

for i in range(kgap_max+1):
    
    kgap_current_train_data_filepath = os.path.join(enc_data_folder, kgap_train_data_filename.format(i))
    kgap_current_train_data = pd.read_csv(kgap_current_train_data_filepath, sep=',', header=0)
    kgap_current_train_data = kgap_current_train_data.drop('label', axis=1)
    
    if i == 0:
        kgap_train_data = kgap_current_train_data
    else:
        kgap_train_data = pd.merge(
            kgap_train_data,
            kgap_current_train_data,
            how="inner",
            on='nameseq'
        )

kgap_train_data['label'] = pd.Series([int(val.split('_')[-2])
                                      for val in kgap_train_data['nameseq']])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

kgap_train_features = np.array(kgap_train_data.drop('label', axis=1).drop('nameseq', axis=1))
# train_labels = np.array(train_data['label'])
# train_labels = train_labels.reshape((train_labels.shape[0], 1))

kgap_input_vec_shape = kgap_train_features[0].shape

############################################################################################################
############################################################################################################
##### Build folds
############################################################################################################
############################################################################################################

folds = build_kfold(ohe_train_features, kmer_train_features, kgap_train_features, labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

# Independent data preparation

In [14]:
##################################################################################
##### read independent data file
##################################################################################
ohe_indpe_file_path = os.path.join(ohe_input_data_folder, ohe_independent_data_file)
ohe_indpe_data = pd.read_csv(ohe_indpe_file_path, sep='\t', header=None)
ohe_indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
ohe_indpe_data.head()
    
##################################################################################
##### Create OHE of sequence
##################################################################################
ohe_indpe_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in ohe_indpe_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
ohe_indpe_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in ohe_indpe_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

ohe_indpe_features = np.array(list(ohe_indpe_data['OHE_Sequence']))
indpe_labels = np.array(list(ohe_indpe_data['label']))
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

############################################################################################################
############################################################################################################
##### Read Kmer data
############################################################################################################
############################################################################################################

kmer_indpe_data_filepath = os.path.join(enc_data_folder, kmer_indpe_data_filename)
kmer_indpe_data = pd.read_csv(kmer_indpe_data_filepath, sep=',', header=0)
kmer_indpe_data = kmer_indpe_data.drop('label', axis=1)

kmer_indpe_data['label'] = pd.Series([int(val.split('_')[-2])
                                 for val in kmer_indpe_data['nameseq']])

# kmer_indpe_data = kmer_indpe_data.drop('nameseq', axis=1)

##################################################################################
##### Extract features and labels, create folds
##################################################################################

kmer_indpe_features = np.array(kmer_indpe_data.drop('label', axis=1).drop('nameseq', axis=1))
# kmer_indpe_labels = np.array(kmer_indpe_data['label'])
# kmer_indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))


############################################################################################################
############################################################################################################
##### Read Kgap data
############################################################################################################
############################################################################################################

for i in range(kgap_max+1):

    kgap_current_indpe_data_filepath = os.path.join(enc_data_folder, kgap_indpe_data_filename.format(i))
    kgap_current_indpe_data = pd.read_csv(kgap_current_indpe_data_filepath, sep=',', header=0)
    kgap_current_indpe_data = kgap_current_indpe_data.drop('label', axis=1)
    
    if i == 0:
        kgap_indpe_data = kgap_current_indpe_data
    else:
        kgap_indpe_data = pd.merge(
            kgap_indpe_data,
            kgap_current_indpe_data,
            how="inner",
            on='nameseq'
        )

kgap_indpe_data['label'] = pd.Series([int(val.split('_')[-2])
                                      for val in kgap_indpe_data['nameseq']])

# kgap_indpe_data = kgap_indpe_data.drop('nameseq', axis=1)

##################################################################################
##### Extract features and labels, create folds
##################################################################################

kgap_indpe_features = np.array(kgap_indpe_data.drop('label', axis=1).drop('nameseq', axis=1))
# kgap_indpe_labels = np.array(kgap_indpe_data['label'])
# kgap_indpe_labels = indpe_labels.reshape((kgap_indpe_labels.shape[0], 1))


# Training the ensemble

In [15]:
folds[0].keys()

dict_keys(['X_OHE_train', 'X_OHE_test', 'X_Kmer_train', 'X_Kmer_test', 'X_Kgap_train', 'X_Kgap_test', 'y_train', 'y_test'])

In [16]:
## Create and set directory to save all models
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

In [17]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################



## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

i = -1
for fold in folds:
    i += 1
    
    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    # adding random shuffling of the dataset for training purpose
    index_arr = np.arange(fold["X_OHE_train"].shape[0])
    index_arr = np.random.permutation(index_arr)
    
    ##################################################################################
    ##### Train OHE network
    ##################################################################################
    
    print("Training OHE network.")
    
    ohe_model = OHE_DLNN_CORENup(input_seq_shape = ohe_input_seq_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(ohe_current_model_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    ohe_model.fit(x = fold["X_OHE_train"][index_arr], y = fold["y_train"][index_arr], 
                  batch_size = ohe_batch_size, epochs = ohe_epochs, verbose = 0, 
                  callbacks = ohe_modelCallbacks, validation_data = (fold["X_OHE_test"], fold["y_test"]))
    
    del ohe_model
    tf.keras.backend.clear_session()
    
#     ohe_model = tf.keras.models.load_model(ohe_current_model_path)
    
    ##################################################################################
    ##### Train Kmer network
    ##################################################################################
    
    print("Training Kmer network.")
    
    kmer_model = Kmer_DLNN_Classifier(input_vec_shape = kmer_input_vec_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(kmer_current_model_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    kmer_model.fit(x = fold["X_Kmer_train"][index_arr], y = fold["y_train"][index_arr], 
                   batch_size = kmer_batch_size, epochs = kmer_epochs, verbose = 0, 
                   callbacks = kmer_modelCallbacks, validation_data = (fold["X_Kmer_test"], fold["y_test"]))
    
    del kmer_model
    tf.keras.backend.clear_session()
    
#     kmer_model = tf.keras.models.load_model(kmer_current_model_path)
    
    ##################################################################################
    ##### Train Kgap network
    ##################################################################################
    
    print("Training Kgap network.")
    
    kgap_model = Kgap_DLNN_Classifier(input_vec_shape = kgap_input_vec_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(kgap_current_model_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    kgap_model.fit(x = fold["X_Kgap_train"][index_arr], y = fold["y_train"][index_arr], 
                   batch_size = kgap_batch_size, epochs = kgap_epochs, verbose = 0, 
                   callbacks = kgap_modelCallbacks, validation_data = (fold["X_Kgap_test"], fold["y_test"]))
    
    del kgap_model
    tf.keras.backend.clear_session()
    
#     kgap_model = tf.keras.models.load_model(kgap_current_model_path)
    
    ##################################################################################
    ##### Generate scores for Train dataset
    ##################################################################################
    
    print("Generating the 3 scores.")
    
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)
    ohe_train_y_pred = ohe_model.predict(fold["X_OHE_train"])
    ohe_test_y_pred = ohe_model.predict(fold["X_OHE_test"])
    
    del ohe_model
    tf.keras.backend.clear_session()
    
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)
    kmer_train_y_pred = kmer_model.predict(fold["X_Kmer_train"])
    kmer_test_y_pred = kmer_model.predict(fold["X_Kmer_test"])
    
    del kmer_model
    tf.keras.backend.clear_session()
    
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)
    kgap_train_y_pred = kgap_model.predict(fold["X_Kgap_train"])
    kgap_test_y_pred = kgap_model.predict(fold["X_Kgap_test"])
    
    del kgap_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### Linear regression using the 3 scores
    ##################################################################################
    
    print("Training linear regression.")
    
    X_lr_train = np.concatenate((ohe_train_y_pred, kmer_train_y_pred, kgap_train_y_pred), axis=1)
    X_lr_test = np.concatenate((ohe_test_y_pred, kmer_test_y_pred, kgap_test_y_pred), axis=1)
    
    # lr_model = LogisticRegression(penalty='elasticnet')
    lr_model = LogisticRegression(class_weight={0:1, 1:1})
    lr_model.fit(X_lr_train, fold["y_train"].reshape((fold["y_train"].shape[0])))
    
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'wb')
    pickle.dump(lr_model, lr_model_file_obj)
    lr_model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating train set metrics.")
    
    y_pred = lr_model.predict(X_lr_train)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating test set metrics.")

    y_pred = lr_model.predict(X_lr_test)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)


Train/Test model on Fold #0.
Training OHE network.
Training Kmer network.
Training Kgap network.
Generating the 3 scores.
Training linear regression.
Generating train set metrics.
Generating test set metrics.

Train/Test model on Fold #1.
Training OHE network.
Training Kmer network.
Training Kgap network.
Generating the 3 scores.
Training linear regression.
Generating train set metrics.
Generating test set metrics.

Train/Test model on Fold #2.
Training OHE network.
Training Kmer network.
Training Kgap network.
Generating the 3 scores.
Training linear regression.
Generating train set metrics.
Generating test set metrics.

Train/Test model on Fold #3.
Training OHE network.
Training Kmer network.
Training Kgap network.
Generating the 3 scores.
Training linear regression.
Generating train set metrics.
Generating test set metrics.

Train/Test model on Fold #4.
Training OHE network.
Training Kmer network.
Training Kgap network.
Generating the 3 scores.
Training linear regression.
Generatin

## k-fold Training evaluation

In [18]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test,0.820328,0.826885,0.820342,0.811139,0.829545,0.641813
Train,0.994753,0.995174,0.994753,0.994333,0.995172,0.989508


In [19]:
# linear regression
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.790491	0.800674	0.864737	0.774104	0.806870	0.581834
# Train	0.996222	0.997270	0.999819	0.995172	0.997272	0.992449

In [20]:
evaluations_df[evaluations_df["Train_Test"] == "Test"]

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
1,0,Test,0.798742,0.835681,"[0.0, 0.7447698744769874, 1.0]","[0.0, 0.14705882352941177, 1.0]","[2, 1, 0]",0.798856,0.74477,0.852941,0.601156
3,1,Test,0.819706,0.833333,"[0.0, 0.7983193277310925, 1.0]","[0.0, 0.1589958158995816, 1.0]","[2, 1, 0]",0.819662,0.798319,0.841004,0.639943
5,2,Test,0.82563,0.835498,"[0.0, 0.8109243697478992, 1.0]","[0.0, 0.15966386554621848, 1.0]","[2, 1, 0]",0.82563,0.810924,0.840336,0.651542
7,3,Test,0.838235,0.825911,"[0.0, 0.8571428571428571, 1.0]","[0.0, 0.18067226890756302, 1.0]","[2, 1, 0]",0.838235,0.857143,0.819328,0.676955
9,4,Test,0.819328,0.804,"[0.0, 0.8445378151260504, 1.0]","[0.0, 0.20588235294117646, 1.0]","[2, 1, 0]",0.819328,0.844538,0.794118,0.639469


In [21]:
# linear regression
# 	Fold	Train_Test	Accuracy	Precision	TPR	FPR	TPR_FPR_Thresholds	AUC	Sensitivity	Specificity	MCC
# 1	0	Test	0.828092	0.836910	[0.0, 0.0041841004184100415, 0.171548117154811...	[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...	[2.000568152515262, 1.000568152515262, 0.99902...	0.896171	0.815900	0.840336	0.656409
# 3	1	Test	0.802935	0.815789	[0.0, 0.004201680672268907, 0.0714285714285714...	[0.0, 0.0, 0.0, 0.0041841004184100415, 0.00418...	[2.0062331167570546, 1.0062331167570548, 1.006...	0.873264	0.781513	0.824268	0.606367
# 5	2	Test	0.787815	0.789030	[0.0, 0.012605042016806723, 0.0294117647058823...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00420168...	[2.003658229522343, 1.003658229522343, 1.00365...	0.859270	0.785714	0.789916	0.575635
# 7	3	Test	0.771008	0.765432	[0.0, 0.012605042016806723, 0.0252100840336134...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0042016806722...	[1.9996688129724591, 0.9996688129724591, 0.999...	0.841475	0.781513	0.760504	0.542136
# 9	4	Test	0.762605	0.796209	[0.0, 0.004201680672268907, 0.0840336134453781...	[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...	[2.0088285442808655, 1.0088285442808655, 1.008...	0.853506	0.705882	0.819328	0.528623

In [22]:
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Train,0.9979,1.0,"[0.0, 0.9957983193277311, 1.0]","[0.0, 0.0, 1.0]","[2, 1, 0]",0.997899,0.995798,1.0,0.995809
1,0,Test,0.798742,0.835681,"[0.0, 0.7447698744769874, 1.0]","[0.0, 0.14705882352941177, 1.0]","[2, 1, 0]",0.798856,0.74477,0.852941,0.601156
2,1,Train,0.995801,0.995803,"[0.0, 0.9958027282266527, 1.0]","[0.0, 0.004201680672268907, 1.0]","[2, 1, 0]",0.995801,0.995803,0.995798,0.991601
3,1,Test,0.819706,0.833333,"[0.0, 0.7983193277310925, 1.0]","[0.0, 0.1589958158995816, 1.0]","[2, 1, 0]",0.819662,0.798319,0.841004,0.639943
4,2,Train,0.993704,0.994742,"[0.0, 0.9926547743966422, 1.0]","[0.0, 0.005246589716684155, 1.0]","[2, 1, 0]",0.993704,0.992655,0.994753,0.98741
5,2,Test,0.82563,0.835498,"[0.0, 0.8109243697478992, 1.0]","[0.0, 0.15966386554621848, 1.0]","[2, 1, 0]",0.82563,0.810924,0.840336,0.651542
6,3,Train,0.991605,0.991605,"[0.0, 0.9916054564533053, 1.0]","[0.0, 0.008394543546694649, 1.0]","[2, 1, 0]",0.991605,0.991605,0.991605,0.983211
7,3,Test,0.838235,0.825911,"[0.0, 0.8571428571428571, 1.0]","[0.0, 0.18067226890756302, 1.0]","[2, 1, 0]",0.838235,0.857143,0.819328,0.676955
8,4,Train,0.994753,0.993717,"[0.0, 0.9958027282266527, 1.0]","[0.0, 0.0062959076600209865, 1.0]","[2, 1, 0]",0.994753,0.995803,0.993704,0.989509
9,4,Test,0.819328,0.804,"[0.0, 0.8445378151260504, 1.0]","[0.0, 0.20588235294117646, 1.0]","[2, 1, 0]",0.819328,0.844538,0.794118,0.639469


# Independent data

## Using k-fold Models

### Performance of each k-fold model

In [23]:
## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

for i in range(n_fold):
    
    ### OHE #####
    
    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)
    
    ohe_y_pred = ohe_model.predict(ohe_indpe_features)
    
    del ohe_model
    tf.keras.backend.clear_session()
    
    ### Kmer #####
    
    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)
    
    kmer_y_pred = kmer_model.predict(kmer_indpe_features)
    
    del kmer_model
    tf.keras.backend.clear_session()
    
    ### Kgap #####
    
    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)
    
    kgap_y_pred = kgap_model.predict(kgap_indpe_features)
    
    del kgap_model
    tf.keras.backend.clear_session()
    
    ### LR #####
    
    lr_indpe_features = np.concatenate((ohe_y_pred, kmer_y_pred, kgap_y_pred), axis=1)
    
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'rb')
    lr_model = pickle.load(lr_model_file_obj)
    lr_model_file_obj.close()
    
    ############

    y_pred = lr_model.predict(lr_indpe_features)
    label_pred = pred2label(y_pred)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(indpe_labels, label_pred)
    prec = precision_score(indpe_labels,label_pred)
    mcc = matthews_corrcoef(indpe_labels, label_pred)

    conf = confusion_matrix(indpe_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
    auc = roc_auc_score(indpe_labels, y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Independent")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.630041,0.254045,0.631821,0.634483,0.629159,0.199166


In [24]:
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Independent,0.609796,0.237094,"[0.0, 0.6108374384236454, 1.0]","[0.0, 0.3904109589041096, 1.0]","[2, 1, 0]",0.610213,0.610837,0.609589,0.165698
1,1,Independent,0.617143,0.242248,"[0.0, 0.6157635467980296, 1.0]","[0.0, 0.38258317025440314, 1.0]","[2, 1, 0]",0.61659,0.615764,0.617417,0.175597
2,2,Independent,0.632653,0.25641,"[0.0, 0.6403940886699507, 1.0]","[0.0, 0.36888454011741684, 1.0]","[2, 1, 0]",0.635755,0.640394,0.631115,0.204971
3,3,Independent,0.660408,0.277662,"[0.0, 0.6551724137931034, 1.0]","[0.0, 0.3385518590998043, 1.0]","[2, 1, 0]",0.65831,0.655172,0.661448,0.241254
4,4,Independent,0.630204,0.256809,"[0.0, 0.6502463054187192, 1.0]","[0.0, 0.37377690802348335, 1.0]","[2, 1, 0]",0.638235,0.650246,0.626223,0.208307


### Mean score with k-fold models

In [25]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    ### OHE #####
    
    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)
    
    ohe_y_pred = ohe_model.predict(ohe_indpe_features)
    
    del ohe_model
    tf.keras.backend.clear_session()
    
    ### Kmer #####
    
    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)
    
    kmer_y_pred = kmer_model.predict(kmer_indpe_features)
    
    del kmer_model
    tf.keras.backend.clear_session()
    
    ### Kgap #####
    
    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)
    
    kgap_y_pred = kgap_model.predict(kgap_indpe_features)
    
    del kgap_model
    tf.keras.backend.clear_session()
    
    ### LR #####
    
    lr_indpe_features = np.concatenate((ohe_y_pred, kmer_y_pred, kgap_y_pred), axis=1)
    
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'rb')
    lr_model = pickle.load(lr_model_file_obj)
    lr_model_file_obj.close()
    
    ############

    y_pred = lr_model.predict(lr_indpe_features)
    
    total_pred += y_pred[:, np.newaxis]
    all_preds.append(y_pred[:, np.newaxis])
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.648163,0.266393,0.669086,0.640394,0.649706,0.220332


### Voting score with k-fold models

In [26]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    ### OHE #####
    
    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)
    
    ohe_y_pred = ohe_model.predict(ohe_indpe_features)
    
    del ohe_model
    tf.keras.backend.clear_session()
    
    ### Kmer #####
    
    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)
    
    kmer_y_pred = kmer_model.predict(kmer_indpe_features)
    
    del kmer_model
    tf.keras.backend.clear_session()
    
    ### Kgap #####
    
    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)
    
    kgap_y_pred = kgap_model.predict(kgap_indpe_features)
    
    del kgap_model
    tf.keras.backend.clear_session()
    
    ### LR #####
    
    lr_indpe_features = np.concatenate((ohe_y_pred, kmer_y_pred, kgap_y_pred), axis=1)
    
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'rb')
    lr_model = pickle.load(lr_model_file_obj)
    lr_model_file_obj.close()
    
    ############

    y_pred = lr_model.predict(lr_indpe_features)
    
    vote_pred = pred2label(y_pred[:, np.newaxis])
    total_pred += vote_pred
    all_preds.append(vote_pred)
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.648163,0.266393,0.669086,0.640394,0.649706,0.220332


## Using New Model

Train model on full data from training. Predict and evaluate on Independent data.

In [27]:
pos_indexes = np.where(indpe_labels==1)[0]
neg_indexes = np.random.permutation(np.where(indpe_labels==0)[0])[0:pos_indexes.shape[0]]
indpe_val_indexes = np.concatenate((pos_indexes, neg_indexes))

In [28]:
# adding random shuffling of the dataset for training purpose
index_arr = np.arange(ohe_train_features.shape[0])
index_arr = np.random.permutation(index_arr)

##################################################################################
##### Train OHE network
##################################################################################

print("Training OHE network.")

ohe_model = OHE_DLNN_CORENup(input_seq_shape = ohe_input_seq_shape)

## Define the model callbacks for early stopping and saving the model. Then train model
ohe_full_model_path = os.path.join(modelPath, "OHE_fullModel.hdf5".format(i))
ohe_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(ohe_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

ohe_model.fit(x = ohe_train_features[index_arr], y = labels[index_arr], 
              batch_size = ohe_batch_size, epochs = ohe_epochs, verbose = 0, 
              callbacks = ohe_modelCallbacks, 
              validation_data = (ohe_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes]))

del ohe_model
tf.keras.backend.clear_session()

##################################################################################
##### Train Kmer network
##################################################################################

print("Training Kmer network.")

kmer_model = Kmer_DLNN_Classifier(input_vec_shape = kmer_input_vec_shape)

## Define the model callbacks for early stopping and saving the model. Then train model
kmer_full_model_path = os.path.join(modelPath, "KMER_fullModel.hdf5".format(i))
kmer_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(kmer_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

kmer_model.fit(x = kmer_train_features[index_arr], y = labels[index_arr], 
               batch_size = kmer_batch_size, epochs = kmer_epochs, verbose = 0, 
               callbacks = kmer_modelCallbacks, 
               validation_data = (kmer_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes]))

del kmer_model
tf.keras.backend.clear_session()

##################################################################################
##### Train Kgap network
##################################################################################

print("Training Kgap network.")

kgap_model = Kgap_DLNN_Classifier(input_vec_shape = kgap_input_vec_shape)

## Define the model callbacks for early stopping and saving the model. Then train model
kgap_full_model_path = os.path.join(modelPath, "KGAP_fullModel.hdf5".format(i))
kgap_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(kgap_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

kgap_model.fit(x = kgap_train_features[index_arr], y = labels[index_arr], 
               batch_size = kgap_batch_size, epochs = kgap_epochs, verbose = 0, 
               callbacks = kgap_modelCallbacks, 
               validation_data = (kgap_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes]))

del kgap_model
tf.keras.backend.clear_session()

# kgap_model = tf.keras.models.load_model(kgap_current_model_path)

Training OHE network.
Training Kmer network.
Training Kgap network.
Generating the 3 scores.
Training linear regression.


In [40]:
##################################################################################
##### Generate scores for Train dataset
##################################################################################

print("Generating the 3 scores.")

ohe_model = tf.keras.models.load_model(ohe_full_model_path)
ohe_train_y_pred = ohe_model.predict(ohe_train_features)
ohe_indpe_y_pred = ohe_model.predict(ohe_indpe_features)

del ohe_model
tf.keras.backend.clear_session()

kmer_model = tf.keras.models.load_model(kmer_full_model_path)
kmer_train_y_pred = kmer_model.predict(kmer_train_features)
kmer_indpe_y_pred = kmer_model.predict(kmer_indpe_features)

del kmer_model
tf.keras.backend.clear_session()

kgap_model = tf.keras.models.load_model(kgap_full_model_path)
kgap_train_y_pred = kgap_model.predict(kgap_train_features)
kgap_indpe_y_pred = kgap_model.predict(kgap_indpe_features)

del kgap_model
tf.keras.backend.clear_session()

##################################################################################
##### Linear regression using the 3 scores
##################################################################################

print("Training linear regression.")

X_lr_train = np.concatenate((ohe_train_y_pred, kmer_train_y_pred, kgap_train_y_pred), axis=1)
X_lr_indpe = np.concatenate((ohe_indpe_y_pred, kmer_indpe_y_pred, kgap_indpe_y_pred), axis=1)

lr_model = LogisticRegression(class_weight={0:1, 1:0.5})
lr_model.fit(X_lr_train, labels.reshape((labels.shape[0])))

lr_full_model_path = os.path.join(modelPath, "LR_fullModel.hdf5".format(i))
lr_model_file_obj = open(lr_full_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

Generating the 3 scores.
Training linear regression.


In [41]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

y_pred = lr_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, label_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.669388,0.267281,0.630137,0.571429,0.688845,0.202335


In [30]:
lr_model.coef_

array([[4.2630363 , 6.22372479, 0.41755114]])