In [1]:
import shutil
try:
  shutil.rmtree('Results_Colab')
except:
  pass

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install -q -U keras-tuner

# Import Statements

In [4]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score

import keras_tuner as kt

import math

# Define Parameters

In [5]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

expName = "PSI_Site_DLNN_MergedSeq_2DOP_autoencoder_kerasTuner"

input_data_folder = "/content/drive/MyDrive/_Uni/5. Thesis/Thesis_Work/Psi_Site_Data/Aziz"
drive_out_path = "/content/drive/MyDrive/_Uni/5. Thesis/Thesis_Work/Results_Colab"
# outPath = 'Results_Colab'
outPath = "/content/drive/MyDrive/_Uni/5. Thesis/Thesis_Work/Results_Colab"

output_data_folder = os.path.join(outPath, expName)

p_epoch = 30
p_factor = 2
# p_objective = 'val_sequential_loss'
objective = "val_sequential_loss"
p_objective = kt.Objective(objective, direction="min")
p_hyperband_iterations = 3

p_max_trials = 100
p_num_initial_points = 25

p_batch_size = 32
p_final_epochs = 30

n_fold = 10
foldName = "folds.pickle"

epochs = 100
batch_size = 32
shuffle = True
seed = None

# Utility functions

In [6]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_dna(sequence):
    seq_encoded = np.zeros((len(sequence),4))
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "T": 3
    }
    i = 0
    for single_character in sequence:
        if(single_character.upper() in dict_nuc.keys()):
            seq_encoded[i][dict_nuc[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in DNA sequence: '+sequence)
    return seq_encoded

def one_hot_encode_rna(sequence):
    seq_encoded = np.zeros((len(sequence),4))
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "U": 3
    }
    i = 0
    for single_character in sequence:
        if(single_character.upper() in dict_nuc.keys()):
            seq_encoded[i][dict_nuc[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in RNA sequence: '+sequence)
    return seq_encoded

def one_hot_encode_rnafold(sequence):
    seq_encoded = np.zeros((len(sequence),3))
    dict_fold = {
        "(": 0,
        ")": 1,
        ".": 2
    }
    i = 0
    for single_character in sequence:
        if(single_character in dict_fold.keys()):
            seq_encoded[i][dict_fold[single_character]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in RNAfold: '+sequence)
    return seq_encoded

def one_hot_encode_rna_mergedseq(sequence):
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "U": 3
    }
    dict_fold = {
        "(": 0,
        ")": 1,
        ".": 2
    }
    list_seq = sequence.strip().split(' ')
    seq_encoded = np.zeros((len(list_seq),12))
    i = 0
    for single_character in list_seq:
        if(single_character[0].upper() in dict_nuc.keys()):
            idx1 = dict_nuc[single_character[0].upper()]+1
        else:
            raise ValueError('Incorrect RNA character in MergedSeq sequence: '+sequence)
        if(single_character[1] in dict_fold.keys()):
            idx2 = dict_fold[single_character[1]]+1
        else:
            raise ValueError('Incorrect RNAfold character in MergedSeq sequence: '+sequence)
        idx = (idx1 * idx2) - 1
        seq_encoded[i][idx] = 1
        i = i+1        
    return seq_encoded

In [7]:
##################################################################################
##### define evaluator functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

def build_kfold_multifeature(features_1, features_2, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features_1, labels):
        X1_train, X1_test = features_1[train_index], features_1[test_index]
        X2_train, X2_test = features_2[train_index], features_2[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X1_train": X1_train,
            "X1_test": X1_test,
            "X2_train": X2_train,
            "X2_test": X2_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

def pred2label(y_pred):
    y_pred = np.round(y_pred).astype(int)
    return y_pred

def label_scalar_to_vector(label):
    label_2d = np.zeros((label.shape[0], 2))
    for i in range(label.shape[0]):
        label_2d[i][int(label[i][0])] = 1
    return label_2d

def label_vector_to_scalar(label_2d_arr):
    labels = (label_2d_arr[:,1] > label_2d_arr[:,0]).astype(int)
    return labels

In [8]:
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

def save_plot2(H, path, loss_name):
    # plot the training loss and accuracy
    plt.style.use("ggplot")

    if(not os.path.isdir(path)):
        os.makedirs(path)

    plt.figure()
    plt.plot(H.history[loss_name+"_loss"], label="train_loss")
    plt.plot(H.history["val_"+loss_name+"_loss"], label="val_loss")
    plt.title("Training Loss")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(os.path.join(path, 'loss.png'))

    plt.figure()
    plt.plot(H.history[loss_name+"_accuracy"], label="train_acc")
    plt.plot(H.history["val_"+loss_name+"_accuracy"], label="val_acc")
    plt.title("Training Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.savefig(os.path.join(path, 'accuracy.png'))

def save_plot(H, path):
    # plot the training loss and accuracy
    plt.style.use("ggplot")

    if(not os.path.isdir(path)):
        os.makedirs(path)

    seq_model_no = [s for s in list(H.history.keys()) if "sequential" in s][0].split('_')[1]

    plt.figure()
    plt.plot(H.history['sequential_{}_loss'.format(seq_model_no)], label="train_loss")
    plt.plot(H.history['val_sequential_{}_loss'.format(seq_model_no)], label="val_loss")
    plt.title("Training Loss")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(os.path.join(path, 'loss.png'))

    plt.figure()
    plt.plot(H.history['sequential_{}_accuracy'.format(seq_model_no)], label="train_acc")
    plt.plot(H.history['val_sequential_{}_accuracy'.format(seq_model_no)], label="val_acc")
    plt.title("Training Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.savefig(os.path.join(path, 'accuracy.png'))

# Models

In [9]:
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def model_builder_21(hp):
    
    hp_beta = hp.Choice('beta', values=[0.01, 0.001, 0.0001])
    
    hp_conv_activation = hp.Choice('conv_activation', values=['relu', 'selu', 'tanh'])
    hp_conv_filters_per_layer = hp.Int('conv_filters_per_layer', min_value=5, max_value=25, step=5)
    hp_conv_kernel_length_1 = hp.Choice('max_kernel_length_1', values=[3, 5, 7, 9])
    hp_conv_kernel_length_2 = hp.Choice('max_kernel_length_2', values=[3, 5, 7, 9])
    # hp_conv_stride = hp.Choice('conv_stride', values=[1, 2, 3])
    hp_conv_stride = 1

    hp_latent_dim = hp.Choice('count_conv_layers', values=[10, 20, 30])
    
    hp_count_dense_layers = hp.Choice('count_dense_layers', values=[1, 2, 3])
    hp_dense_units = hp.Int('dense_units_1', min_value=100, max_value=500, step=100)
    hp_dropout_prob = hp.Choice('dropout_prob', values=[0.1, 0.3, 0.5])
    
    hp_learning_rate = hp.Choice('learning_rate', values=[0.01, 0.001, 0.0001, 0.00001])
    hp_opt_func = hp.Choice('opt_func', values=['adam', 'adagrad', 'rmsprop'])
    
    input_shape = (21, 12)
    
    ae_input = tf.keras.layers.Input(shape=input_shape)

    ###########################################################################
    ##### Encoder
    ###########################################################################

    xe = tf.keras.layers.Conv1D(hp_conv_filters_per_layer, hp_conv_kernel_length_1,
                                strides = hp_conv_stride,
                                # kernel_regularizer=tf.keras.regularizers.l2(hp_beta),
                                # activation=hp_conv_activation
                               )(ae_input)

    xe = tf.keras.layers.Conv1D(hp_conv_filters_per_layer, hp_conv_kernel_length_2,
                                strides = hp_conv_stride,
                                # kernel_regularizer=tf.keras.regularizers.l2(hp_beta),
                                # activation=hp_conv_activation
                               )(xe)
    
    xe = tf.keras.layers.Flatten()(xe)
    xe = tf.keras.layers.Dense(hp_latent_dim)(xe)

    encoder = tf.keras.models.Model(inputs=ae_input, outputs=xe)
    
    ###########################################################################
    ##### Decoder
    ###########################################################################
    
    dec_input = tf.keras.layers.Input(shape=(hp_latent_dim,))
    
    xd = tf.keras.layers.RepeatVector(input_shape[0]-hp_conv_kernel_length_1-hp_conv_kernel_length_2+2)(dec_input)

    xd = tf.keras.layers.Conv1DTranspose(hp_conv_filters_per_layer, hp_conv_kernel_length_2,
                                         strides = hp_conv_stride,
                                        #  kernel_regularizer=tf.keras.regularizers.l2(hp_beta),
                                         # activation='relu'
                                        )(xd)

    xd = tf.keras.layers.Conv1DTranspose(4, hp_conv_kernel_length_1,
                                         strides = hp_conv_stride,
                                        #  kernel_regularizer=tf.keras.regularizers.l2(hp_beta)
                                        )(xd)

    xd = tf.keras.layers.Activation('softmax')(xd)

    decoder = tf.keras.models.Model(inputs=dec_input, outputs=xd)
    
    ###########################################################################
    ##### Classifier
    ###########################################################################
    
    classifier = tf.keras.models.Sequential()

    classifier.add(tf.keras.layers.Input(shape=(hp_latent_dim,)))

    for i in range(hp_count_dense_layers):
        classifier.add(tf.keras.layers.Dense(hp_dense_units/(i+1),
                                             kernel_regularizer=tf.keras.regularizers.l2(hp_beta)))
        classifier.add(tf.keras.layers.BatchNormalization())
        # classifier.add(tf.keras.layers.Activation(hp_dense_activation))
        classifier.add(tf.keras.layers.Dropout(hp_dropout_prob))

    classifier.add(tf.keras.layers.Dense(2, activation='softmax'))

    #########################
    ##### Generate Model from input and output
    #########################
    
    autoencoder = tf.keras.models.Model(ae_input, [decoder(encoder(ae_input)), classifier(encoder(ae_input))])

    if hp_opt_func == 'adam':
        optimizer_function = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate, epsilon = 0.01)
    elif hp_opt_func == 'adagrad':
        optimizer_function = tf.keras.optimizers.Adagrad(learning_rate=hp_learning_rate)
    elif hp_opt_func == 'rmsprop':
        optimizer_function = tf.keras.optimizers.RMSprop(learning_rate=hp_learning_rate)

    autoencoder.compile(optimizer=optimizer_function, 
                        loss={autoencoder.output[0].name.split('/')[0]: 'categorical_crossentropy', 
                              autoencoder.output[1].name.split('/')[0]: 'categorical_crossentropy'}, 
                        metrics='accuracy')
    
    return autoencoder

In [10]:
# ##################################################################################
# ##### Function to customize the DLNN architecture with parameters
# ##################################################################################

# def model_builder_test():
    
#     hp_beta = 0.01
    
#     hp_conv_activation = 'relu'
#     hp_conv_filters_per_layer = 5
#     hp_conv_kernel_length_1 = 5
#     hp_conv_kernel_length_2 = 5
#     hp_conv_stride = 1

#     hp_latent_dim = 10
    
#     hp_count_dense_layers = 3
#     hp_dense_units = 100
#     hp_dropout_prob = 0.1, 0.3, 0.5
    
#     hp_learning_rate = 0.01
#     hp_opt_func = 'adam'
    
#     input_shape = (21, 12)
    
#     ae_input = tf.keras.layers.Input(shape=input_shape)

#     ###########################################################################
#     ##### Encoder
#     ###########################################################################

#     xe = tf.keras.layers.Conv1D(hp_conv_filters_per_layer, hp_conv_kernel_length_1,
#                                 strides = hp_conv_stride,
#                                 kernel_regularizer=tf.keras.regularizers.l2(hp_beta),
#                                 activation=hp_conv_activation
#                                )(ae_input)

#     xe = tf.keras.layers.Conv1D(hp_conv_filters_per_layer, hp_conv_kernel_length_2,
#                                 strides = hp_conv_stride,
#                                 kernel_regularizer=tf.keras.regularizers.l2(hp_beta),
#                                 activation=hp_conv_activation
#                                )(xe)
    
#     xe = tf.keras.layers.Flatten()(xe)
#     xe = tf.keras.layers.Dense(hp_latent_dim)(xe)

#     encoder = tf.keras.models.Model(inputs=ae_input, outputs=xe)
    
#     ###########################################################################
#     ##### Decoder
#     ###########################################################################
    
#     dec_input = tf.keras.layers.Input(shape=(hp_latent_dim,))
    
#     xd = tf.keras.layers.RepeatVector(input_shape[0]-hp_conv_kernel_length_1-hp_conv_kernel_length_2+2)(dec_input)

#     xd = tf.keras.layers.Conv1DTranspose(hp_conv_filters_per_layer, hp_conv_kernel_length_2,
#                                          strides = hp_conv_stride,
#                                          kernel_regularizer=tf.keras.regularizers.l2(hp_beta),
#                                          # activation='relu'
#                                         )(xd)

#     xd = tf.keras.layers.Conv1DTranspose(4, hp_conv_kernel_length_1,
#                                          strides = hp_conv_stride,
#                                          kernel_regularizer=tf.keras.regularizers.l2(hp_beta)
#                                         )(xd)

#     xd = tf.keras.layers.Activation('softmax')(xd)

#     decoder = tf.keras.models.Model(inputs=dec_input, outputs=xd)
    
#     ###########################################################################
#     ##### Classifier
#     ###########################################################################
    
#     classifier = tf.keras.models.Sequential()

#     classifier.add(tf.keras.layers.Input(shape=(hp_latent_dim,)))

#     for i in range(hp_count_dense_layers):
#         classifier.add(tf.keras.layers.Dense(hp_dense_units/(i+1),
#                                              kernel_regularizer=tf.keras.regularizers.l2(hp_beta)))
#         classifier.add(tf.keras.layers.BatchNormalization())
#         # classifier.add(tf.keras.layers.Activation(hp_dense_activation))
#         classifier.add(tf.keras.layers.Dropout(hp_dropout_prob))

#     classifier.add(tf.keras.layers.Dense(2, activation='softmax'))

#     #########################
#     ##### Generate Model from input and output
#     #########################
    
#     autoencoder = tf.keras.models.Model(ae_input, [decoder(encoder(ae_input)), classifier(encoder(ae_input))])

#     if hp_opt_func == 'adam':
#         optimizer_function = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate, epsilon = 0.01)
#     elif hp_opt_func == 'adagrad':
#         optimizer_function = tf.keras.optimizers.Adagrad(learning_rate=hp_learning_rate)
#     elif hp_opt_func == 'rmsprop':
#         optimizer_function = tf.keras.optimizers.RMSprop(learning_rate=hp_learning_rate)

#     autoencoder.compile(optimizer=optimizer_function, 
#                         loss={autoencoder.output[0].name.split('/')[0]: 'categorical_crossentropy', 
#                               autoencoder.output[1].name.split('/')[0]: 'categorical_crossentropy'}, 
#                         metrics='accuracy')
    
#     return autoencoder

In [11]:
# tf.keras.utils.plot_model(model_builder_test())

# HS_990

## Hyperparameter optimization

In [12]:
###########################################################################
##### Prepare dataset
###########################################################################

file = 'HS_990.csv'
input_data_file = os.path.join(input_data_folder, file)

csv_data = pd.read_csv(input_data_file)

csv_data["OHE"] = pd.Series([one_hot_encode_rna(val) for val in csv_data["Sequence"]])
csv_data["OHE_MergedSeq"] = pd.Series([one_hot_encode_rna_mergedseq(val) for val in csv_data["MergedSeq"]])

df_positive = csv_data[csv_data['Number'].str.contains("P")]
df_negative = csv_data[csv_data['Number'].str.contains("N")]

positive_ohe_mergedseq = np.array(list(df_positive['OHE_MergedSeq']))
negative_ohe_mergedseq = np.array(list(df_negative['OHE_MergedSeq']))

positive_ohe_seq = np.array(list(df_positive['OHE']))
negative_ohe_seq = np.array(list(df_negative['OHE']))

print("\n======================================================================")
print("\nFile:", input_data_file)
print("Positive:", positive_ohe_mergedseq.shape[0])
print("Negative:", negative_ohe_mergedseq.shape[0])

## create the features and labels datasets for the training
input_size = positive_ohe_mergedseq[0].shape
output_size = positive_ohe_seq[0].shape

labels = np.concatenate((np.ones((df_positive.shape[0], 1), 
                                  dtype=np.float32), 
                          np.zeros((df_negative.shape[0], 1), 
                                  dtype=np.float32)), 
                        axis=0)

features_mergedseq = np.concatenate((positive_ohe_mergedseq, 
                                      negative_ohe_mergedseq), 
                                    axis=0)

features_seq = np.concatenate((positive_ohe_seq,
                               negative_ohe_seq),
                              axis=0)

labels_2d = label_scalar_to_vector(labels)

# shuffling data
index_arr = np.arange(labels.shape[0])
index_arr = np.random.permutation(index_arr)

labels = labels[index_arr]
labels_2d = labels_2d[index_arr]
features_mergedseq = features_mergedseq[index_arr]



File: /content/drive/MyDrive/_Uni/5. Thesis/Thesis_Work/Psi_Site_Data/Aziz/HS_990.csv
Positive: 495
Negative: 495


In [None]:
hs_tuner = kt.Hyperband(hypermodel =            model_builder_21,
                        objective =             p_objective,
                        max_epochs =            p_epoch,
                        factor =                p_factor,
                        hyperband_iterations =  p_hyperband_iterations,
                        directory =             output_data_folder, 
                        project_name =          'HS_990_hpo_1')

hs_stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

hs_tuner.search(features_mergedseq, [features_seq, labels_2d], epochs=p_epoch, validation_split=0.2, callbacks=[hs_stop_early])

Trial 271 Complete [00h 00m 09s]
val_sequential_loss: 0.7492715120315552

Best val_sequential_loss So Far: 0.6496452689170837
Total elapsed time: 00h 36m 33s

Search: Running Trial #272

Hyperparameter    |Value             |Best Value So Far 
beta              |0.001             |0.01              
conv_activation   |tanh              |relu              
conv_filters_pe...|10                |5                 
max_kernel_leng...|5                 |9                 
max_kernel_leng...|3                 |7                 
count_conv_layers |30                |10                
count_dense_layers|2                 |1                 
dense_units_1     |100               |500               
dropout_prob      |0.1               |0.1               
learning_rate     |0.01              |0.0001            
opt_func          |adagrad           |rmsprop           
tuner/epochs      |30                |30                
tuner/initial_e...|0                 |0                 
tuner/bracket  

In [None]:
# hs_tuner = kt.BayesianOptimization(
#     hypermodel =          model_builder_21,
#     objective =           p_objective,
#     max_trials =          p_max_trials,
#     num_initial_points =  p_num_initial_points,
#     directory =           output_data_folder,
#     project_name =        'HS_990_hpo_1'
# )

# hs_stop_early = tf.keras.callbacks.EarlyStopping(monitor=objective, patience=10)

# hs_tuner.search(features_mergedseq, [features_seq, labels_2d], epochs=p_epoch, validation_split=0.2, callbacks=[hs_stop_early])

In [None]:
hs_best_hps = hs_tuner.get_best_hyperparameters()[0]

print('beta:', hs_best_hps.get('beta'))
print('max_kernel_length_1:', hs_best_hps.get('max_kernel_length_1'))
print('max_kernel_length_2:', hs_best_hps.get('max_kernel_length_2'))
print('learning_rate:', hs_best_hps.get('learning_rate'))
print('conv_activation:', hs_best_hps.get('conv_activation'))
print('count_conv_layers:', hs_best_hps.get('count_conv_layers'))
print('conv_filters_per_layer:', hs_best_hps.get('conv_filters_per_layer'))
print('count_dense_layers:', hs_best_hps.get('count_dense_layers'))
print('dense_units_1:', hs_best_hps.get('dense_units_1'))
print('dropout_prob:', hs_best_hps.get('dropout_prob'))
print('opt_func:', hs_best_hps.get('opt_func'))

In [None]:
# model = hs_tuner.hypermodel.build(hs_best_hps)

# model_path = os.path.join(output_data_folder, 'HS_990', 'best_model_full_train.h5')

# modelCallbacks = [
#     tf.keras.callbacks.ModelCheckpoint(model_path,
#                                        monitor = 'val_sequential_loss', verbose = 1, save_best_only = True,
#                                        save_weights_only = False, mode = 'auto', save_freq = 'epoch'
#                                       )
# ]

# H = model.fit(x=features_mergedseq, y=[features_seq, labels],
#               validation_split=0.2, batch_size=p_batch_size,
#               epochs=p_final_epochs, callbacks=modelCallbacks, verbose=1)

# save_plot(H, os.path.join(drive_out_path, expName, 'HS_990', 'Full_Retrain'))

# model.evaluate(features_mergedseq, [features_seq, labels])

In [None]:
# tuner_best_models = hs_tuner.get_best_models(num_models=1)

# tuner_best_model = tuner_best_models[0]

# model_path = os.path.join(output_data_folder, 'HS_990', 'best_model_transfer_learning.h5')

# modelCallbacks = [
#     tf.keras.callbacks.ModelCheckpoint(model_path,
#                                        monitor = 'val_sequential_loss', verbose = 1, save_best_only = True,
#                                        save_weights_only = False, mode = 'auto', save_freq = 'epoch'
#                                       )
# ]

# H = tuner_best_model.fit(x=features_mergedseq, y=[features_seq, labels],
#               validation_split=0.2, batch_size=p_batch_size,
#               epochs=p_epoch, callbacks=modelCallbacks, verbose=1)

# save_plot2(H, os.path.join(drive_out_path, expName, 'HS_990', 'Transfer_Learning'))

# tuner_best_model.evaluate(features_mergedseq, [features_seq, labels])

## Execute on folds

In [None]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Model" : [],
    # "Kernel_Length" : [],
    "Dataset" : [],
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}
        
input_data_file = os.path.join(input_data_folder, file)

current_dataset_variety = input_data_file.split("/")[-1].split(".")[0]

csv_data = pd.read_csv(input_data_file)

##################################################################################
##### extract data from the current CSV file
##################################################################################

csv_data["OHE"] = pd.Series([one_hot_encode_rna(val) for val in csv_data["Sequence"]])
csv_data["OHE_MergedSeq"] = pd.Series([one_hot_encode_rna_mergedseq(val) for val in csv_data["MergedSeq"]])

df_positive = csv_data[csv_data['Number'].str.contains("P")]
df_negative = csv_data[csv_data['Number'].str.contains("N")]

positive_ohe_mergedseq = np.array(list(df_positive['OHE_MergedSeq']))
negative_ohe_mergedseq = np.array(list(df_negative['OHE_MergedSeq']))

positive_ohe_seq = np.array(list(df_positive['OHE']))
negative_ohe_seq = np.array(list(df_negative['OHE']))

print("\n======================================================================")
print("\nFile:", input_data_file)
print("Positive:", positive_ohe_mergedseq.shape[0])
print("Negative:", negative_ohe_mergedseq.shape[0])

##################################################################################
##### Generate Folds from dataset, and store to file
##################################################################################

## create the features and labels datasets for the training
input_size = positive_ohe_mergedseq[0].shape
output_size = positive_ohe_seq[0].shape

labels = np.concatenate((np.ones((df_positive.shape[0], 1), 
                                  dtype=np.float32), 
                          np.zeros((df_negative.shape[0], 1), 
                                  dtype=np.float32)), 
                        axis=0)
features_mergedseq = np.concatenate((positive_ohe_mergedseq, 
                                      negative_ohe_mergedseq), 
                                    axis=0)

features_seq = np.concatenate((positive_ohe_seq, 
                                negative_ohe_seq), 
                              axis=0)

folds = build_kfold_multifeature(features_mergedseq, features_seq, labels, 
                                  k=n_fold, shuffle=shuffle, seed=seed)

##### 2D label generating for each fold
for j in range(0, len(folds)):
    folds[j]["y_train_2d"] = label_scalar_to_vector(folds[j]["y_train"])
    folds[j]["y_test_2d"] = label_scalar_to_vector(folds[j]["y_test"])

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

##################################################################################
##### TRAIN and PREDICT for every Fold, using random generated models using best hyperparameters
##################################################################################

# fold counter
i = 0

for fold in folds:

    print("\nTrain/Test model "+current_dataset_variety+" on Fold #"+str(i)+".")

    # model, encoder, decoder, classifier, loss_names = DLNN_AutoEncoder_Classifier(input_shape=input_size,
                                                                                  # learn_rate = 0.01)

    # adding random shuffling of the dataset for training purpose
    index_arr = np.arange(fold["X1_train"].shape[0])
    index_arr = np.random.permutation(index_arr)

    model = hs_tuner.hypermodel.build(hs_best_hps)

    loss_name = [s for s in list(model.loss.keys()) if 'sequential' in s][0]

    current_model_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))
    modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                           monitor = "val_"+loss_name+"_loss", verbose = 1, save_best_only = True,
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'
                                          )
    ]
    
    H = model.fit(
        x = fold["X1_train"][index_arr], y = [fold["X2_train"][index_arr], fold["y_train_2d"][index_arr]],
        batch_size = batch_size, epochs = epochs,
        validation_split=0.2,
        # validation_data = (fold["X1_test"], [fold["X2_test"], fold["y_test_2d"]]),
        verbose = 1,
        callbacks = modelCallbacks
    )
    
    plot_path = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold), "models", "{}_bestModel-fold{}_charts".format(current_dataset_variety, i))
    save_plot2(H, plot_path, loss_name)

    model = tf.keras.models.load_model(current_model_path)

    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################

    y_pred_seq, y_pred_2d = model.predict(fold["X1_train"])
    label_pred_2d = pred2label(y_pred_2d)
    
    y_pred = y_pred_2d[:,1].reshape((y_pred_2d.shape[0], 1))

    label_pred = label_vector_to_scalar(label_pred_2d)
    label_actual = fold["y_train"]

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(label_actual, label_pred)
    prec = precision_score(label_actual,label_pred)

    conf = confusion_matrix(label_actual, label_pred)
    if(conf[0][0]+conf[1][0]):
        sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
    else:
        sens = 0.0
    if(conf[1][1]+conf[0][1]):
        spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
    else:
        spec = 0.0
    if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
        mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
    else:
        mcc= 0.0
    fpr, tpr, thresholds = roc_curve(label_actual, y_pred)
    auc = roc_auc_score(label_actual, y_pred)

    evaluations["Model"].append(current_dataset_variety)
    # evaluations["Kernel_Length"].append(kernel_length)
    evaluations["Dataset"].append(current_dataset_variety)
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################

    y_pred_seq, y_pred_2d = model.predict(fold["X1_test"])
    label_pred_2d = pred2label(y_pred_2d)

    y_pred = y_pred_2d[:,1].reshape((y_pred_2d.shape[0], 1))

    label_pred = label_vector_to_scalar(label_pred_2d)
    label_actual = fold["y_test"]

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(label_actual, label_pred)
    prec = precision_score(label_actual,label_pred)

    conf = confusion_matrix(label_actual, label_pred)
    if(conf[0][0]+conf[1][0]):
        sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
    else:
        sens = 0.0
    if(conf[1][1]+conf[0][1]):
        spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
    else:
        spec = 0.0
    if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
        mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
    else:
        mcc= 0.0
    fpr, tpr, thresholds = roc_curve(label_actual, y_pred)
    auc = roc_auc_score(label_actual, y_pred)

    evaluations["Model"].append(current_dataset_variety)
    # evaluations["Kernel_Length"].append(kernel_length)
    evaluations["Dataset"].append(current_dataset_variety)
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

    i = i+1
    del model
    tf.keras.backend.clear_session()

##################################################################################
##### Dump evaluations to a file
##################################################################################

evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
if(not os.path.isdir(evalPath)):
    os.makedirs(evalPath)

pickle.dump(evaluations,
            open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "wb"))

In [None]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

##################################################################################
##### Group dataset (mean of metrics) by [Dataset, Model, Train_Test] combinations
##################################################################################

evaluations_df_grouped = evaluations_df.groupby(["Dataset", 
                                                 "Model", 
                                                 "Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

# DLNN_3 = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(1), ['DLNN_3'])]
# DLNN_5 = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(1), ['DLNN_5'])]

# DLNN_3_Train = DLNN_3[np.in1d(DLNN_3.index.get_level_values(2), ['Train'])]
# DLNN_3_Test = DLNN_3[np.in1d(DLNN_3.index.get_level_values(2), ['Test'])]

# DLNN_5_Train = DLNN_5[np.in1d(DLNN_5.index.get_level_values(2), ['Train'])]
# DLNN_5_Test = DLNN_5[np.in1d(DLNN_5.index.get_level_values(2), ['Test'])]

## Fold Results

In [None]:
evaluations_df[evaluations_df['Train_Test'] == 'Test']

In [None]:
evaluations_df_grouped

# MM_944

In [None]:
# ###########################################################################
# ##### Prepare dataset
# ###########################################################################

# file = 'MM_944.csv'
# input_data_file = os.path.join(input_data_folder, file)

# csv_data = pd.read_csv(input_data_file)

# csv_data["OHE"] = pd.Series([one_hot_encode_rna(val) for val in csv_data["Sequence"]])
# csv_data["OHE_MergedSeq"] = pd.Series([one_hot_encode_rna_mergedseq(val) for val in csv_data["MergedSeq"]])

# df_positive = csv_data[csv_data['Number'].str.contains("P")]
# df_negative = csv_data[csv_data['Number'].str.contains("N")]

# positive_ohe_mergedseq = np.array(list(df_positive['OHE_MergedSeq']))
# negative_ohe_mergedseq = np.array(list(df_negative['OHE_MergedSeq']))

# positive_ohe_seq = np.array(list(df_positive['OHE']))
# negative_ohe_seq = np.array(list(df_negative['OHE']))

# print("\n======================================================================")
# print("\nFile:", input_data_file)
# print("Positive:", positive_ohe_mergedseq.shape[0])
# print("Negative:", negative_ohe_mergedseq.shape[0])

# ## create the features and labels datasets for the training
# input_size = positive_ohe_mergedseq[0].shape
# output_size = positive_ohe_seq[0].shape

# labels = np.concatenate((np.ones((df_positive.shape[0], 1), 
#                                   dtype=np.float32), 
#                           np.zeros((df_negative.shape[0], 1), 
#                                   dtype=np.float32)), 
#                         axis=0)

# features_mergedseq = np.concatenate((positive_ohe_mergedseq, 
#                                       negative_ohe_mergedseq), 
#                                     axis=0)

# # shuffling data
# index_arr = np.arange(labels.shape[0])
# index_arr = np.random.permutation(index_arr)

# labels = labels[index_arr]
# features_mergedseq = features_mergedseq[index_arr]

In [None]:
# mm_tuner = kt.Hyperband(hypermodel =            model_builder_21,
#                         objective =             p_objective,
#                         max_epochs =            p_epoch,
#                         factor =                p_factor,
#                         hyperband_iterations =  p_hyperband_iterations,
#                         directory =             output_data_folder,
#                         project_name =          'mm_944_hpo_1')

# mm_stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# mm_tuner.search(features_mergedseq, labels, epochs=p_epoch, validation_split=0.2, callbacks=[mm_stop_early])

In [None]:
# mm_tuner = kt.BayesianOptimization(
#     hypermodel =          model_builder_21,
#     objective =           p_objective,
#     max_trials =          p_max_trials,
#     num_initial_points =  p_num_initial_points,
#     directory =           output_data_folder,
#     project_name =        'MM_944_hpo_1'
# )

# mm_stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# mm_tuner.search(features_mergedseq, labels, epochs=p_epoch, validation_split=0.2, callbacks=[mm_stop_early])

In [None]:
# mm_best_hps = mm_tuner.get_best_hyperparameters()[0]

# print('beta:', mm_best_hps.get('beta'))
# print('max_kernel_length:', mm_best_hps.get('max_kernel_length'))
# print('kernel_length_step:', mm_best_hps.get('kernel_length_step'))
# print('learning_rate:', mm_best_hps.get('learning_rate'))
# print('conv_activation:', mm_best_hps.get('conv_activation'))
# print('count_conv_layers:', mm_best_hps.get('count_conv_layers'))
# print('conv_filters_per_layer:', mm_best_hps.get('conv_filters_per_layer'))
# print('count_dense_layers:', mm_best_hps.get('count_dense_layers'))
# print('dense_units:', mm_best_hps.get('dense_units_1'))
# # print('dense_activation:', mm_best_hps.get('dense_activation'))
# print('dropout_prob:', mm_best_hps.get('dropout_prob'))
# print('opt_func:', mm_best_hps.get('opt_func'))

In [None]:
# model = mm_tuner.hypermodel.build(mm_best_hps)

# model_path = os.path.join(output_data_folder, 'MM_944', 'best_model.h5')

# modelCallbacks = [
#     tf.keras.callbacks.ModelCheckpoint(model_path,
#                                        monitor = 'val_loss', verbose = 1, save_best_only = True,
#                                        save_weights_only = False, mode = 'auto', save_freq = 'epoch'
#                                       )
# ]

# H = model.fit(x=features_mergedseq, y=labels,
#               validation_split=0.2, batch_size=p_batch_size,
#               epochs=p_final_epochs, callbacks=modelCallbacks, verbose=1)

# save_plot(H, os.path.join(output_data_folder, 'MM_944'))

# SN_628

In [None]:
# ###########################################################################
# ##### Prepare dataset
# ###########################################################################

# file = 'SN_628.csv'
# input_data_file = os.path.join(input_data_folder, file)

# csv_data = pd.read_csv(input_data_file)

# csv_data["OHE"] = pd.Series([one_hot_encode_rna(val) for val in csv_data["Sequence"]])
# csv_data["OHE_MergedSeq"] = pd.Series([one_hot_encode_rna_mergedseq(val) for val in csv_data["MergedSeq"]])

# df_positive = csv_data[csv_data['Number'].str.contains("P")]
# df_negative = csv_data[csv_data['Number'].str.contains("N")]

# positive_ohe_mergedseq = np.array(list(df_positive['OHE_MergedSeq']))
# negative_ohe_mergedseq = np.array(list(df_negative['OHE_MergedSeq']))

# positive_ohe_seq = np.array(list(df_positive['OHE']))
# negative_ohe_seq = np.array(list(df_negative['OHE']))

# print("\n======================================================================")
# print("\nFile:", input_data_file)
# print("Positive:", positive_ohe_mergedseq.shape[0])
# print("Negative:", negative_ohe_mergedseq.shape[0])

# ## create the features and labels datasets for the training
# input_size = positive_ohe_mergedseq[0].shape
# output_size = positive_ohe_seq[0].shape

# labels = np.concatenate((np.ones((df_positive.shape[0], 1), 
#                                   dtype=np.float32), 
#                           np.zeros((df_negative.shape[0], 1), 
#                                   dtype=np.float32)), 
#                         axis=0)

# features_mergedseq = np.concatenate((positive_ohe_mergedseq, 
#                                       negative_ohe_mergedseq), 
#                                     axis=0)

# # shuffling data
# index_arr = np.arange(labels.shape[0])
# index_arr = np.random.permutation(index_arr)

# labels = labels[index_arr]
# features_mergedseq = features_mergedseq[index_arr]

In [None]:
# sn_tuner = kt.Hyperband(hypermodel =            model_builder_31,
#                         objective =             p_objective,
#                         max_epochs =            p_epoch,
#                         factor =                p_factor,
#                         hyperband_iterations =  p_hyperband_iterations,
#                         directory =             output_data_folder,
#                         project_name =          'SN_628_hpo_1')

# sn_stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# sn_tuner.search(features_mergedseq, labels, epochs=p_epoch, validation_split=0.2, callbacks=[sn_stop_early])

In [None]:
# sn_tuner = kt.BayesianOptimization(
#     hypermodel =          model_builder_31,
#     objective =           p_objective,
#     max_trials =          p_max_trials,
#     num_initial_points =  p_num_initial_points,
#     directory =           output_data_folder,
#     project_name =        'SN_944_hpo_1'
# )

# sn_stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# sn_tuner.search(features_mergedseq, labels, epochs=p_epoch, validation_split=0.2, callbacks=[sn_stop_early])

In [None]:
# sn_best_hps = sn_tuner.get_best_hyperparameters()[0]

# print('beta:', sn_best_hps.get('beta'))
# print('max_kernel_length:', sn_best_hps.get('max_kernel_length'))
# print('kernel_length_step:', sn_best_hps.get('kernel_length_step'))
# print('learning_rate:', sn_best_hps.get('learning_rate'))
# print('conv_activation:', sn_best_hps.get('conv_activation'))
# print('count_conv_layers:', sn_best_hps.get('count_conv_layers'))
# print('conv_filters_per_layer:', sn_best_hps.get('conv_filters_per_layer'))
# print('count_dense_layers:', sn_best_hps.get('count_dense_layers'))
# print('dense_units:', sn_best_hps.get('dense_units_1'))
# # print('dense_activation:', sn_best_hps.get('dense_activation'))
# print('dropout_prob:', sn_best_hps.get('dropout_prob'))
# print('opt_func:', sn_best_hps.get('opt_func'))

In [None]:
# model = sn_tuner.hypermodel.build(sn_best_hps)

# model_path = os.path.join(output_data_folder, 'SN_628', 'best_model.h5')

# modelCallbacks = [
#     tf.keras.callbacks.ModelCheckpoint(model_path,
#                                        monitor = 'val_loss', verbose = 1, save_best_only = True,
#                                        save_weights_only = False, mode = 'auto', save_freq = 'epoch'
#                                       )
# ]

# H = model.fit(x=features_mergedseq, y=labels,
#               validation_split=0.2, batch_size=p_batch_size,
#               epochs=p_final_epochs, callbacks=modelCallbacks, verbose=1)

# save_plot(H, os.path.join(output_data_folder, 'SN_628'))