In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 10
expName = "PSI_Site_DLNN_CORENup"
outPath = "Results"
foldName = "folds.pickle"

# modelNames = ["DLNN_3", "DLNN_5"]

epochs = 100
batch_size = 32
shuffle = True
seed = None

input_data_folder = "Data\\Psi_Site_Chen"

In [2]:
import os 
from Bio import SeqIO
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score

import math

In [3]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_dna(sequence):
    
    seq_encoded = np.zeros((len(sequence),4))
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "T":3
    }
    i = 0
    
    for single_character in sequence:
        if(single_character.upper() in dict_nuc.keys()):
            seq_encoded[i][dict_nuc[single_character.upper()]] = 1
            i = i+1
        else:
            return []
    
    return seq_encoded

def one_hot_encode_rna(sequence):
    
    seq_encoded = np.zeros((len(sequence),4))
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "U":3
    }
    i = 0
    
    for single_character in sequence:
        if(single_character.upper() in dict_nuc.keys()):
            seq_encoded[i][dict_nuc[single_character.upper()]] = 1
            i = i+1
        else:
            return []
    
    return seq_encoded

In [5]:
##################################################################################
##### define evaluator functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

def pred2label(y_pred):
    y_pred = np.round(y_pred).astype(int)
    return y_pred

In [6]:
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DLNN_CORENup(input_shape = (21,4),
                   conv_filters_per_layer_1 = 50, kernel_length_1 = 5, conv_strides_1 = 1, ## 1st Convolutional layer parameters
                   max_pool_width_1 = 2, max_pool_stride_1 = 2, ## 1st Maxpool layer parameters
                   lstm_decode_units = 50, ## LSTM layer parameters
                   conv_filters_per_layer_2 = 50,  kernel_length_2 = 10, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
                   max_pool_width_2 = 2, max_pool_stride_2 = 2, ## 2nd Maxpool layer parameters
                   dense_decode_units = 370, ## Dense layer parameters
                   prob = 0.5, learn_rate = 0.0003, loss = 'binary_crossentropy', metrics = None):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_shape)

    x1 = tf.keras.layers.Conv1D(conv_filters_per_layer_1, kernel_length_1, input_shape = input_shape, 
                                strides = conv_strides_1, kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                padding = "same"
                               )(input1)
    x1 = tf.keras.layers.Activation('relu')(x1)
    x1 = tf.keras.layers.MaxPool1D(pool_size = max_pool_width_1, strides = max_pool_stride_1)(x1)
    x1 = tf.keras.layers.Dropout(prob)(x1)

    ## LSTM Path

    x2 = tf.keras.layers.LSTM(lstm_decode_units, return_sequences = True, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(x1)
    x2 = tf.keras.layers.Dropout(prob)(x2)
    
    x2 = tf.keras.layers.Flatten()(x2)

    ## Conv Path

    x3 = tf.keras.layers.Conv1D(conv_filters_per_layer_2, kernel_length_2, strides = conv_strides_2, 
                                kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                padding = 'same'
                               )(x1)
    x3 = tf.keras.layers.Activation('relu')(x3)
    x3 = tf.keras.layers.MaxPooling1D(pool_size = max_pool_width_2, strides = max_pool_stride_2)(x3)
    x3 = tf.keras.layers.Dropout(prob)(x3)
    
    x3 = tf.keras.layers.Flatten()(x3)

    ## Fully connected Layers

    y = tf.keras.layers.Concatenate(1)([x2,x3])
    
    y = tf.keras.layers.Dense(dense_decode_units, kernel_regularizer = tf.keras.regularizers.l2(beta), activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, kernel_regularizer = tf.keras.regularizers.l2(beta), activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=[input1], outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(lr=learn_rate), loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(lr=learn_rate), loss = loss)

    return model

In [7]:
DLNN_CORENup().summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 21, 4)]      0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 21, 50)       1050        ['input_1[0][0]']                
                                                                                                  
 activation (Activation)        (None, 21, 50)       0           ['conv1d[0][0]']                 
                                                                                                  
 max_pooling1d (MaxPooling1D)   (None, 10, 50)       0           ['activation[0][0]']             
                                                                                              

  super(Adam, self).__init__(name, **kwargs)


In [8]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Model" : [],
    "Dataset" : [],
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

In [9]:
for root, dirs, files in os.walk(input_data_folder):
    for file in files:
        
        input_data_file = os.path.join(root, file)
        
        current_dataset_variety = input_data_file.split("\\")[-1].split(".")[0]
        
        openFile = open(input_data_file)
        fastaSequences = SeqIO.parse(openFile, "fasta")
        
        ##################################################################################
        ##### extract data from the current fasta file
        ##################################################################################

        positive_List = []
        negative_List = []
        positive_onehotencoded_List = []
        negative_onehotencoded_List = []

        for fasta in fastaSequences: 
            name, sequence = fasta.id, str(fasta.seq)
            if "P" in name:
                positive_List.append(sequence)
                aus_seq = one_hot_encode_rna(sequence)
                if(len(aus_seq) != 0):
                    positive_onehotencoded_List.append(aus_seq)
            elif "N" in name:
                negative_List.append(sequence)
                aus_seq = one_hot_encode_rna(sequence)
                if(len(aus_seq) != 0):
                    negative_onehotencoded_List.append(aus_seq)

        openFile.close()

        print("\n======================================================================")
        print("\nFile: "+os.path.join(root, file))
        print("Positive: "+str(len(positive_onehotencoded_List)))
        print("Negative: "+str(len(negative_onehotencoded_List)))
        
        ##################################################################################
        ##### Generate Folds from dataset, and store to file
        ##################################################################################

        ## create the features and labels datasets for the training
        input_size = (len(positive_onehotencoded_List[1]), 4)
        labels = np.concatenate((np.ones((len(positive_onehotencoded_List), 1), dtype=np.float32), np.zeros((len(negative_onehotencoded_List), 1), dtype=np.float32)), axis=0)
        features = np.concatenate((positive_onehotencoded_List,negative_onehotencoded_List), 0)

        ## Generate the k-fold dataset
        folds = build_kfold(features, labels, k=n_fold, shuffle=shuffle, seed=seed)

        ## Write the k-fold dataset to file
        foldPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold))
        if(not os.path.isdir(foldPath)):
            os.makedirs(foldPath)
        pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

        ## Create and set directory to save model
        modelPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold), "models")
        if(not os.path.isdir(modelPath)):
            os.makedirs(modelPath)
            
        ##################################################################################
        ##### TRAIN and PREDICT for every Fold, using models
        ##################################################################################

        # fold counter
        i = 0

        for fold in folds:
            
            # adding random shuffling of the dataset for training purpose
            randomized_index_arr = np.arange(fold["X_train"].shape[0])
            randomized_index_arr = np.random.permutation(randomized_index_arr)

            print("\nTrain/Test model "+current_dataset_variety+" on Fold #"+str(i)+".")

            ## Generate model using function
            model = DLNN_CORENup(input_shape = input_size)
    
            model_file_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))
            ## Define the model callbacks for early stopping and saving the model. Then train model
            modelCallbacks = [
                tf.keras.callbacks.ModelCheckpoint(model_file_path,
                                                   monitor = 'val_accuracy', verbose = 1, save_best_only = True, 
                                                   save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
#                 tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 10, verbose = 0, 
#                                                  mode = 'auto', baseline = None, restore_best_weights = True)
            ]
            model.fit(x = fold["X_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
                      validation_data = (fold["X_test"], fold["y_test"]))
            
            model = tf.keras.models.load_model(model_file_path)
            
            ##################################################################################
            ##### Prediction and metrics for TRAIN dataset
            ##################################################################################

            y_pred = model.predict(fold["X_train"])
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(fold["y_train"], label_pred)
            prec = precision_score(fold["y_train"],label_pred)

            conf = confusion_matrix(fold["y_train"], label_pred)
            if(conf[0][0]+conf[1][0]):
                sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
            else:
                sens = 0.0
            if(conf[1][1]+conf[0][1]):
                spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
            else:
                spec = 0.0
            if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
                mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
            else:
                mcc= 0.0
            fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
            auc = roc_auc_score(fold["y_train"], y_pred)

            evaluations["Model"].append(current_dataset_variety)
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Train_Test"].append("Train")
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            ##################################################################################
            ##### Prediction and metrics for TEST dataset
            ##################################################################################

            y_pred = model.predict(fold["X_test"])
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(fold["y_test"], label_pred)
            prec = precision_score(fold["y_test"],label_pred)

            conf = confusion_matrix(fold["y_test"], label_pred)
            if(conf[0][0]+conf[1][0]):
                sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
            else:
                sens = 0.0
            if(conf[1][1]+conf[0][1]):
                spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
            else:
                spec = 0.0
            if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
                mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
            else:
                mcc= 0.0
            fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
            auc = roc_auc_score(fold["y_test"], y_pred)

            evaluations["Model"].append(current_dataset_variety)
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Train_Test"].append("Test")
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            i = i+1
            del model
            tf.keras.backend.clear_session()

        ##################################################################################
        ##### Dump evaluations to a file
        ##################################################################################

        evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
        if(not os.path.isdir(evalPath)):
            os.makedirs(evalPath)

        pickle.dump(evaluations,
                    open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "wb"))



File: Data\Psi_Site_Chen\HS_990.txt
Positive: 495
Negative: 495

Train/Test model HS_990 on Fold #0.
Epoch 1/100


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100


Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

Train/Test model HS_990 on Fold #1.
Epoch 1/100


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100


Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

Train/Test model HS_990 on Fold #2.
Epoch 1/100


  super(Adam, self).__init__(name, **kwargs)





KeyboardInterrupt



## Visualization of Evaluation

In [None]:
##################################################################################
##### Add import statement here, to make this next part of code standalone executable
##################################################################################

import os
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
import numpy as np
import pandas as pd


In [None]:
##################################################################################
##### Load file and convert to dataframe for easy manipulation
##################################################################################

# evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
# if(not os.path.isdir(evalPath)):
#     os.makedirs(evalPath)

# evaluations = pickle.load(open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "rb"))

evaluations_df = pd.DataFrame.from_dict(evaluations)

In [None]:
##################################################################################
##### Group dataset (mean of metrics) by [Dataset, Model, Train_Test] combinations
##################################################################################

evaluations_df_grouped = evaluations_df.groupby(["Dataset", 
                                                 "Model", 
                                                 "Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

# DLNN_3 = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(1), ['DLNN_3'])]
# DLNN_5 = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(1), ['DLNN_5'])]

# DLNN_3_Train = DLNN_3[np.in1d(DLNN_3.index.get_level_values(2), ['Train'])]
# DLNN_3_Test = DLNN_3[np.in1d(DLNN_3.index.get_level_values(2), ['Test'])]

# DLNN_5_Train = DLNN_5[np.in1d(DLNN_5.index.get_level_values(2), ['Train'])]
# DLNN_5_Test = DLNN_5[np.in1d(DLNN_5.index.get_level_values(2), ['Test'])]

In [None]:
evaluations_df

In [None]:
evaluations_df_grouped

# Max values in evaluation

In [None]:
evaluations_df_max = evaluations_df[["Dataset",
                                     "Model",
                                     "Train_Test",
                                     "Accuracy",
                                     "Precision",
                                     "Sensitivity",
                                     "Specificity",
                                     "AUC",
                                     "MCC"]].groupby(["Dataset", 
                                                      "Model", 
                                                      "Train_Test"]).max().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC']).reset_index()

evaluations_df_test_max = evaluations_df_max[evaluations_df_max["Train_Test"] == 'Test']
evaluations_df_test_max

In [None]:
#     Dataset	Model	Train_Test	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_990	HS_990	Test	0.676768	0.704545	0.704082	0.654545	0.704545	0.356886
# 2	MM_944	MM_944	Test	0.744681	0.744681	0.779538	0.769231	0.744681	0.489362
# 4	SS_628	SS_628	Test	0.730159	0.727273	0.776210	0.760000	0.727273	0.461469

In [None]:
# Dataset	Model	Train_Test	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_990	HS_990	Test	0.707071	0.705882	0.722041	0.708333	0.705882	0.414047
# 2	MM_944	MM_944	Test	0.715789	0.678571	0.764599	0.769231	0.678571	0.440598
# 4	SS_628	SS_628	Test	0.777778	0.781250	0.826613	0.774194	0.781250	0.555444       

## Evaluate only top 5 model folds

In [None]:
evaluations_df5 = evaluations_df.sort_values(['Accuracy'], ascending=False).groupby(["Dataset", 
                                                                                    "Model", 
                                                                                    "Train_Test"]).head(5).reset_index()

evaluations_df5_grouped = evaluations_df5.groupby(["Dataset", 
                                                 "Model", 
                                                 "Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC']).reset_index()

In [None]:
evaluations_df5_grouped[evaluations_df5_grouped["Train_Test"] == 'Test']

In [None]:
evaluations_df5_grouped

In [None]:
#     Dataset	Model	Train_Test	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_990	HS_990	Test	0.636364	0.647777	0.663347	0.629172	0.647777	0.274748
# 2	MM_944	MM_944	Test	0.692788	0.673316	0.746133	0.722696	0.673316	0.390784
# 4	SS_628	SS_628	Test	0.680389	0.670531	0.770259	0.707128	0.670531	0.369560

In [None]:
# 	Dataset	Model	Train_Test	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_990	HS_990	Test	0.624242	0.646211	0.671184	0.611681	0.646211	0.252720
# 2	MM_944	MM_944	Test	0.692744	0.661864	0.743313	0.739310	0.661864	0.393179
# 4	SS_628	SS_628	Test	0.720635	0.731435	0.776210	0.715402	0.731435	0.443644

# Independent Testing

In [None]:
independent_data_folder = "Data\\Psi_Site_Chen_Independent"

In [None]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Model" : [],
    "Dataset" : [],
    "Fold" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

sum_evaluations = {
    "Model" : [],
    "Dataset" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

vote_evaluations = {
    "Model" : [],
    "Dataset" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

for root, dirs, files in os.walk(independent_data_folder):
    for file in files:
        
        input_data_file = os.path.join(root, file)
        
        if 'HS' in file:
            bench_data = 'HS_990'
        elif 'SS' in file:
            bench_data = 'SS_628'
            
        current_dataset_variety = input_data_file.split("\\")[-1].split(".")[0]
        
        openFile = open(input_data_file)
        fastaSequences = SeqIO.parse(openFile, "fasta")
        
        ##################################################################################
        ##### extract data from the current fasta file
        ##################################################################################

        positive_List = []
        negative_List = []
        positive_onehotencoded_List = []
        negative_onehotencoded_List = []

        for fasta in fastaSequences: 
            name, sequence = fasta.id, str(fasta.seq)
            if "P" in name:
                positive_List.append(sequence)
                aus_seq = one_hot_encode_rna(sequence)
                if(len(aus_seq) != 0):
                    positive_onehotencoded_List.append(aus_seq)
            elif "N" in name:
                negative_List.append(sequence)
                aus_seq = one_hot_encode_rna(sequence)
                if(len(aus_seq) != 0):
                    negative_onehotencoded_List.append(aus_seq)

        openFile.close()

        print("\n======================================================================")
        print("\nFile: "+os.path.join(root, file))
        print("Positive: "+str(len(positive_onehotencoded_List)))
        print("Negative: "+str(len(negative_onehotencoded_List)))
        
        ##################################################################################
        ##### Generate Folds from dataset, and store to file
        ##################################################################################

        ## create the features and labels datasets for the training
        labels = np.concatenate((np.ones((len(positive_onehotencoded_List), 1), dtype=np.float32), np.zeros((len(negative_onehotencoded_List), 1), dtype=np.float32)), axis=0)
        features = np.concatenate((positive_onehotencoded_List,negative_onehotencoded_List), 0)
        
        benchModelPath = os.path.join(outPath, expName, bench_data, "{}fold".format(n_fold), "models")
            
        ##################################################################################
        ##### TRAIN and PREDICT for every Fold, using models
        ##################################################################################
        
        y_pred_list = []

        for fold in range(n_fold):

            print("\nIndependent test on "+current_dataset_variety+" using Fold #"+str(fold)+" model from "+bench_data+".")
            
            current_model_path = os.path.join(
                benchModelPath, 
                "{}_bestModel-fold{}.hdf5".format(bench_data, fold)
            )
            
            model = tf.keras.models.load_model(current_model_path)

            ##################################################################################
            ##### Prediction and metrics for TEST dataset
            ##################################################################################
            
            y_pred = model.predict(features)
            y_pred_list.append(y_pred)
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(labels, label_pred)
            prec = precision_score(labels,label_pred)

            conf = confusion_matrix(labels, label_pred)
            if(conf[0][0]+conf[1][0]):
                sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
            else:
                sens = 0.0
            if(conf[1][1]+conf[0][1]):
                spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
            else:
                spec = 0.0
            if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
                mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
            else:
                mcc= 0.0
            fpr, tpr, thresholds = roc_curve(labels, y_pred)
            auc = roc_auc_score(labels, y_pred)

            evaluations["Model"].append(current_dataset_variety)
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            del model
            tf.keras.backend.clear_session()
            
        ##################################################################################
        ##### Prediction and metrics using sum of all folds
        ##################################################################################
        
        y_pred_list_arr = np.swapaxes(np.array(y_pred_list), 0,1)
        y_pred_vote = np.mean(y_pred_list_arr, axis = 1)
        label_vote = pred2label(y_pred_vote)
        
        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(labels, label_vote)
        prec = precision_score(labels, label_vote)

        conf = confusion_matrix(labels, label_vote)
        if(conf[0][0]+conf[1][0]):
            sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
        else:
            sens = 0.0
        if(conf[1][1]+conf[0][1]):
            spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
        else:
            spec = 0.0
        if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
            mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
        else:
            mcc= 0.0
        fpr, tpr, thresholds = roc_curve(labels, y_pred_vote)
        auc = roc_auc_score(labels, y_pred_vote)
        
        sum_evaluations["Model"].append(current_dataset_variety)
        sum_evaluations["Dataset"].append(current_dataset_variety)
        sum_evaluations["Accuracy"].append(acc)
        sum_evaluations["Precision"].append(prec)
        sum_evaluations["TPR"].append(tpr)
        sum_evaluations["FPR"].append(fpr)
        sum_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        sum_evaluations["AUC"].append(auc)
        sum_evaluations["Sensitivity"].append(sens)
        sum_evaluations["Specificity"].append(spec)
        sum_evaluations["MCC"].append(mcc)
        
        ##################################################################################
        ##### Prediction and metrics using vote of all folds
        ##################################################################################
        
        y_pred_list_arr = np.swapaxes(np.array(y_pred_list), 0,1)
        y_pred_vote = np.sum(np.round(y_pred_list_arr), axis = 1)
        label_vote = (y_pred_vote > 5).astype(int)
        
        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(labels, label_vote)
        prec = precision_score(labels, label_vote)

        conf = confusion_matrix(labels, label_vote)
        if(conf[0][0]+conf[1][0]):
            sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
        else:
            sens = 0.0
        if(conf[1][1]+conf[0][1]):
            spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
        else:
            spec = 0.0
        if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
            mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
        else:
            mcc= 0.0
        fpr, tpr, thresholds = roc_curve(labels, y_pred_vote)
        auc = roc_auc_score(labels, y_pred_vote)

        vote_evaluations["Model"].append(current_dataset_variety)
        vote_evaluations["Dataset"].append(current_dataset_variety)
        vote_evaluations["Accuracy"].append(acc)
        vote_evaluations["Precision"].append(prec)
        vote_evaluations["TPR"].append(tpr)
        vote_evaluations["FPR"].append(fpr)
        vote_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        vote_evaluations["AUC"].append(auc)
        vote_evaluations["Sensitivity"].append(sens)
        vote_evaluations["Specificity"].append(spec)
        vote_evaluations["MCC"].append(mcc)
        
        ##################################################################################
        ##### Dump evaluations to a file
        ##################################################################################

        evalPath = os.path.join(outPath, expName, "_Evaluation_Independent_Datasets")
        if(not os.path.isdir(evalPath)):
            os.makedirs(evalPath)

        pickle.dump(evaluations,
                    open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "wb"))
        
        pickle.dump(sum_evaluations,
                    open(os.path.join(evalPath, "{}fold_sum_evaluations.pickle".format(n_fold)), "wb"))
        
        pickle.dump(vote_evaluations,
                    open(os.path.join(evalPath, "{}fold_vote_evaluations.pickle".format(n_fold)), "wb"))

## Predict using each fold, average result of all 10 folds

In [None]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Dataset", 
                                                 "Model"]).mean().filter(['Accuracy', 
                                                                           'Precision', 
                                                                           'AUC', 
                                                                           'Sensitivity', 
                                                                           'Specificity', 
                                                                           'MCC'])

In [None]:
evaluations_df_grouped

In [None]:
# Dataset	Model	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# HS_200	HS_200	0.6780	0.676877	0.72104	0.684022	0.676877	0.358413
# SS_200	SS_200	0.6765	0.646861	0.75923	0.733891	0.646861	0.366287

In [None]:
#                 Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Dataset	Model						
# HS_200	HS_200	0.6750	0.664200	0.72708	0.693586	0.664200	0.353822
# SS_200	SS_200	0.6975	0.674367	0.76201	0.731014	0.674367	0.400129

## Predict using each fold, average result of top 5 folds

In [None]:
evaluations_df5 = evaluations_df.sort_values(['Accuracy'],ascending=False).groupby(["Dataset", 
                                                                                    "Model"]).head(5).reset_index()

evaluations_df5_grouped = evaluations_df5.groupby(["Dataset", 
                                                   "Model"]).mean().filter(['Accuracy', 
                                                                            'Precision', 
                                                                            'AUC', 
                                                                            'Sensitivity', 
                                                                            'Specificity', 
                                                                            'MCC']).reset_index()

In [None]:
evaluations_df5_grouped

In [None]:
# Dataset	Model	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_200	HS_200	0.704	0.697657	0.73026	0.712350	0.697657	0.409001
# 1	SS_200	SS_200	0.701	0.679132	0.76770	0.731744	0.679132	0.406404

In [None]:
# 	Dataset	Model	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_200	HS_200	0.690	0.690418	0.72364	0.690651	0.690418	0.380534
# 1	SS_200	SS_200	0.714	0.695143	0.76684	0.738915	0.695143	0.431015

## Predict using all 10 folds, vote using sum of scores of all 10 folds

In [None]:
sum_evaluations_df = pd.DataFrame.from_dict(sum_evaluations)

sum_evaluations_df.filter(["Dataset", "Model", 'Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC'])

In [None]:
# 	Dataset	Model	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_200	HS_200	0.705	0.702970	0.7352	0.707071	0.702970	0.410021
# 1	SS_200	SS_200	0.710	0.666667	0.7834	0.783784	0.666667	0.434959

In [None]:
# 	Dataset	Model	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_200	HS_200	0.695	0.685714	0.7445	0.705263	0.685714	0.390488
# 1	SS_200	SS_200	0.705	0.672269	0.7873	0.753086	0.672269	0.417607

## Predict using all 10 folds, vote using absolute vote of all 10 folds

In [None]:
vote_evaluations_df = pd.DataFrame.from_dict(vote_evaluations)

vote_evaluations_df.filter(["Dataset", "Model", 'Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC'])

In [None]:
#     Dataset	Model	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# 0	HS_200	HS_200	0.715	0.712871	0.72785	0.717172	0.712871	0.430022
# 1	SS_200	SS_200	0.695	0.666667	0.76455	0.734940	0.666667	0.395761