In [41]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "PSI_Site_DLNN_MergedSeq_autoencoder"
outPath = "Results"
foldName = "folds.pickle"

epochs = 50
batch_size = 16
shuffle = False
seed = None


input_data_folder = "Data\\Aziz"

In [42]:
import os 
from Bio import SeqIO
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score

import math

In [43]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [44]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_dna(sequence):
    seq_encoded = np.zeros((len(sequence),4))
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "T": 3
    }
    i = 0
    for single_character in sequence:
        if(single_character.upper() in dict_nuc.keys()):
            seq_encoded[i][dict_nuc[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in DNA sequence: '+sequence)
    return seq_encoded

def one_hot_encode_rna(sequence):
    seq_encoded = np.zeros((len(sequence),4))
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "U": 3
    }
    i = 0
    for single_character in sequence:
        if(single_character.upper() in dict_nuc.keys()):
            seq_encoded[i][dict_nuc[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in RNA sequence: '+sequence)
    return seq_encoded

def one_hot_encode_rnafold(sequence):
    seq_encoded = np.zeros((len(sequence),3))
    dict_fold = {
        "(": 0,
        ")": 1,
        ".": 2
    }
    i = 0
    for single_character in sequence:
        if(single_character in dict_fold.keys()):
            seq_encoded[i][dict_fold[single_character]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in RNAfold: '+sequence)
    return seq_encoded

def one_hot_encode_rna_mergedseq(sequence):
    dict_nuc = {
        "A": 0,
        "C": 1,
        "G": 2,
        "U": 3
    }
    dict_fold = {
        "(": 0,
        ")": 1,
        ".": 2
    }
    list_seq = sequence.strip().split(' ')
    seq_encoded = np.zeros((len(list_seq),12))
    i = 0
    for single_character in list_seq:
        if(single_character[0].upper() in dict_nuc.keys()):
            idx1 = dict_nuc[single_character[0].upper()]+1
        else:
            raise ValueError('Incorrect RNA character in MergedSeq sequence: '+sequence)
        if(single_character[1] in dict_fold.keys()):
            idx2 = dict_fold[single_character[1]]+1
        else:
            raise ValueError('Incorrect RNAfold character in MergedSeq sequence: '+sequence)
        idx = (idx1 * idx2) - 1
        seq_encoded[i][idx] = 1
        i = i+1        
    return seq_encoded

In [45]:
##################################################################################
##### define evaluator functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

def build_kfold_multifeature(features_1, features_2, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features_1, labels):
        X1_train, X1_test = features_1[train_index], features_1[test_index]
        X2_train, X2_test = features_2[train_index], features_2[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X1_train": X1_train,
            "X1_test": X1_test,
            "X2_train": X2_train,
            "X2_test": X2_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

def pred2label(y_pred):
    y_pred = np.round(np.clip(y_pred, 0, 1))
    return y_pred

In [46]:
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DLNN_AutoEncoder_Classifier(input_shape = (21,12), output_shape = (21, 4)):
    
    
    ae_input = tf.keras.layers.Input(shape=input_shape)
    
    ###########################################################################
    ##### Encoder
    ###########################################################################
    
    encoder = tf.keras.models.Sequential()
    encoder.add(tf.keras.layers.Flatten(input_shape = input_shape))
    encoder.add(tf.keras.layers.Dense(100))
    encoder.add(tf.keras.layers.Dense(3))
    
    ###########################################################################
    ##### Decoder
    ###########################################################################
    
    decoder = tf.keras.models.Sequential()
    decoder.add(tf.keras.layers.Dense(100, input_shape=(3,)))
    decoder.add(tf.keras.layers.Dense(output_shape[0]*output_shape[1]))
    decoder.add(tf.keras.layers.Reshape(output_shape))
    
    ###########################################################################
    ##### Classifier
    ###########################################################################
    
    classifier = tf.keras.models.Sequential()
    classifier.add(tf.keras.layers.Dense(10, input_shape=(3,)))
    classifier.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    ###########################################################################
    ##### Total Network
    ###########################################################################
    
    autoencoder = tf.keras.models.Model(ae_input, [decoder(encoder(ae_input)), classifier(encoder(ae_input))])
    
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001), 
                        loss=['mean_squared_error', 'binary_crossentropy'], 
                        metrics='accuracy')
    
    return autoencoder, encoder, decoder, classifier

In [48]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Model" : [],
    "Kernel_Length" : [],
    "Dataset" : [],
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

for root, dirs, files in os.walk(input_data_folder):
    for file in files:
        
        input_data_file = os.path.join(root, file)
        
        current_dataset_variety = input_data_file.split("\\")[-1].split(".")[0]

        csv_data = pd.read_csv(input_data_file)

        ##################################################################################
        ##### extract data from the current CSV file
        ##################################################################################
        
        csv_data["OHE"] = pd.Series([one_hot_encode_rna(val) for val in csv_data["Sequence"]])
        csv_data["OHE_MergedSeq"] = pd.Series([one_hot_encode_rna_mergedseq(val) for val in csv_data["MergedSeq"]])

        df_positive = csv_data[csv_data['Number'].str.contains("P")]
        df_negative = csv_data[csv_data['Number'].str.contains("N")]

        positive_ohe_mergedseq = np.array(list(df_positive['OHE_MergedSeq']))
        negative_ohe_mergedseq = np.array(list(df_negative['OHE_MergedSeq']))
        
        positive_ohe_seq = np.array(list(df_positive['OHE']))
        negative_ohe_seq = np.array(list(df_negative['OHE']))

        print("\n======================================================================")
        print("\nFile:", input_data_file)
        print("Positive:", positive_ohe_mergedseq.shape[0])
        print("Negative:", negative_ohe_mergedseq.shape[0])

        ##################################################################################
        ##### Generate Folds from dataset, and store to file
        ##################################################################################

        ## create the features and labels datasets for the training
        input_size = positive_ohe_mergedseq[0].shape

        labels = np.concatenate((np.ones((df_positive.shape[0], 1), 
                                         dtype=np.float32), 
                                 np.zeros((df_negative.shape[0], 1), 
                                          dtype=np.float32)), 
                                axis=0)
        features_mergedseq = np.concatenate((positive_ohe_mergedseq, 
                                             negative_ohe_mergedseq), 
                                            axis=0)
        
        features_seq = np.concatenate((positive_ohe_seq, 
                                       negative_ohe_seq), 
                                      axis=0)

        folds = build_kfold_multifeature(features_mergedseq, features_seq, labels, 
                                         k=n_fold, shuffle=shuffle, seed=seed)

#         ## Write the k-fold dataset to file
#         foldPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold))
#         if(not os.path.isdir(foldPath)):
#             os.makedirs(foldPath)
#         pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

#         ## Create and set directory to save model
#         modelPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold), "models")
#         if(not os.path.isdir(modelPath)):
#             os.makedirs(modelPath)

        ##################################################################################
        ##### TRAIN and PREDICT for every Fold, using models
        ##################################################################################

        # fold counter
        i = 0

        for fold in folds:

            print("\nTrain/Test model "+current_dataset_variety+" on Fold #"+str(i)+".")
            
            model, encoder, decoder, classifier = DLNN_AutoEncoder_Classifier()

            model.fit(x = fold["X1_train"], y = [fold["X1_train"], fold["y_train"]], 
                      batch_size = batch_size, epochs = epochs, 
                      verbose = 1, validation_split=0.2)

            ##################################################################################
            ##### Prediction and metrics for TRAIN dataset
            ##################################################################################

            y_seq, y_pred = model.predict(fold["X1_train"])
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(fold["y_train"], label_pred)
            prec = precision_score(fold["y_train"],label_pred)

            conf = confusion_matrix(fold["y_train"], label_pred)
            if(conf[0][0]+conf[1][0]):
                sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
            else:
                sens = 0.0
            if(conf[1][1]+conf[0][1]):
                spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
            else:
                spec = 0.0
            if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
                mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
            else:
                mcc= 0.0
            fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
            auc = roc_auc_score(fold["y_train"], y_pred)

            evaluations["Model"].append(current_dataset_variety)
            evaluations["Kernel_Length"].append(kernel_length)
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Train_Test"].append("Train")
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            ##################################################################################
            ##### Prediction and metrics for TEST dataset
            ##################################################################################

            y_seq, y_pred = model.predict(fold["X1_test"])
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(fold["y_test"], label_pred)
            prec = precision_score(fold["y_test"],label_pred)

            conf = confusion_matrix(fold["y_test"], label_pred)
            if(conf[0][0]+conf[1][0]):
                sens = float(conf[0][0])/float(conf[0][0]+conf[1][0])
            else:
                sens = 0.0
            if(conf[1][1]+conf[0][1]):
                spec = float(conf[1][1])/float(conf[1][1]+conf[0][1])
            else:
                spec = 0.0
            if((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0])):
                mcc = (float(conf[0][0])*float(conf[1][1]) - float(conf[1][0])*float(conf[0][1]))/math.sqrt((conf[0][0]+conf[0][1])*(conf[0][0]+conf[1][0])*(conf[1][1]+conf[0][1])*(conf[1][1]+conf[1][0]))
            else:
                mcc= 0.0
            fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
            auc = roc_auc_score(fold["y_test"], y_pred)

            evaluations["Model"].append(current_dataset_variety)
            evaluations["Kernel_Length"].append(kernel_length)
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Train_Test"].append("Test")
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            i = i+1
            del model
            tf.keras.backend.clear_session()

        ##################################################################################
        ##### Dump evaluations to a file
        ##################################################################################

#         evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
#         if(not os.path.isdir(evalPath)):
#             os.makedirs(evalPath)

#         pickle.dump(evaluations,
#                     open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "wb"))



File: Data\Aziz\HS_990.csv
Positive: 495
Negative: 495

Train/Test model HS_990 on Fold #0.


AttributeError: 'tuple' object has no attribute 'fit'

## Visualization of Evaluation

In [10]:
# ##################################################################################
# ##### Add import statement here, to make this next part of code standalone executable
# ##################################################################################

# import os
# import pickle
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
# import numpy as np
# import pandas as pd


In [11]:
# ##################################################################################
# ##### Load file and convert to dataframe for easy manipulation
# ##################################################################################

# evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
# if(not os.path.isdir(evalPath)):
#     os.makedirs(evalPath)

# evaluations = pickle.load(open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "rb"))

In [12]:
# evaluations["Model"] = evaluations["Model"][0:20]
# evaluations_df = pd.DataFrame.from_dict(evaluations)

In [13]:
# evaluations_df = pd.DataFrame.from_dict(evaluations)

##################################################################################
##### Group dataset (mean of metrics) by [Dataset, Model, Train_Test] combinations
##################################################################################

evaluations_df_grouped = evaluations_df.groupby(["Dataset", 
                                                 "Model", 
                                                 "Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

# DLNN_3 = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(1), ['DLNN_3'])]
# DLNN_5 = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(1), ['DLNN_5'])]

# DLNN_3_Train = DLNN_3[np.in1d(DLNN_3.index.get_level_values(2), ['Train'])]
# DLNN_3_Test = DLNN_3[np.in1d(DLNN_3.index.get_level_values(2), ['Test'])]

# DLNN_5_Train = DLNN_5[np.in1d(DLNN_5.index.get_level_values(2), ['Train'])]
# DLNN_5_Test = DLNN_5[np.in1d(DLNN_5.index.get_level_values(2), ['Test'])]

In [14]:
evaluations_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Dataset,Model,Train_Test,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HS_990,HS_990,Test,0.573737,0.553526,0.598021,0.623792,0.553526,0.161572
HS_990,HS_990,Train,0.872475,0.797975,0.927289,0.998649,0.797975,0.770316
MM_944,MM_944,Test,0.631358,0.603158,0.682538,0.684545,0.603158,0.274942
MM_944,MM_944,Train,0.894071,0.827675,0.947606,0.994769,0.827675,0.805095
SN_628,SN_628,Test,0.546184,0.53321,0.59782,0.576655,0.53321,0.10067
SN_628,SN_628,Train,0.878178,0.804223,0.957775,1.0,0.804223,0.779913


In [15]:
# ##################################################################################
# ##### Decide on metric to visualize
# ##################################################################################

# print("Metrics Available : ")
# print(list(evaluations_df_grouped.columns))

#### Select a metric to plot below:

In [16]:
# metric_to_plot = "Accuracy"

In [17]:
# ##################################################################################
# ##### Visualize with a multiple Bar chart
# ##################################################################################

# x = np.arange(len(DLNN_3_Train[metric_to_plot]))
# width = 0.15

# fig, ax = plt.subplots(figsize=(17,6))
# rects1 = ax.bar(x - (4*(width/2)), round(DLNN_3_Train[metric_to_plot]*100, 3), width, label='DLNN_3, Train')
# rects2 = ax.bar(x - (1.5*(width/2)), round(DLNN_5_Train[metric_to_plot]*100, 3), width, label='DLNN_5, Train')
# rects3 = ax.bar(x + (1.5*(width/2)), round(DLNN_3_Test[metric_to_plot]*100, 3), width, label='DLNN_3, Test')
# rects4 = ax.bar(x + (4*(width/2)), round(DLNN_5_Test[metric_to_plot]*100, 3), width, label='DLNN_5, Test')

# ## Custom y-axis tick labels
# ax.set_ylabel(metric_to_plot)
# ax.set_ylim([(math.floor(min(evaluations_df_grouped[metric_to_plot])*10)-1)*10, 
#             (math.ceil(max(evaluations_df_grouped[metric_to_plot])*10)+1)*10])
# # ax.set_ylim([80, 105])

# ## Custom x-axis tick labels
# ax.set_xticks(x)
# # ax.set_xticklabels(DLNN_3_Train.index.get_level_values(0))
# # ax.set_xticklabels([m+" - "+str(n) for m,n in 
# #                         zip(DLNN_3_Train.index.get_level_values(0),DLNN_3_Train.index.get_level_values(1))],
# #                   rotation=30)
# ax.set_xticklabels(DLNN_3_Train.index.get_level_values(0))

# ax.set_title(metric_to_plot+' by Dataset, Model, Train/Test')
# ax.legend(loc='upper left')

# def autolabel(rects):
#     for rect in rects:
#         height = rect.get_height()
#         ax.annotate('{}'.format(height),
#                     xy=(rect.get_x() + rect.get_width() / 2, height),
#                     xytext=(0, 3),  # 3 points vertical offset
#                     textcoords="offset points", 
#                     ha='center', va='bottom', rotation=90)

# autolabel(rects1)
# autolabel(rects2)
# autolabel(rects3)
# autolabel(rects4)

# plt.show()

### Store all metrics' plots to file

In [18]:
# ##################################################################################
# ##### Iteratively generate comparison plot using every metric
# ##################################################################################

# for metric_to_plot in list(evaluations_df_grouped.columns):
    
#     x = np.arange(len(DLNN_3_Train[metric_to_plot]))
#     width = 0.15

#     fig, ax = plt.subplots(figsize=(17,6))
#     rects1 = ax.bar(x - (4*(width/2)), round(DLNN_3_Train[metric_to_plot]*100, 3), width, label='DLNN_3, Train')
#     rects2 = ax.bar(x - (1.5*(width/2)), round(DLNN_5_Train[metric_to_plot]*100, 3), width, label='DLNN_5, Train')
#     rects3 = ax.bar(x + (1.5*(width/2)), round(DLNN_3_Test[metric_to_plot]*100, 3), width, label='DLNN_3, Test')
#     rects4 = ax.bar(x + (4*(width/2)), round(DLNN_5_Test[metric_to_plot]*100, 3), width, label='DLNN_5, Test')

#     ## Custom y-axis tick labels
#     ax.set_ylabel(metric_to_plot)
#     ax.set_ylim([(math.floor(min(evaluations_df_grouped[metric_to_plot])*10)-1)*10, 
#                 (math.ceil(max(evaluations_df_grouped[metric_to_plot])*10)+1)*10])
#     # ax.set_ylim([80, 105])

#     ## Custom x-axis tick labels
#     ax.set_xticks(x)
#     # ax.set_xticklabels(DLNN_3_Train.index.get_level_values(0))
#     # ax.set_xticklabels([m+" - "+str(n) for m,n in 
#     #                         zip(DLNN_3_Train.index.get_level_values(0),DLNN_3_Train.index.get_level_values(1))],
#     #                   rotation=30)
#     ax.set_xticklabels(DLNN_3_Train.index.get_level_values(0))

#     ax.set_title(metric_to_plot+' by Dataset, Model, Train/Test')
#     ax.legend(loc='upper left')

#     def autolabel(rects):
#         for rect in rects:
#             height = rect.get_height()
#             ax.annotate('{}'.format(height),
#                         xy=(rect.get_x() + rect.get_width() / 2, height),
#                         xytext=(0, 3),  # 3 points vertical offset
#                         textcoords="offset points", 
#                         ha='center', va='bottom', rotation=90)

#     autolabel(rects1)
#     autolabel(rects2)
#     autolabel(rects3)
#     autolabel(rects4)
    
#     plt.savefig(os.path.join(evalPath, "{}_DLNN_Comparison".format(metric_to_plot)))
#     plt.close()
    