In [1]:
import os
import sys
# from Bio import SeqIO
import pickle
import numpy as np
import pandas as pd

# import tensorflow as tf
# import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
# import xgboost as xgb
# from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import math

In [2]:
##################################################################################
##### Define all experiment parameters
##################################################################################

expName = "Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL_GRID_PCA_GradientBoostingClassifier"

dataset_path = "Data"
setting = "Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL"

output_path = "Results"

datafile_extensions = ".csv"

modelNames = ["GradientBoosting"]

shuffle = False
seed = None

##################################################################################
##### Define the modelling hyperparameters
##################################################################################

n_fold = 10

validation_fraction = 0.1

##################################################################################
##### Define the GRID hyperparameters
##################################################################################

pca_n_components = [10, 20, 30, 50, 100]

loss = ['deviance', 'exponential']
learning_rate = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
n_estimators = [10, 30, 50, 100, 200, 300, 500]
max_depth = [5, 10, 15, 20, 25, 30]
ccp_alpha = [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
n_iter_no_change = [10, 25, 50, 100]
tol = [0.1, 0.01]
criterion = ['friedman_mse', 'squared_error', 'mse', 'mae']
max_features = ['auto', 'sqrt', 'log2']

param_grid = {
    "reduction__n_components": pca_n_components,
    "model__loss": loss,
    "model__learning_rate": learning_rate,
    "model__n_estimators": n_estimators,
    "model__max_depth": max_depth,
    "model__ccp_alpha": ccp_alpha,
    "model__n_iter_no_change": n_iter_no_change,
    "model__tol": tol,
    "model__criterion": criterion,
    "model__max_features": max_features
}

In [3]:
##################################################################################
##### Checking the directory
##################################################################################

dataset_setting_path = os.path.join(dataset_path, setting)
dataset_varieties = next(os.walk(dataset_setting_path))
result_output_path = os.path.join(output_path, expName)

In [4]:
##################################################################################
##### define evaluator functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

def pred2label(y_pred):
    y_pred = np.round(np.clip(y_pred, 0, 1))
    return y_pred

In [5]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

error_list = []

evaluations = {}

for root, dirs, files in os.walk(dataset_setting_path):
    for file in files:
        if os.path.splitext(file)[-1] == datafile_extensions:
            
            try:
            
                current_dataset_variety = "_".join(file.split(".")[0].split("_")[0:(len(file.split(".")[0].split("_")) - 1)])
                encoding_type = file.split(".")[0].split("_")[-1]

                ##################################################################################
                ##### read the current file
                ##################################################################################

                input_file_full_path = os.path.join(root, file)

                ## check if input file has header
                file_obj = open(input_file_full_path, "r")
                first_line = file_obj.readline()
                file_obj.close()
                file_has_header = None
                if first_line.split(",")[0] == "nameseq" or first_line.replace("\n", "").split(",")[-1] == "label":
                    file_has_header = 0

                sequences_df = pd.read_csv(input_file_full_path, header = file_has_header)

                ##################################################################################
                ##### extract data from the current dataframe file
                ##################################################################################

                sequences_df["class"] = np.where(sequences_df[sequences_df.columns[0]].str.contains("P"), 1, 0)

                print("\n======================================================================")
                print("\nFile: "+os.path.join(root, file))
                print("Positive: "+str(sum(sequences_df["class"])))
                print("Negative: "+str(len(sequences_df) - sum(sequences_df["class"])))

                ##################################################################################
                ##### Perform PCA on entire data
                ##################################################################################

                ## create the features and labels datasets for the training
                labels = sequences_df["class"].values
                features = sequences_df.drop('nameseq', axis = 1).drop('class', axis = 1).values
                # features = features.astype(np.float)
                
                ##################################################################################
                ##### Grid Search
                ##################################################################################
                
                pipe = Pipeline(steps=[("reduction", PCA()), ("model", GradientBoostingClassifier())])
                
                search = GridSearchCV(pipe, 
                                      param_grid, 
                                      cv=n_fold, 
                                      scoring=["accuracy", "roc_auc", "precision", "recall", "f1"], 
                                      refit="accuracy")
                search.fit(features, labels)
                
                evaluations[current_dataset_variety] = search
                        
            except Exception as error:
                error_list.append((input_file_full_path, error))
                
##################################################################################
##### Dump evaluations to a file
##################################################################################

evalPath = os.path.join(result_output_path, "_Evaluation_All_Datasets", "{}fold".format(n_fold))
if(not os.path.isdir(evalPath)):
    os.makedirs(evalPath)

pickle.dump(evaluations,
            open(os.path.join(evalPath, "{}fold_evaluations_{}.pickle".format(n_fold, modelNames[0])), "wb"))

##################################################################################
##### Dump exceptions to a file
##################################################################################

pickle.dump(error_list,
            open(os.path.join(result_output_path, "exceptions.pickle"), "wb"))
                



File: Data\Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL\HS_990_ALL.csv
Positive: 495
Negative: 495


File: Data\Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL\MM_944_ALL.csv
Positive: 472
Negative: 472


File: Data\Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL\SS_628_ALL.csv
Positive: 314
Negative: 314


In [6]:
error_list

[('Data\\Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL\\HS_990_ALL.csv',
  ValueError('For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.')),
 ('Data\\Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL\\MM_944_ALL.csv',
  ValueError('For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.')),
 ('Data\\Psi_Site_Chen_MathFeature_Latest_5_0_10_5_1_ALL\\SS_628_ALL.csv',
  ValueError('For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best par

## Visualization of Evaluation

In [None]:
##################################################################################
##### Add import statement here, to make this next part of code standalone executable
##################################################################################

import os
import pickle
# import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
import numpy as np
import pandas as pd


In [None]:
# ##################################################################################
# ##### Parameters used only in this section
# ##################################################################################

# n_fold = 10

# expName = "MathFeature_setting1_kgap_fickett"
# outPath = "Generated"
# setting = "Setting1"
# output_path = "Results"

# ExtraTreeForest, RandomForest, XGBoost
# modelNames = ["XGBoost"]

In [None]:
##################################################################################
##### Load file and convert to dataframe for easy manipulation
##################################################################################

evalPath = os.path.join(result_output_path, "_Evaluation_All_Datasets", "{}fold".format(n_fold))

evaluations = pickle.load(open(os.path.join(evalPath, "{}fold_evaluations_{}.pickle".format(n_fold, modelNames[0])), "rb"))

evaluations_df = pd.DataFrame.from_dict(evaluations)

In [None]:
##################################################################################
##### Group dataset (mean of metrics) by [Dataset, Model, Train_Test] combinations
##################################################################################

evaluations_df_grouped = evaluations_df.groupby(["Dataset",
                                                 "Encoding_Type",
                                                 "Model", 
                                                 "Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

Eval_Train = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(3), ['Train'])]
Eval_Test = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(3), ['Test'])]

datasets = np.unique(evaluations_df_grouped.index.get_level_values(0))

In [None]:
##################################################################################
##### Decide on metric to visualize
##################################################################################

print("Metrics Available : ", list(evaluations_df_grouped.columns))

#### Select a metric to plot below:

In [None]:
metric_to_plot = "Accuracy"
# dataset_to_print = "HS_990"

In [None]:
##################################################################################
##### Visualize with a multiple Bar chart
##################################################################################

# df = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(0), [dataset_to_print])]
df = evaluations_df_grouped.reset_index(level=['Dataset', 'Train_Test'])

# Some boilerplate to initialise things
sns.set()
plt.figure(figsize=(20,8))

# This is where the actual plot gets made
ax = sns.barplot(data=df, x="Dataset", y=metric_to_plot, hue="Train_Test")

# Customise some display properties
ax.set_title(modelNames[0])
ax.grid(color='#cccccc')
ax.set_ylabel(metric_to_plot)
ax.set_xlabel("Dataset")
ax.set_xticklabels(df["Dataset"].unique().astype(str), rotation='vertical')

for p in ax.patches:
    ax.annotate(format(p.get_height()*100, '.4f'),
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha = 'center', va = 'center', 
                size=15,
                xytext = (0, -12), 
                textcoords = 'offset points')

##############################

# Ask Matplotlib to show it
plt.show()

In [None]:
# ##################################################################################
# ##### Visualize with a multiple Bar chart
# ##################################################################################

# df = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(0), [dataset_to_print])]
# df = df.reset_index(level=['Encoding_Type', 'Train_Test'])

# # Some boilerplate to initialise things
# sns.set()
# plt.figure(figsize=(20,8))

# # This is where the actual plot gets made
# ax = sns.barplot(data=df, x="Encoding_Type", y=metric_to_plot, hue="Train_Test")

# # Customise some display properties
# ax.set_title(dataset_to_print+" - "+modelNames[0])
# ax.grid(color='#cccccc')
# ax.set_ylabel(metric_to_plot)
# ax.set_xlabel("Encoding_Type")
# ax.set_xticklabels(df["Encoding_Type"].unique().astype(str), rotation='vertical')

# # for p in ax.patches:
# #     ax.annotate(format(p.get_height()*100, '.4f'),
# #                 (p.get_x() + p.get_width() / 2., p.get_height()), 
# #                 ha = 'center', va = 'center', 
# #                 size=15,
# #                 xytext = (0, -12), 
# #                 textcoords = 'offset points')

# ##############################

# # Ask Matplotlib to show it
# plt.show()

### Store all metrics' plots to file

In [None]:
# ##################################################################################
# ##### Iteratively generate comparison plot using every metric
# ##################################################################################

# for metric_to_plot in list(evaluations_df_grouped.columns):
#     for dataset_to_print in datasets:
    
#         df = evaluations_df_grouped[np.in1d(evaluations_df_grouped.index.get_level_values(0), [dataset_to_print])]
#         df = df.reset_index(level=['Encoding_Type', 'Train_Test'])

#         # Some boilerplate to initialise things
#         sns.set()
#         plt.figure(figsize=(20,8))

#         # This is where the actual plot gets made
#         ax = sns.barplot(data=df, x="Encoding_Type", y=metric_to_plot, hue="Train_Test")

#         # Customise some display properties
#         ax.set_title(dataset_to_print+" - "+modelNames[0])
#         ax.grid(color='#cccccc')
#         ax.set_ylabel(metric_to_plot)
#         ax.set_xlabel("Encoding_Type")
#         ax.set_xticklabels(df["Encoding_Type"].unique().astype(str), rotation='vertical')
        
# #         for p in ax.patches:
# #             ax.annotate(format(p.get_height()*100, '.4f'),
# #                         (p.get_x() + p.get_width() / 2., p.get_height()), 
# #                         ha = 'center', va = 'center', 
# #                         size=15,
# #                         xytext = (0, -12), 
# #                         textcoords = 'offset points')
        
#         plt.savefig(os.path.join(evalPath, "{}_{}_{}_Comparison".format(metric_to_plot, dataset_to_print, modelNames[0])))
#         plt.close()
    