In [None]:
# ---- Inputs ----
# CSV file that holds the data table
filename = 'C:/TestPopulation/Data/Population_Feature_Outputs.csv'
# name of the column used as the classification target
label = 'IKrBlock_Label'
# names of the columns used as model inputs
features = [
    'Vrest',
    'Upstroke',
    'Vpeak',
    'APD20',
    'APD40',
    'APD50',
    'APD90',
    'TriAP',
]

# ---- Outputs ----
# folder the results workbook is written to (keep the trailing slash)
folder_save = 'C:/TestPopulation/Data/'
# workbook base name, without the extension
output_name = 'SteadyState_APFeatures_IKrBlock'

In [None]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

import time
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import numpy as np

import warnings 
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and convergence warnings raised by the
# libraries imported above. Consider narrowing the filter.
warnings.simplefilter('ignore')

%load_ext autoreload
%autoreload 2
import machine_learning_funs as ml
import tune_parameters as pt

In [None]:
# plot settings 
# Global seaborn look for every figure drawn later in the notebook.
sns.set_style("dark")
# NOTE(review): despine() is called here before any plot has been drawn in
# this notebook; it presumably acts only on the current (empty) figure and
# not on later plots - confirm, and move it after plotting if spines matter.
sns.despine()
# larger fonts and thicker lines for presentation-sized figures
sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 5})

In [None]:
# set seed
# The same value is reused below as random_state for the train/test split
# and as the seed argument of every tuning helper.
seed = 134556
# seed NumPy's global random generator as well
np.random.seed(seed)

In [None]:
# Load the table and discard every row that has a missing value in any column
df = pd.read_csv(filename).dropna()
# Target kept as a one-column DataFrame (list selector), not as a Series
y = df.loc[:, [label]]

In [None]:
# list the available column names (handy for checking `label` and `features`)
df.columns

In [None]:
# Restrict the table to the configured input columns and preview it
df1 = df.loc[:, features]
df1.head()

In [None]:
# Normalize the inputs
# NOTE(review): both scalers below are fit on every row of x, i.e. before
# any train/test separation, so their statistics include rows that later
# end up in the test set.
x = df1 
# zero-mean / unit-variance scaling
x_1 = StandardScaler().fit_transform(x) # for other classifiers except ANN
# rescaling of each column to the [0, 1] range
x_2 = MinMaxScaler().fit_transform(x) # for ANN only

In [None]:
# split the data into train, test set
# The raw (unscaled) features are split first and the scalers are fitted on
# the training rows only, so no statistic of the held-out rows leaks into
# the model inputs.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, random_state = seed, test_size = 0.10)

# standardised inputs: used by every classifier except the ANN
std_scaler = StandardScaler().fit(x_train_raw)
x_train = std_scaler.transform(x_train_raw)
x_test = std_scaler.transform(x_test_raw)

# min-max scaled inputs: used by the ANN only
# (test values may fall slightly outside [0, 1]; that is expected)
minmax_scaler = MinMaxScaler().fit(x_train_raw)
x_train_ann = minmax_scaler.transform(x_train_raw)
x_test_ann = minmax_scaler.transform(x_test_raw)

# both feature sets come from the same split, so the targets are shared
y_train_ann, y_test_ann = y_train.copy(), y_test.copy()

print('Train Set Size = ' + str(x_train.shape[0]))
print('Test Set Size = ' + str(x_test.shape[0]))

In [None]:
# Run every helper of the tune_parameters module with the same fold count
# and seed; the ANN helper is the only one fed the min-max scaled split.
nfold = 3
search_opts = dict(nfolds = nfold, seed = seed)

lr_GKr = pt.lr_param_selection(x_train, y_train, **search_opts)
svc_GKr = pt.svc_param_selection(x_train, y_train, **search_opts)
knn_GKr = pt.knn_param_selection(x_train, y_train, **search_opts)
rfc_GKr = pt.random_forest_selection(x_train, y_train, **search_opts)
gb_GKr = pt.gradientboosting_selection(x_train, y_train, **search_opts)
xgb_GKr = pt.xgboost_selection(x_train, y_train, **search_opts)
ann_GKr = pt.ann_selection(x_train_ann, y_train_ann, **search_opts)
gnb_GKr = pt.nb_selection(x_train, y_train, **search_opts)

# Short display names and the matching helper results, in the same order
keys = ['RF', 'KNN', 'LR', 'GB', 'SVM', 'ANN', 'Bayes', 'XGB']
classifiers_GKr = [
    rfc_GKr, knn_GKr, lr_GKr, gb_GKr,
    svc_GKr, ann_GKr, gnb_GKr, xgb_GKr,
]

In [None]:
# Evaluate all classifiers; the ANN-scaled split is passed first, then the
# standard-scaled split, then the models and their display names.
output_GKr = ml.create_table(
    x_train_ann, x_test_ann, y_train_ann, y_test_ann,
    x_train, x_test, y_train, y_test,
    classifiers_GKr, keys,
)

# Name the first six entries of the returned collection
result_table, metrics, rocs, prob, pred, conf_matrix = (
    output_GKr[position] for position in range(6)
)

# show the summary table
result_table

In [None]:
# draw the comparison plot from the ROC data and the summary table
# (plot contents are defined in machine_learning_funs.plot_algs)
ml.plot_algs(rocs,result_table)

In [None]:
# Convert the raw evaluation outputs into DataFrames via the helpers in
# machine_learning_funs; these frames are written to the workbook below and
# df_prob is reused for the R-based ROC tests.
# presumably one column per entry of `keys` plus a label column - verify in the helper module
df_prob = ml.save_prob(prob,y_test,keys)
df_pred = ml.save_pred(pred,y_test,keys)
df_roc = ml.save_ROCs(rocs,keys)

In [None]:
import os

# Build the workbook path; os.path.join works whether or not folder_save
# ends with a path separator.
save_filename = os.path.join(folder_save, output_name + '.xlsx')
exists = os.path.isfile(save_filename)
if exists:
    # never overwrite a workbook produced by an earlier run
    print("File already exists")
else:
    # The context manager writes and closes the file on exit (also when a
    # sheet fails to export). It replaces ExcelWriter.save(), which recent
    # pandas releases no longer provide.
    with pd.ExcelWriter(save_filename, engine='xlsxwriter') as writer:
        result_table.to_excel(writer, sheet_name = 'Results')
        df_roc.to_excel(writer, sheet_name='ROCs',index=False)
        df_pred.to_excel(writer, sheet_name='Prediction')
        df_prob.to_excel(writer, sheet_name='Probability')

In [None]:
# R bridge: rpy2 is used below to call the R package pROC from Python
import rpy2
#import rpy2.robjects.packages as rpackages
#from rpy2.robjects.vectors import StrVector
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
# switch on rpy2's pandas conversion; pandas Series are passed straight to
# R functions further down
pandas2ri.activate()

In [None]:
# Explicit R-name -> Python-name translations handed to importr
# (presumably to avoid clashes when '.' in R names is mapped to '_' - confirm
# whether pROC actually needs them).
d = {'print.me': 'print_dot_me', 'print_me': 'print_uscore_me'}
try:
    # first try the per-user R library
    proc = importr('pROC', robject_translations = d, lib_loc = "C:/Users/MeeraVarshneya/Documents/R/win-library/3.6")
except Exception:
    # Fall back to the system-wide R library. Catching Exception (instead of
    # a bare except) lets KeyboardInterrupt / SystemExit propagate; a failure
    # of this second attempt is raised to the caller.
    proc = importr('pROC', robject_translations = d, lib_loc = "C:/Program Files/R/R-3.6.1/library")

In [None]:
# Pull the label column and one column per classifier out of df_prob
l, rf, knn, lr, gb, svm, ann, bayes, xgb = (
    df_prob[column]
    for column in ['Label', 'RF', 'KNN', 'LR', 'GB', 'SVM', 'ANN', 'Bayes', 'XGB']
)

In [None]:
# One pROC roc object per classifier, each built from the shared labels `l`
# and that classifier's column; evaluated in the order listed.
(roc_rf, roc_knn, roc_lr, roc_gb,
 roc_svm, roc_ann, roc_bayes, roc_xgb) = (
    proc.roc(l, scores)
    for scores in (rf, knn, lr, gb, svm, ann, bayes, xgb)
)

In [None]:
# Index label of the row with the largest AUC in the summary table
dfAUC = result_table['AUC']
max_AUC = dfAUC.idxmax(axis=0, skipna=True)
max_AUC

In [None]:
def get_ci(roc_object):
    """Return the AUC confidence bounds of a pROC roc object as strings.

    Calls ``proc.ci_auc`` and takes the first and the last element of its
    result as the lower and upper bound.

    Returns
    -------
    list of str
        ``[lower, upper]``, each formatted with two decimals.
    """
    interval = proc.ci_auc(roc_object)
    return ['{:.2f}'.format(bound) for bound in (interval[0], interval[-1])]

def get_pvalue(roc1, roc2):
    """Compare two pROC roc objects with a two-sided ``proc.roc_test``.

    Returns
    -------
    numpy.ndarray
        The ``'p.value'`` entry of the test result, converted to an array.
    """
    test_result = proc.roc_test(roc1, roc2, alternative = 'two.sided')
    return np.array(test_result.rx2('p.value'))

In [None]:
# Test the SVM ROC curve against every other classifier's ROC curve and
# print one p-value per comparison, in the order listed.
svm_comparisons = [
    ('SVM vs RF:', roc_rf),
    ('SVM vs KNN:', roc_knn),
    ('SVM vs LR:', roc_lr),
    ('SVM vs GB:', roc_gb),
    ('SVM vs ANN:', roc_ann),
    ('SVM vs Bayes:', roc_bayes),
    ('SVM vs XGB:', roc_xgb),
]
for comparison_label, rival_roc in svm_comparisons:
    p = get_pvalue(roc_svm, rival_roc)
    print(comparison_label, p)

In [None]:
from statsmodels.stats.contingency_tables import mcnemar
def get_p(df, model1, model2):
    """Exact McNemar test on the paired predictions of two models.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one prediction column per model.
    model1, model2 : str
        Names of the two prediction columns to compare.

    Returns
    -------
    float
        p-value of the exact McNemar test (also printed).
    """
    table = pd.crosstab(df[model1], df[model2])
    # Give rows and columns the same (sorted) set of levels so the
    # off-diagonal cells are the discordant pairs.
    levels = table.index.union(table.columns)
    counts = table.reindex(index=levels, columns=levels, fill_value=0).to_numpy()
    if counts.shape[0] < 2:
        # Both models produced the same single value for every row (or df is
        # empty): pad with zeros to the 2x2 layout the test indexes into,
        # instead of failing on a 1x1 / 0x0 table.
        padded = np.zeros((2, 2), dtype=counts.dtype)
        padded[:counts.shape[0], :counts.shape[1]] = counts
        counts = padded
    result = mcnemar(counts, exact=True)
    print('p-value=%.5f'%result.pvalue)
    return result.pvalue

In [None]:
# Per-model decision thresholds taken from the summary table
cutoffs = result_table.Threshold
dfprob = pd.DataFrame()
for idx, clf in enumerate(prob[:len(keys)]):
    # column 1 of each probability array is used as the model's score
    dfprob[keys[idx]] = clf[:, 1]
    # NOTE(review): cutoffs[idx] assumes the Threshold rows are in the same
    # order as `keys` - confirm against ml.create_table.
    dfprob[keys[idx]+"_pred"] = clf[:, 1]>cutoffs[idx]
# y_test still carries the row labels it had in df, while dfprob is indexed
# 0..n-1. Assigning the pandas object directly would align on those labels
# and scramble / blank the labels, so the values are assigned by position.
dfprob['Label'] = np.asarray(y_test).ravel()

In [None]:
# preview: one score column and one thresholded "_pred" column per model, plus Label
dfprob.head()

In [None]:
# Split the test rows by their actual label (not by prediction outcome)
dfsen = dfprob[dfprob['Label']==1] ## actual positives (Label == 1): used for the sensitivity comparisons
dfspe = dfprob[dfprob['Label']==0] ## actual negatives (Label == 0): used for the specificity comparisons

In [None]:
# McNemar tests on the actual-positive rows: SVM vs. another model (sensitivity)
get_p(dfsen, 'SVM_pred', 'RF_pred') # SVM vs RF - original note: sensitivity shows significant difference
get_p(dfsen, 'SVM_pred', 'LR_pred') # SVM vs LR - original note: sensitivity shows significant difference
get_p(dfsen, 'SVM_pred', 'ANN_pred') # SVM vs ANN - original note: sensitivity shows significant difference
get_p(dfsen, 'SVM_pred', 'XGB_pred') # SVM vs XGB - original note: sensitivity shows significant difference

In [None]:
# McNemar tests on the actual-negative rows: SVM vs. another model (specificity)
get_p(dfspe, 'SVM_pred', 'RF_pred')# SVM vs RF - original note: specificity shows significant difference
get_p(dfspe, 'SVM_pred', 'LR_pred') # SVM vs LR - specificity (original note said "sensitivity"; TODO confirm significance from the printed p-value)
get_p(dfspe, 'SVM_pred', 'ANN_pred') # SVM vs ANN - specificity (original note said "sensitivity"; TODO confirm significance from the printed p-value)
get_p(dfspe, 'SVM_pred', 'XGB_pred') # SVM vs XGB - specificity (original note said "sensitivity"; TODO confirm significance from the printed p-value)