In [1]:
# First ensemble with NSL-KDD
# Parameters

#----------------------------------------------
# 0 for not using it as base learner
# 1 for using it as base learner

use_model_ada = 1 
use_model_dnn = 1 
use_model_mlp = 1 
use_model_lgbm = 1 
use_model_rf = 1 
use_model_svm = 1
use_model_knn = 1 
#----------------------------------------------
# 0 for training the model
# 1 for using the saved version of the model

load_model_ada = 0 
load_model_dnn = 0 
load_model_mlp = 0 
load_model_lgbm = 0 
load_model_rf = 0 
load_model_svm = 0
load_model_knn = 0 
#----------------------------------------------

# load_model_ada = 1
# load_model_dnn = 1 
# load_model_mlp = 1 
# load_model_lgbm = 1 
# load_model_rf = 1 
# load_model_svm = 1
# load_model_knn = 1 
#----------------------------------------------




In [2]:

# Specify the name of the output text file
output_file_name = "ensemble_prob_FS.txt"
with open(output_file_name, "w") as f: print('---------------------------------------------------------------------------------', file = f)
with open(output_file_name, "a") as f: print('---- Start Ensemble Model Info - v0 ----', file = f)


In [3]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:
# importing required libraries
import numpy as np
import pandas as pd
import pickle # saving and loading trained model
from os import path


# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder,LabelEncoder, MinMaxScaler, OneHotEncoder)
from sklearn.preprocessing import Normalizer, MaxAbsScaler , RobustScaler, PowerTransformer

# importing library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import accuracy_score # for calculating accuracy of model
from sklearn.model_selection import train_test_split # for splitting the dataset for training and testing
from sklearn.metrics import classification_report # for generating a classification report of model

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

from keras.layers import Dense # importing dense layer

from keras.layers import Input
from keras.models import Model
# representation of model layers
#from keras.utils import plot_model
from sklearn.metrics import confusion_matrix
import shap




# In[2]:


# #Defining metric functions
# def ACC(TP,TN,FP,FN):
#     Acc = (TP+TN)/(TP+FP+FN+TN)
#     return Acc
# def ACC_2 (TP, FN):
#     ac = (TP/(TP+FN))
#     return ac
# def PRECISION(TP,FP):
#     eps = 1e-7
#     Precision = TP/(TP+FP+eps)
    

#     return Precision
# def RECALL(TP,FN):
#     Recall = TP/(TP+FN)
#     return Recall
# def F1(Recall, Precision):
#     F1 = 2 * Recall * Precision / (Recall + Precision)
#     return F1
# def BACC(TP,TN,FP,FN):
#     BACC =(TP/(TP+FN)+ TN/(TN+FP))*0.5
#     return BACC
# def MCC(TP,TN,FP,FN):
#     eps = 1e-7
#     MCC = (TN*TP-FN*FP)/(((TP+FP+eps)*(TP+FN+eps)*(TN+FP+eps)*(TN+FN+eps))**.5)
#     return MCC
# def AUC_ROC(y_test_bin,y_score):
#     fpr = dict()
#     tpr = dict()
#     roc_auc = dict()
#     auc_avg = 0
#     counting = 0
#     for i in range(n_classes):
#       fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
#      # plt.plot(fpr[i], tpr[i], color='darkorange', lw=2)
#       #print('AUC for Class {}: {}'.format(i+1, auc(fpr[i], tpr[i])))
#       auc_avg += auc(fpr[i], tpr[i])
#       counting = i+1
#     return auc_avg/counting


# In[3]:


# attach the column names to the dataset
feature=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot",
          "num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells",
          "num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
          "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count", 
          "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
          "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"]
# KDDTrain+_2.csv & KDDTest+_2.csv are the datafiles without the last column about the difficulty score
# these have already been removed.

train='KDDTrain+.txt'
test='KDDTest+.txt'

df=pd.read_csv(train,names=feature)
df_test=pd.read_csv(test,names=feature)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:


# shape, this gives the dimensions of the dataset
print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)


df.drop(['difficulty'],axis=1,inplace=True)
df_test.drop(['difficulty'],axis=1,inplace=True)



print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())



# colums that are categorical and not binary yet: protocol_type (column 2), service (column 3), flag (column 4).
# explore categorical features
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

#see how distributed the feature service is, it is evenly distributed and therefore we need to make dummies for all.
print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())



# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']
# insert code to get a list of categorical columns into a variable, categorical_columns
categorical_columns=['protocol_type', 'service', 'flag'] 
 # Get the categorical values into a 2D numpy array
df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]
df_categorical_values.head()


# protocol type
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2
print(dumcols)

#do same for test set
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2




df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)
print(df_categorical_values_enc.head())
# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)



enc = OneHotEncoder()
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols)
# test set
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(),columns=testdumcols)

df_cat_data.head()


trainservice=df['service'].tolist()
testservice= df_test['service'].tolist()
difference=list(set(trainservice) - set(testservice))
string = 'service_'
difference=[string + x for x in difference]
difference

for col in difference:
    testdf_cat_data[col] = 0

testdf_cat_data.shape

newdf=df.join(df_cat_data)
newdf.drop('flag', axis=1, inplace=True)
newdf.drop('protocol_type', axis=1, inplace=True)
newdf.drop('service', axis=1, inplace=True)
# test data
newdf_test=df_test.join(testdf_cat_data)
newdf_test.drop('flag', axis=1, inplace=True)
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)
print(newdf.shape)
print(newdf_test.shape)


# take label column
labeldf=newdf['label']
labeldf_test=newdf_test['label']
# change the label column
newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
# put the new label column back
newdf['label'] = newlabeldf
newdf_test['label'] = newlabeldf_test
print(newdf['label'].head())


# Specify your selected features. Note that you'll need to modify this list according to your final processed dataframe
#Uncomment the below lines to use these top 20 features from shap analysis
#selected_features = ["root_shell","service_telnet","num_shells","service_uucp","dst_host_same_src_port_rate"
#                     ,"dst_host_rerror_rate","dst_host_srv_serror_rate","dst_host_srv_count","service_private","logged_in",
#                    "dst_host_serror_rate","serror_rate","srv_serror_rate","flag_S0","diff_srv_rate","dst_host_srv_diff_host_rate","num_file_creations","flag_RSTR"#,"dst_host_same_srv_rate","service_Idap","label"]
                     

# Select those features from your dataframe
#newdf = newdf[selected_features]
#newdf_test = newdf_test[selected_features]

# Now your dataframe only contains your selected features.

# creating a dataframe with multi-class labels (Dos,Probe,R2L,U2R,normal)
multi_data = newdf.copy()
multi_label = pd.DataFrame(multi_data.label)

multi_data_test=newdf_test.copy()
multi_label_test = pd.DataFrame(multi_data_test.label)


# using standard scaler for normalizing
std_scaler = StandardScaler()
def standardization(df,col):
    for i in col:
        arr = df[i]
        arr = np.array(arr)
        df[i] = std_scaler.fit_transform(arr.reshape(len(arr),1))
    return df

numeric_col = multi_data.select_dtypes(include='number').columns
data = standardization(multi_data,numeric_col)
numeric_col_test = multi_data_test.select_dtypes(include='number').columns
data_test = standardization(multi_data_test,numeric_col_test)

# label encoding (0,1,2,3,4) multi-class labels (Dos,normal,Probe,R2L,U2R)
le2 = preprocessing.LabelEncoder()
le2_test = preprocessing.LabelEncoder()
enc_label = multi_label.apply(le2.fit_transform)
enc_label_test = multi_label_test.apply(le2_test.fit_transform)
multi_data = multi_data.copy()
multi_data_test = multi_data_test.copy()

multi_data['intrusion'] = enc_label
multi_data_test['intrusion'] = enc_label_test

#y_mul = multi_data['intrusion']
multi_data
multi_data_test



multi_data.drop(labels= [ 'label'], axis=1, inplace=True)
multi_data
multi_data_test.drop(labels= [ 'label'], axis=1, inplace=True)
multi_data_test


y_train_multi= multi_data[['intrusion']]
X_train_multi= multi_data.drop(labels=['intrusion'], axis=1)

print('X_train has shape:',X_train_multi.shape,'\ny_train has shape:',y_train_multi.shape)

y_test_multi= multi_data_test[['intrusion']]
X_test_multi= multi_data_test.drop(labels=['intrusion'], axis=1)

print('X_test has shape:',X_test_multi.shape,'\ny_test has shape:',y_test_multi.shape)


from collections import Counter

label_counts = Counter(y_train_multi['intrusion'])
print(label_counts)


from sklearn.preprocessing import LabelBinarizer

y_train_multi = LabelBinarizer().fit_transform(y_train_multi)

y_test_multi = LabelBinarizer().fit_transform(y_test_multi)


Y_train=y_train_multi.copy()
X_train=X_train_multi.copy()

Y_test=y_test_multi.copy()
X_test=X_test_multi.copy()




Dimensions of the Training set: (125973, 43)
Dimensions of the Test set: (22544, 43)
Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess  

In [5]:

# In[24]:

'''
from sklearn.feature_selection import SelectKBest, f_classif

# Number of best features you want to select
k = 15

# Initialize a dataframe to store the scores for each feature against each class
feature_scores = pd.DataFrame(index=X_train.columns)

# Loop through each class
for class_index in range(Y_train.shape[1]):
    
    # Get the current class labels
    y_train_current_class = Y_train[:, class_index]
    
    # Select K best features for the current class
    best_features = SelectKBest(score_func=f_classif, k='all')
    fit = best_features.fit(X_train, y_train_current_class)

    # Get the scores
    df_scores = pd.DataFrame(fit.scores_, index=X_train.columns, columns=[f"class_{class_index}"])
    
    # Concatenate the scores to the main dataframe
    feature_scores = pd.concat([feature_scores, df_scores],axis=1)

# Get the sum of the scores for each feature
feature_scores['total'] = feature_scores.sum(axis=1)

# Get the top k features in a list
top_k_features = feature_scores.nlargest(k, 'total').index.tolist()

print(top_k_features)

'''
# In[32]:

'\nfrom sklearn.feature_selection import SelectKBest, f_classif\n\n# Number of best features you want to select\nk = 15\n\n# Initialize a dataframe to store the scores for each feature against each class\nfeature_scores = pd.DataFrame(index=X_train.columns)\n\n# Loop through each class\nfor class_index in range(Y_train.shape[1]):\n    \n    # Get the current class labels\n    y_train_current_class = Y_train[:, class_index]\n    \n    # Select K best features for the current class\n    best_features = SelectKBest(score_func=f_classif, k=\'all\')\n    fit = best_features.fit(X_train, y_train_current_class)\n\n    # Get the scores\n    df_scores = pd.DataFrame(fit.scores_, index=X_train.columns, columns=[f"class_{class_index}"])\n    \n    # Concatenate the scores to the main dataframe\n    feature_scores = pd.concat([feature_scores, df_scores],axis=1)\n\n# Get the sum of the scores for each feature\nfeature_scores[\'total\'] = feature_scores.sum(axis=1)\n\n# Get the top k features in a l

In [6]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification

# Assuming you have features X and labels Y
# X, Y = make_classification()

ros = RandomOverSampler(sampling_strategy='minority', random_state=100)

X_train, Y_train = ros.fit_resample(X_train, Y_train)


# In[33]:


print(Y_test)


# In[34]:


X_train.values


# In[35]:



[[0 1 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 ...
 [0 1 0 0 0]
 [1 0 0 0 0]
 [0 0 1 0 0]]


array([[-1.10249223e-01, -7.67859947e-03, -4.91864438e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [-1.10249223e-01, -7.73736981e-03, -4.91864438e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [-1.10249223e-01, -7.76224074e-03, -4.91864438e-03, ...,
        -1.97262160e-02, -1.21190076e+00, -4.64315895e-02],
       ...,
       [-9.29714678e-02, -7.36430591e-03, -3.87394518e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [-8.68282658e-02, -7.36430591e-03, -3.87568593e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [ 1.61587463e-01, -7.46804833e-03,  1.06953862e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02]])

In [7]:
single_class_train = np.argmax(y_train_multi, axis=1)
single_class_test = np.argmax(y_test_multi, axis=1)


df1 = X_train_multi.assign(Label = single_class_train)
df2 =  X_test_multi.assign(Label = single_class_test)

frames = [df1,  df2]

df = pd.concat(frames,ignore_index=True)

feature_selection = [
                    'dst_host_same_srv_rate',
                    'dst_host_srv_count',
                    'dst_host_same_src_port_rate',
                    'logged_in',
                    'dst_host_serror_rate',
                    'count',
                    'srv_count',
                    'dst_host_rerror_rate',
                    'Label'
                    ]

df_og = df
df = df[feature_selection]

# df.pop('dst host same srv rate')
# df.pop('dst host srv count')
# df.pop('dst host same src port rate')
# df.pop('logged in')
# df.pop('dst host serror rate')
# df.pop('count')
# df.pop('srv count')
# df.pop('dst host rerror rate')




In [8]:

y = df.pop('Label')
X = df

y1, y2 = pd.factorize(y)

y_0 = pd.DataFrame(y1)
y_1 = pd.DataFrame(y1)
y_2 = pd.DataFrame(y1)
y_3 = pd.DataFrame(y1)
y_4 = pd.DataFrame(y1)


# y_0 = y_0.replace(0, 0)
# y_0 = y_0.replace(1, 1)
y_0 = y_0.replace(2, 1)
y_0 = y_0.replace(3, 1)
y_0 = y_0.replace(4, 1)


y_1 = y_1.replace(1, 999)
y_1 = y_1.replace(0, 1)
# y_1 = y_1.replace(1, 0)
y_1 = y_1.replace(2, 1)
y_1 = y_1.replace(3, 1)
y_1 = y_1.replace(4, 1)
y_1 = y_1.replace(999, 1)


y_2 = y_2.replace(0, 1)
y_2 = y_2.replace(1, 1)
y_2 = y_2.replace(2, 0)
y_2 = y_2.replace(3, 1)
y_2 = y_2.replace(4, 1)


y_3 = y_3.replace(0, 1)
# y_3 = y_3.replace(1, 1)
y_3 = y_3.replace(2, 1)
y_3 = y_3.replace(3, 0)
y_3 = y_3.replace(4, 1)


y_4 = y_4.replace(0, 1)
# y_4 = y_4.replace(1, 1)
y_4 = y_4.replace(2, 1)
y_4 = y_4.replace(3, 1)
y_4 = y_4.replace(4, 0)



df = df.assign(Label = y)

In [9]:
#Divide the dataset between level 00 and level 01
import sklearn
from sklearn.model_selection import train_test_split
split = 0.5 # 0.7

# X_00,X_01, y_00, y_01 = sklearn.model_selection.train_test_split(X, y, train_size=split)
X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=split)

In [10]:
from collections import Counter

label_counts2 = Counter(y)
print(label_counts2)


Counter({0: 77054, 1: 53387, 2: 14077, 3: 3880, 4: 119})


In [11]:
#Base learner Split
# split = 0.7

# X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_00, y_00, train_size=split)

In [12]:
X_train

Unnamed: 0,dst_host_same_srv_rate,dst_host_srv_count,dst_host_same_src_port_rate,logged_in,dst_host_serror_rate,count,srv_count,dst_host_rerror_rate
113397,1.066401,0.807092,-0.383108,-0.809262,-0.639532,-0.559851,-0.106530,2.548205
135078,0.898090,1.022079,-0.431856,1.123125,-0.358118,-0.544813,-0.248420,-0.602719
106474,-0.826915,-0.692425,-0.447834,-0.809262,1.608759,-0.097002,-0.120298,-0.387635
110039,-1.138756,-1.026654,-0.480197,-0.809262,1.608759,0.287250,-0.354343,-0.387635
112503,1.066401,1.258754,-0.480197,1.235694,-0.639532,-0.708312,-0.078996,-0.387635
...,...,...,...,...,...,...,...,...
87261,1.066401,1.258754,-0.415471,1.235694,-0.639532,-0.647181,-0.216669,-0.387635
102882,1.066401,-1.035688,2.756092,1.235694,-0.639532,-0.717045,-0.354343,-0.387635
15251,-1.161030,-1.035688,-0.480197,-0.809262,-0.279805,3.710587,-0.368110,2.352482
47685,-0.002766,0.147666,1.202673,-0.809262,-0.639532,3.728053,6.653245,-0.387635


In [13]:
y_train

113397    0
135078    0
106474    1
110039    1
112503    0
         ..
87261     0
102882    0
15251     2
47685     1
17342     2
Name: Label, Length: 74258, dtype: int64

## LEVEL 0 - Weak models - Base Learner

In [14]:
with open(output_file_name, "a") as f: print('------------START of WEAK LEARNERS (BASE MODELS) - STACK 00 -----------------', file = f)

#Defining Basemodels


print('---------------------------------------------------------------------------------')
print('Defining RF Model')
print('---------------------------------------------------------------------------------')
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)
#------------------------------------------------------------------------------


print('---------------------------------------------------------------------------------')
print('Defining ADA Model')
print('---------------------------------------------------------------------------------')
#ADA
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
import time
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)


print('---------------------------------------------------------------------------------')
print('Defining LGBM Model')
print('---------------------------------------------------------------------------------')
#LGBM
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()



#KNN
print('---------------------------------------------------------------------------------')
print('Defining KNN Model')
print('---------------------------------------------------------------------------------')
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier(n_neighbors = 5)


#SVM
print('---------------------------------------------------------------------------------')
print('Defining SVM Model')
print('---------------------------------------------------------------------------------')

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

# Instantiate the SGDClassifier with additional hyperparameters
clf = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)


#MLP
print('---------------------------------------------------------------------------------')
print('Defining MLP Model')
print('---------------------------------------------------------------------------------')


from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
import time

# create MLPClassifier instance
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)


#DNN
print('---------------------------------------------------------------------------------')
print('Defining DNN Model')
print('---------------------------------------------------------------------------------')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# #Model Parameters
# dropout_rate = 0.01
# nodes = 70
# out_layer = 5
# optimizer='adam'
# loss='sparse_categorical_crossentropy'
# epochs=1
# batch_size=2*256

#Model Parameters
dropout_rate = 0.2
nodes = 3
out_layer = 5
optimizer='adam'
loss='sparse_categorical_crossentropy'
epochs=100
batch_size=128


num_columns = X_train.shape[1]

dnn = tf.keras.Sequential()

# Input layer
dnn.add(tf.keras.Input(shape=(num_columns,)))

# Dense layers with dropout
dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

# Output layer
dnn.add(tf.keras.layers.Dense(out_layer))



dnn.compile(optimizer=optimizer, loss=loss,metrics=['accuracy'])

dnn.summary()



# dnn = Sequential()
# dnn.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # Input layer
# dnn.add(Dense(64, activation='relu'))  # Hidden layer
# dnn.add(Dense(5))  # Output layer

# dnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# # summary of model layers
# dnn.summary()

---------------------------------------------------------------------------------
Defining RF Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining ADA Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining LGBM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining KNN Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining SVM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Def

In [15]:
# #SVM
# # Wrap SGDClassifier with MultiOutputClassifier
# multi_target_clf = MultiOutputClassifier(clf)

# # Fit the model on the training data
# multi_target_clf.fit(X_train, y_train)

# Make predictions on the test data
# y_pred = clf.predict(X_test)



In [16]:
#Training Basemodels
import joblib
from sklearn.model_selection import StratifiedKFold, cross_val_score
n_splits = 5  # You can adjust the number of folds as needed



print('---------------------------------------------------------------------------------')
print('Training Model')
with open(output_file_name, "a") as f: print('Training weak models - level 0', file = f)

print('---------------------------------------------------------------------------------')

if use_model_ada == 1 and load_model_ada == 0:

    print('---------------------------------------------------------------------------------')
    print('Training ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA


    start = time.time()
    ada = abc.fit(X_train, y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(ada, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)

    # Assuming 'model' is your trained model
    joblib.dump(ada, 'ada_base_model.joblib')


if use_model_rf == 1 and load_model_rf == 0:

    print('---------------------------------------------------------------------------------')
    print('Training RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Training RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    model_rf = rf.fit(X_train,y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(model_rf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(model_rf, 'rf_base_model.joblib')

if use_model_svm == 1 and load_model_svm == 0:

    print('---------------------------------------------------------------------------------')
    print('Training SVM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training SVM', file = f)
    print('---------------------------------------------------------------------------------')
    #SVM

    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    # clf.score(X_train, y_train)
    time_taken = end - start

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(clf, 'svm_base_model.joblib')


if use_model_knn == 1 and load_model_knn == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training KNN', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    knn_clf.fit(X_train,y_train)
    end = time.time()


    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(knn_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(knn_clf, 'knn_base_model.joblib')


if use_model_lgbm == 1 and load_model_lgbm == 0:


    print('---------------------------------------------------------------------------------')
    print('Training LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    lgbm.fit(X_train, y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(lgbm, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(lgbm, 'lgbm_base_model.joblib')

if use_model_mlp == 1 and load_model_mlp == 0:


    print('---------------------------------------------------------------------------------')
    print('Training MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training MLP', file = f)
    print('---------------------------------------------------------------------------------')

    start = time.time()
    MLP = mlp.fit(X_train, y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(MLP, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(MLP, 'mlp_base_model.joblib')


if use_model_dnn == 1 and load_model_dnn == 0:
    from keras.callbacks import EarlyStopping

    # Define EarlyStopping callback
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
    print('---------------------------------------------------------------------------------')
    print('Training DNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training DNN', file = f)
    print('---------------------------------------------------------------------------------')
    # Convert Y_test back to its original format
    # y_test = np.argmax(Y_test, axis=1)

    # Start the timer
    start = time.time()
    # dnn.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    dnn.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[early_stopping])

    # End the timer
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(dnn, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    dnn.save("DNN_base_model.h5")

    # Calculate the time taken and print it out
    # print(f'Time taken for training: {time_taken} seconds')


with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)



---------------------------------------------------------------------------------
Training Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training ADA
---------------------------------------------------------------------------------


Cross-validation scores: [0.73990035 0.76521681 0.75680043 0.57598815 0.75106053]
Mean accuracy: 0.7177932540894639
---------------------------------------------------------------------------------
Training RF
---------------------------------------------------------------------------------
Cross-validation scores: [0.9107191  0.91698088 0.92041476 0.92296815 0.9143492 ]
Mean accuracy: 0.9170864168778934
---------------------------------------------------------------------------------
Training SVM
---------------------------------------------------------------------------------
Cross-validation scores: [0.85974953 0.86978185 0.86385672 0.86472291 0.86364555]
Mean accuracy: 0.8643513111831409
---------------------------------------------------------------------------------
Training KNN
---------------------------------------------------------------------------------
Cross-validation scores: [0.97333692 0.97326959 0.97225963 0.97299845 0.97320046]
Mean accuracy: 0.9730130110275945
------

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Cross-validation scores: [0.96397792 0.96613251 0.96404525 0.9643795  0.96128207]
Mean accuracy: 0.9639634481762942
---------------------------------------------------------------------------------
Training DNN
---------------------------------------------------------------------------------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [17]:
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedKFold

# # Define your Keras model as a function
# def create_model(optimizer='adam', hidden_layer_size=16):
#     # model = Sequential()
#     # model.add(Dense(hidden_layer_size, input_dim=input_size, activation='relu'))
#     # model.add(Dense(1, activation='sigmoid'))
#     # model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        
#     dnn = tf.keras.Sequential()

#     # Input layer
#     dnn.add(tf.keras.Input(shape=(num_columns,)))

#     # Dense layers with dropout
#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     # Output layer
#     dnn.add(tf.keras.layers.Dense(out_layer))



#     dnn.compile(optimizer=optimizer, loss=loss)

#     dnn.summary()
#     return dnn

# # Create a KerasClassifier
# dnn = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'optimizer': ['adam', 'sgd'],
#     'hidden_layer_size': [8, 16, 32]
# }

# # Create the StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Create GridSearchCV
# grid = GridSearchCV(estimator=dnn, param_grid=param_grid, cv=cv, scoring='accuracy')
# grid_result = grid.fit(X_train, y_train)

# # Print the best parameters and best accuracy
# print("Best Parameters: ", grid_result.best_params_)
# print("Best Accuracy: ", grid_result.best_score_)



In [18]:
# stratified_kfold

In [19]:
# Loading Models
from tensorflow.keras.models import load_model

if load_model_ada == 1:
    ada = joblib.load('ada_base_model.joblib')

if load_model_svm == 1:
    clf =  joblib.load('svm_base_model.joblib')

if load_model_dnn == 1:
    dnn = load_model("DNN_base_model.h5")

if load_model_knn == 1:
    knn_clf = joblib.load('knn_base_model.joblib')

if load_model_mlp == 1:
    MLP = joblib.load('mlp_base_model.joblib')

if load_model_rf == 1:
    rf = joblib.load('rf_base_model.joblib')

if load_model_lgbm == 1:
    lgbm = joblib.load('lgbm_base_model.joblib')







In [20]:
# Make predictions on the test data
# preds_svm = clf.predict(X_test)



# y_scores = y_pred
# y_true = y_test



### Base leaners predictions

In [21]:
from sklearn.calibration import CalibratedClassifierCV
with open(output_file_name, "a") as f: print('Generating Predictions', file = f)

if use_model_rf == 1:

    print('---------------------------------------------------------------------------------')
    print('Prediction RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    preds_rf = rf.predict(X_test)
    preds_rf_prob = rf.predict_proba(X_test)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_svm == 1:

    print('Prediction SVM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction SVM', file = f)
    print('---------------------------------------------------------------------------------')
    #SVM
    start = time.time()
    preds_svm = clf.predict(X_test)
    # preds_svm_prob = clf.predict_proba(X_test)

    #Since SVM does not deal with prob by nature we use a meta learner
    # https://stackoverflow.com/questions/55250963/how-to-get-probabilities-for-sgdclassifier-linearsvm

    model = CalibratedClassifierCV(clf)

    model.fit(X, y)
    preds_svm_prob = model.predict_proba(X)

    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_lgbm == 1:

    print('Prediction LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    #LGBM
    start = time.time()
    preds_lgbm = lgbm.predict(X_test)
    preds_lgbm_prob = lgbm.predict_proba(X_test)

    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_dnn == 1:

    print('Prediction DNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction DNN', file = f)
    print('---------------------------------------------------------------------------------')
    #DNN
    start = time.time()
    pred_dnn = dnn.predict(X_test)
    preds_dnn_prob = pred_dnn
    preds_dnn = np.argmax(pred_dnn,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_ada == 1:

    print('Prediction ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA
    start = time.time()
    preds_ada = ada.predict(X_test)
    preds_ada_prob = ada.predict_proba(X_test)

    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    print('Prediction MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction MLP', file = f)
    print('---------------------------------------------------------------------------------')

if use_model_mlp == 1:

    #MLP
    start = time.time()
    y_pred = MLP.predict_proba(X_test)
    preds_mlp_prob = y_pred
    preds_mlp = np.argmax(y_pred,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    print('Prediction KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction KNN', file = f)
    print('---------------------------------------------------------------------------------')

if use_model_knn == 1:

    #KNN
    start = time.time()
    preds_knn =knn_clf.predict(X_test)
    preds_knn_prob =knn_clf.predict_proba(X_test)

    preds_knn
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)


---------------------------------------------------------------------------------
Prediction RF
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction SVM
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
Prediction LGBM
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction DNN
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction ADA
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction MLP
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction KNN
---------------------------------------------------------------------------------


In [22]:
# from sklearn.calibration import CalibratedClassifierCV
# model = CalibratedClassifierCV(clf)

# model.fit(X, y)
# preds_svm_prob = model.predict_proba(X)

# print(preds_ada_prob)
# print(preds_knn_prob)
# print(preds_dnn_prob)
# print(preds_mlp_prob)
# print(preds_rf_prob)
# print(preds_svm_prob)


In [23]:
print(preds_svm_prob)
preds_3 = np.argmax(preds_svm_prob,axis = 1)
print(preds_3)

print(preds_svm)
# print(y_train)

[[7.86755451e-01 9.15808640e-02 7.78774782e-02 4.31012582e-02
  6.84948135e-04]
 [2.62305098e-01 2.71329463e-02 6.45857927e-01 6.23936481e-02
  2.31038051e-03]
 [1.69951428e-02 9.10446456e-01 5.10282199e-02 2.14799179e-02
  5.02632551e-05]
 ...
 [9.42972082e-01 3.31526500e-02 1.34223706e-03 2.24056309e-02
  1.27400343e-04]
 [9.12741440e-01 3.97164126e-02 1.80198033e-02 2.95111286e-02
  1.12159303e-05]
 [5.57736620e-01 3.72736907e-01 5.55977328e-02 1.35376604e-02
  3.91079556e-04]]
[0 2 1 ... 0 0 0]
[2 0 1 ... 1 0 0]


### METRICS - Base Learners

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score



# >>> 
# >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])
# 0.99...
# >>> roc_auc_score(y, clf.decision_function(X))

In [25]:
import catboost

cat_00 = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')

# Fit the model
cat_00.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=10)

# Make predictions on the test set
preds_cat = cat_00.predict(X_test)
preds_cat_prob = cat_00.predict_proba(X_test)
preds_cat = np.squeeze(preds_cat)


if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Catboost base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_cat
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    cat_acc_00 = Acc
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    cat_acc_00 = Acc
    cat_pre_00 = Precision
    cat_rec_00 = Recall
    cat_f1_00 = F1
    cat_bacc_00 = BACC
    cat_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



0:	learn: 1.3162994	test: 1.3152727	best: 1.3152727 (0)	total: 73.6ms	remaining: 7.29s
10:	learn: 0.4762946	test: 0.4729843	best: 0.4729843 (10)	total: 273ms	remaining: 2.21s
20:	learn: 0.2779608	test: 0.2743050	best: 0.2743050 (20)	total: 430ms	remaining: 1.62s


30:	learn: 0.1976391	test: 0.1945537	best: 0.1945537 (30)	total: 531ms	remaining: 1.18s
40:	learn: 0.1608685	test: 0.1584695	best: 0.1584695 (40)	total: 613ms	remaining: 883ms
50:	learn: 0.1386506	test: 0.1365561	best: 0.1365561 (50)	total: 692ms	remaining: 665ms
60:	learn: 0.1253986	test: 0.1236924	best: 0.1236924 (60)	total: 772ms	remaining: 494ms
70:	learn: 0.1154201	test: 0.1142025	best: 0.1142025 (70)	total: 850ms	remaining: 347ms
80:	learn: 0.1086751	test: 0.1077649	best: 0.1077649 (80)	total: 931ms	remaining: 218ms
90:	learn: 0.1029298	test: 0.1022821	best: 0.1022821 (90)	total: 1.01s	remaining: 99.7ms
99:	learn: 0.0978748	test: 0.0973233	best: 0.0973233 (99)	total: 1.08s	remaining: 0us

bestTest = 0.09732327488
bestIteration = 99

---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3    4
0  37691.0    284.0   288.0  

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [26]:
import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# Assuming you have your features and labels as X and y
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': 5,  # specify the number of classes
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # metric for multi-class classification
}

# Train the XGBoost model
num_round = 100
xgb_00 = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
preds_xgb = xgb_00.predict(dtest)
# preds_xgb_prob = xgb_00.predict_proba(dtest)


# Get class probabilities
# Assuming binary classification, get the probability for the positive class (class 1)
preds_xgb_margin = xgb_00.predict(dtest, output_margin=True)
preds_xgb_prob = 1 / (1 + np.exp(-preds_xgb_margin))

# Print or use positive_class_probabilities as needed
# print(positive_class_probabilities)


# Convert predicted probabilities to class labels (if necessary)
# y_pred_labels = [round(value) for value in y_pred]

# Evaluate the accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))


In [27]:

if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('xgboost base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_xgb
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    xgb_acc_00 = Acc
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)
    xgb_acc_00 = Acc
    xgb_pre_00 = Precision
    xgb_rec_00 = Recall
    xgb_f1_00 = F1
    xgb_bacc_00 = BACC
    xgb_mcc_00 = MCC
    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0      1.0     2.0     3.0  4.0
0.0  37487.0    361.0   403.0   322.0  0.0
1.0    795.0  25689.0   151.0    32.0  0.0
2.0    347.0    132.0  6519.0    63.0  0.0
3.0    631.0     14.0    31.0  1225.0  0.0
4.0     31.0      3.0     2.0    21.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Accuracy total:  0.9550357532420313
Precision total:  0.7177268205303429
Recall total:  0.7005618139133347
F1 total:  0.7084949959099308
BACC total:  0.7005618139133347
MCC total:  0.9237570309178226


#### RF

In [28]:
# y_test
# pred_label

In [29]:
#RF
if use_model_rf == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('RF base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_rf
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    rf_acc_00 = Acc
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    rf_acc_00 = Acc
    rf_pre_00 = Precision
    rf_rec_00 = Recall
    rf_f1_00 = F1
    rf_bacc_00 = BACC
    rf_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2     3    4
0  38192.0    187.0   191.0   3.0  0.0
1   1798.0  24763.0   106.0   0.0  0.0
2   1161.0    355.0  5545.0   0.0  0.0
3   1641.0     37.0   169.0  54.0  0.0
4     48.0      2.0     4.0   3.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9231742953716048
Precision total:  0.7380881443061241
Recall total:  0.5464858303636617
F1 total:  0.5587285632916946
BACC total:  0.5464858303636617
MCC total:  0.8693646979033856


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [30]:
#DNN
if use_model_dnn == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('DNN base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_dnn
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    dnn_acc_00 = Acc

    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    dnn_acc_00 = Acc
    dnn_pre_00 = Precision
    dnn_rec_00 = Recall
    dnn_f1_00 = F1
    dnn_bacc_00 = BACC
    dnn_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0       1      2    3    4
0  34876.0  2961.0  736.0  0.0  0.0
1  21282.0  5366.0   19.0  0.0  0.0
2   4503.0  2437.0  121.0  0.0  0.0
3   1331.0   464.0  106.0  0.0  0.0
4     33.0    12.0   12.0  0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.5435435435435435
Precision total:  0.23228438339215812
Recall total:  0.224502924856971
F1 total:  0.2013062027461246
BACC total:  0.224502924856971
MCC total:  0.1329918769284351


In [31]:
#ADA
if use_model_ada == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('ADA base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_ada
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    ada_acc_00 = Acc

    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)


    ada_acc_00 = Acc
    ada_pre_00 = Precision
    ada_rec_00 = Recall
    ada_f1_00 = F1
    ada_bacc_00 = BACC
    ada_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2      3     4
0  30527.0   5357.0  1791.0  882.0  16.0
1    708.0  19208.0  6530.0  219.0   2.0
2    775.0    441.0  5739.0  106.0   0.0
3    963.0     48.0    14.0  864.0  12.0
4     25.0      6.0     1.0   23.0   2.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.7586959156465883
Precision total:  0.5148897595750748
Recall total:  0.5628118482439187
F1 total:  0.5232524495141136
BACC total:  0.5628118482439187
MCC total:  0.6247111151222441


In [32]:
#SVM
if use_model_svm == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('SVM base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_svm
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    svm_acc_00 = Acc
    svm_pre_00 = Precision
    svm_rec_00 = Recall
    svm_f1_00 = F1
    svm_bacc_00 = BACC
    svm_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2    3    4
0  36019.0    910.0  1635.0  9.0  0.0
1   2131.0  24339.0   197.0  0.0  0.0
2   1231.0   1869.0  3961.0  0.0  0.0
3   1622.0     82.0   197.0  0.0  0.0
4     42.0      7.0     8.0  0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8661441710769066
Precision total:  0.4865043558587877
Recall total:  0.48149153505386855
F1 total:  0.4829959148201005
BACC total:  0.48149153505386855
MCC total:  0.768422624124082


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [33]:
#KNN
if use_model_knn == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('KNN base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_knn
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)


    knn_acc_00 = Acc
    knn_pre_00 = Precision
    knn_rec_00 = Recall
    knn_f1_00 = F1
    knn_bacc_00 = BACC
    knn_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3    4
0  37684.0    306.0   245.0   330.0  8.0
1    215.0  26399.0    26.0    27.0  0.0
2    300.0     44.0  6683.0    34.0  0.0
3    267.0      6.0    22.0  1606.0  0.0
4     36.0      3.0     0.0    16.0  2.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.974615871476858
Precision total:  0.784230172592528
Recall total:  0.7586551316929457
F1 total:  0.7597319520860626
BACC total:  0.7586551316929457
MCC total:  0.957101590350768


In [34]:
#MLP
if use_model_mlp == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('MLP base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_mlp
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    mlp_acc_00 = Acc
    mlp_pre_00 = Precision
    mlp_rec_00 = Recall
    mlp_f1_00 = F1
    mlp_bacc_00 = BACC
    mlp_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3    4
0  37658.0    346.0   270.0   299.0  0.0
1    351.0  26196.0    90.0    30.0  0.0
2    473.0     60.0  6504.0    24.0  0.0
3    429.0     34.0    38.0  1400.0  0.0
4     37.0      2.0     1.0    16.0  1.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9663340470515358
Precision total:  0.9367785393943964
Recall total:  0.72674616229928
F1 total:  0.7366784048823265
BACC total:  0.72674616229928
MCC total:  0.9429046243180847


In [35]:
#lgbm

if use_model_lgbm == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('LGBM base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_lgbm
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)
    lgbm_acc_00 = Acc
    lgbm_pre_00 = Precision
    lgbm_rec_00 = Recall
    lgbm_f1_00 = F1
    lgbm_bacc_00 = BACC
    lgbm_mcc_00 = MCC
    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3     4
0  37851.0    220.0   181.0   274.0  47.0
1    149.0  26468.0    24.0    18.0   8.0
2    172.0     19.0  6841.0    22.0   7.0
3    195.0      1.0    13.0  1682.0  10.0
4     34.0      0.0     0.0    16.0   7.0


---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9810124025370662
Precision total:  0.7740801027644095
Recall total:  0.7900534539974579
F1 total:  0.7813739805540139
BACC total:  0.7900534539974579
MCC total:  0.9679610450939468


## Training the stronger model - STACK level 01

In [36]:
print(len(preds_dnn_prob), len(y_test))

74259 74259


In [37]:
print(y_test)

123264    2
56743     0
88220     1
36372     1
109357    1
         ..
117763    1
25489     0
89481     1
13032     0
87389     0
Name: Label, Length: 74259, dtype: int64


In [38]:
df_from_series = y_test.to_frame()
y_test_reset_index = df_from_series.reset_index()
# y_test2 = y_test.reset_index(inplace=True)
print(y_test_reset_index)
y_test_reset_index.pop('index')

        index  Label
0      123264      2
1       56743      0
2       88220      1
3       36372      1
4      109357      1
...       ...    ...
74254  117763      1
74255   25489      0
74256   89481      1
74257   13032      0
74258   87389      0

[74259 rows x 2 columns]


0        123264
1         56743
2         88220
3         36372
4        109357
          ...  
74254    117763
74255     25489
74256     89481
74257     13032
74258     87389
Name: index, Length: 74259, dtype: int64

In [39]:
y_test_reset_index.values[0][0]

2

In [40]:
preds_dnn_2 = []
preds_svm_2 = []
preds_rf_2 = []
preds_mlp_2 = []
preds_ada_2 = []
preds_knn_2 = []
preds_lgbm_2 = []
preds_cat_2 = []
preds_xgb_2 = []

for i in range(0,len(preds_dnn_prob)):  
    # print(i)
    # print(preds_dnn_prob[i][y_test_reset_index.values[i][0]])
    preds_dnn_2.append(preds_dnn_prob[i][y_test_reset_index.values[i][0]])
    preds_svm_2.append(preds_svm_prob[i][y_test_reset_index.values[i][0]])
    preds_rf_2.append(preds_rf_prob[i][y_test_reset_index.values[i][0]])
    preds_mlp_2.append(preds_mlp_prob[i][y_test_reset_index.values[i][0]])
    preds_ada_2.append(preds_ada_prob[i][y_test_reset_index.values[i][0]])
    preds_knn_2.append(preds_knn_prob[i][y_test_reset_index.values[i][0]])
    preds_lgbm_2.append(preds_lgbm_prob[i][y_test_reset_index.values[i][0]])
    preds_cat_2.append(preds_cat_prob[i][y_test_reset_index.values[i][0]])
    preds_xgb_2.append(preds_xgb_prob[i][y_test_reset_index.values[i][0]])

    

In [41]:
with open(output_file_name, "a") as f: print('------------------------------------------------------------------', file = f)
with open(output_file_name, "a") as f: print('------------------------------------------------------------------', file = f)
with open(output_file_name, "a") as f: print('------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('------------START of STRONGER LEARNER - STACK 01 -----------------', file = f)


# Stack the vectors horizontally to create a matrix
column_features = ['dnn','rf','lgbm','ada','knn','mlp','svm','cat','xgb','label']
training_matrix = np.column_stack((
                          preds_dnn_2,
                          preds_rf_2,
                          preds_lgbm_2,
                          preds_ada_2,
                          preds_knn_2, 
                          preds_mlp_2,
                          preds_svm_2,
                          preds_cat_2,
                          preds_xgb_2,
                          y_test
                          ))

# Print the resulting matrix
print(training_matrix)

[[0.05081991 0.79070027 0.99919391 ... 0.99399468 0.96087444 2.        ]
 [0.20403558 0.98587062 0.99986679 ... 0.99796718 0.99363387 0.        ]
 [0.18833527 0.99344781 0.9999872  ... 0.99824577 0.99533552 1.        ]
 ...
 [0.01682695 0.9315379  0.99981855 ... 0.99506191 0.98959243 1.        ]
 [0.29020634 0.88943441 0.99942833 ... 0.98743547 0.93419242 0.        ]
 [0.18264192 0.98510389 0.99979137 ... 0.99588572 0.99194461 0.        ]]


In [42]:
df_level_01 = pd.DataFrame(training_matrix, columns=column_features)

In [43]:

# Assuming df is your DataFrame
df_level_01.to_csv('models7dataset_prob.csv', index=False)


In [44]:
y_01 = df_level_01.pop('label')
X_01 = df_level_01
df_level_01 = df_level_01.assign(label = y_01)

In [45]:
X_01

Unnamed: 0,dnn,rf,lgbm,ada,knn,mlp,svm,cat,xgb
0,0.050820,0.790700,0.999194,0.319236,1.0,0.999794,0.077877,0.993995,0.960874
1,0.204036,0.985871,0.999867,0.263216,1.0,0.999937,0.262305,0.997967,0.993634
2,0.188335,0.993448,0.999987,0.269991,1.0,1.000000,0.910446,0.998246,0.995336
3,0.243874,0.993448,0.999994,0.268088,1.0,0.999999,0.017458,0.998620,0.995455
4,0.236328,0.938978,0.999887,0.268088,1.0,1.000000,0.022950,0.997787,0.981621
...,...,...,...,...,...,...,...,...,...
74254,0.263535,0.993448,0.999996,0.268088,1.0,0.999997,0.014842,0.998767,0.995455
74255,0.172091,0.822487,0.995918,0.236307,1.0,0.999989,0.007036,0.973484,0.945671
74256,0.016827,0.931538,0.999819,0.245178,1.0,0.999938,0.913073,0.995062,0.989592
74257,0.290206,0.889434,0.999428,0.243277,1.0,1.000000,0.902577,0.987435,0.934192


In [46]:
y_01

0        2.0
1        0.0
2        1.0
3        1.0
4        1.0
        ... 
74254    1.0
74255    0.0
74256    1.0
74257    0.0
74258    0.0
Name: label, Length: 74259, dtype: float64

In [47]:
df_level_01

Unnamed: 0,dnn,rf,lgbm,ada,knn,mlp,svm,cat,xgb,label
0,0.050820,0.790700,0.999194,0.319236,1.0,0.999794,0.077877,0.993995,0.960874,2.0
1,0.204036,0.985871,0.999867,0.263216,1.0,0.999937,0.262305,0.997967,0.993634,0.0
2,0.188335,0.993448,0.999987,0.269991,1.0,1.000000,0.910446,0.998246,0.995336,1.0
3,0.243874,0.993448,0.999994,0.268088,1.0,0.999999,0.017458,0.998620,0.995455,1.0
4,0.236328,0.938978,0.999887,0.268088,1.0,1.000000,0.022950,0.997787,0.981621,1.0
...,...,...,...,...,...,...,...,...,...,...
74254,0.263535,0.993448,0.999996,0.268088,1.0,0.999997,0.014842,0.998767,0.995455,1.0
74255,0.172091,0.822487,0.995918,0.236307,1.0,0.999989,0.007036,0.973484,0.945671,0.0
74256,0.016827,0.931538,0.999819,0.245178,1.0,0.999938,0.913073,0.995062,0.989592,1.0
74257,0.290206,0.889434,0.999428,0.243277,1.0,1.000000,0.902577,0.987435,0.934192,0.0


In [48]:
split = 0.7

X_train_01,X_test_01, y_train_01, y_test_01 = sklearn.model_selection.train_test_split(X_01, y_01, train_size=split)

In [49]:
# from keras.callbacks import EarlyStopping

# # Define EarlyStopping callback
# early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

# # Compile the model
# # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# # Train the model with EarlyStopping callback
# model.fit(x_train, Y_train, epochs=100, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# # Save the trained model
# # model.save("CNN_CIC_1.h5")
# model = load_model("CNN_CIC_1.h5")

In [50]:
print('---------------------------------------------------------------------------------')
print('Defining DNN Model')
print('---------------------------------------------------------------------------------')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#Model Parameters
dropout_rate = 0.2
nodes = 3
out_layer = 5
optimizer='adam'
loss='sparse_categorical_crossentropy'
epochs=100
batch_size=128


num_columns = X_train_01.shape[1]

dnn_01 = tf.keras.Sequential()

# Input layer
dnn_01.add(tf.keras.Input(shape=(num_columns,)))

# # Dense layers with dropout
# dnn_01.add(tf.keras.layers.Dense(nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn_01.add(tf.keras.layers.Dense(2*nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn_01.add(tf.keras.layers.Dense(3*nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn_01.add(tf.keras.layers.Dense(2*nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn.add(tf.keras.layers.Dense(nodes))
# dnn.add(tf.keras.layers.Dropout(dropout_rate))



# Dense layers with dropout
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

# Output layer
# dnn_01.add(tf.keras.layers.Dense(out_layer))

dnn_01.add(tf.keras.layers.Dense(out_layer, activation='softmax'))


dnn_01.compile(optimizer=optimizer, loss=loss,metrics=['accuracy'])

dnn_01.summary()

---------------------------------------------------------------------------------
Defining DNN Model
---------------------------------------------------------------------------------
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 3)                 30        
_________________________________________________________________
dropout_5 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 12        
_________________________________________________________________
dropout_6 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 12        
_________________________________________________________________
dro

In [51]:
#DNN
from keras.callbacks import EarlyStopping

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

print('---------------------------------------------------------------------------------')
print('Training DNN')
with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('Training DNN', file = f)
print('---------------------------------------------------------------------------------')
# Convert Y_test back to its original format
# y_test = np.argmax(Y_test, axis=1)

# Start the timer
start = time.time()
# dnn_01.fit(X_train_01, y_train_01, epochs=epochs, batch_size=batch_size)
dnn_01.fit(X_train_01, y_train_01, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[early_stopping])

# model.fit(x_train, Y_train, epochs=100, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# End the timer
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
# joblib.dump(dnn_01, 'dnn_level_01.joblib')
dnn_01.save("dnn_level_01.h5")

# Calculate the time taken and print it out
# print(f'Time taken for training: {time_taken} seconds')


---------------------------------------------------------------------------------
Training DNN
---------------------------------------------------------------------------------
Epoch 1/100

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100


In [52]:
dnn_01 = load_model("dnn_level_01.h5")


In [53]:
#DNN
start = time.time()
pred_dnn = dnn_01.predict(X_test_01)
preds_dnn_01 = np.argmax(pred_dnn,axis = 1)
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)

In [54]:
# y_test = y_test_01

In [55]:
#----------------------------------------------------------------
with open(output_file_name, "a") as f: print('Stack model - Strong learner - level 01', file = f)
with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)

In [56]:

print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')
with open(output_file_name, "a") as f: print('DNN', file = f)
pred_label = preds_dnn_01

# pred_label = ypred
#pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')

# Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# Precision = PRECISION(TP_total, FP_total)
# Recall = RECALL(TP_total, FN_total)
# F1 = F1(Recall,Precision)
# BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# MCC = MCC(TP_total,TN_total, FP_total, FN_total)


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)


dnn_acc_01 = Acc
dnn_pre_01 = Precision
dnn_rec_01 = Recall
dnn_f1_01 = F1
dnn_bacc_01 = BACC
dnn_mcc_01 = MCC

# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0    2.0  3.0  4.0
0.0  11243.0   308.0    0.0  0.0  0.0
1.0   7711.0   306.0    1.0  0.0  0.0
2.0    298.0  1814.0    3.0  0.0  0.0
3.0      0.0    58.0  514.0  0.0  0.0
4.0      0.0    10.0   12.0  0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.6091659933566748
Precision total:  0.4561130829802892
Recall total:  0.5459240511334371
F1 total:  0.48993146733213655
BACC total:  0.5459240511334371
MCC total:  0.39749208372924244


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [57]:
#SVM
print('---------------------------------------------------------------------------------')
print('Defining SVM Model')
print('---------------------------------------------------------------------------------')

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

# Instantiate the SGDClassifier with additional hyperparameters
clf = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)

#SVM
start = time.time()
clf.fit(X_train_01, y_train_01)
end = time.time()
clf.score(X_train_01, y_train_01)
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
joblib.dump(clf, 'svm_level_01.joblib')


clf = loaded_model = joblib.load('svm_level_01.joblib')


#SVM
start = time.time()
preds_svm_01 = clf.predict(X_test_01)
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
print('---------------------------------------------------------------------------------')



---------------------------------------------------------------------------------
Defining SVM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------


In [58]:
with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)
print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')
with open(output_file_name, "a") as f: print('SVM', file = f)
pred_label = preds_svm_01

# pred_label = ypred
#pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')

# Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# Precision = PRECISION(TP_total, FP_total)
# Recall = RECALL(TP_total, FN_total)
# F1 = F1(Recall,Precision)
# BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# MCC = MCC(TP_total,TN_total, FP_total, FN_total)


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)



svm_acc_01 = Acc
svm_pre_01 = Precision
svm_rec_01 = Recall
svm_f1_01 = F1
svm_bacc_01 = BACC
svm_mcc_01 = MCC

# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
        0.0     1.0     2.0    3.0  4.0
0.0  8617.0  2617.0   313.0    4.0  0.0
1.0  4902.0  2623.0   488.0    5.0  0.0
2.0   229.0    83.0  1778.0   25.0  0.0
3.0     0.0     0.0    72.0  500.0  0.0
4.0     3.0     0.0     0.0   19.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.6067869647185564
Precision total:  0.538852401791561
Recall total:  0.5575845535423665
F1 total:  0.5418726099415194
BACC total:  0.5575845535423665
MCC total:  0.33178215306482806


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [59]:

print('---------------------------------------------------------------------------------')
print('Defining RF Model')
print('---------------------------------------------------------------------------------')
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)
#------------------------------------------------------------------------------

if True == True:

    print('---------------------------------------------------------------------------------')
    print('Training RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Training RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    model_rf_01 = rf.fit(X_train_01,y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(model_rf_01, X_train_01, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(model_rf_01, 'rf_base_model_01.joblib')

if 1 == 1:
    model_rf_01  = joblib.load('rf_base_model_01.joblib')

if 1 == 1:

    print('---------------------------------------------------------------------------------')
    print('Prediction RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    preds_rf_01 = model_rf_01.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

    with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)
print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')
with open(output_file_name, "a") as f: print('RF', file = f)
pred_label = preds_rf_01

# pred_label = ypred
#pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')

# Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# Precision = PRECISION(TP_total, FP_total)
# Recall = RECALL(TP_total, FN_total)
# F1 = F1(Recall,Precision)
# BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# MCC = MCC(TP_total,TN_total, FP_total, FN_total)


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)

rf_acc_01 = Acc
rf_pre_01 = Precision
rf_rec_01 = Recall
rf_f1_01 = F1
rf_bacc_01 = BACC
rf_mcc_01 = MCC


# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)




---------------------------------------------------------------------------------
Defining RF Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training RF
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction RF
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0  4.0
0.0  11338.0    74.0   139.0    0.0  0.0
1.0    722.0  7090.0   206.0    0.0  0.0
2.0    401.0   151.0  1563.0    0.0  0.0
3.0      0.0     0.0     0.0  572.0  0.0
4.0      0.0     0.0     0.0   13.0  9.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9234222102522668
Precision total:  0.9352160549340095
Recall total:  0.8027836906899554
F1 total:  0.8431210275432454
BACC total:  0.8027836906899554
MCC total:  0.8708415232118788


In [60]:
rf_acc_01

0.9234222102522668

In [61]:
print('---------------------------------------------------------------------------------')
print('Defining LGBM Model')
print('---------------------------------------------------------------------------------')
#LGBM
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()



if 1 == 1 and 0 == 0:


    print('---------------------------------------------------------------------------------')
    print('Training LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    lgbm.fit(X_train_01, y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(lgbm, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(lgbm, 'lgbm_01.joblib')

if 1 == 1:
    lgbm = joblib.load('lgbm_01.joblib')


if 1 == 1:

    print('Prediction LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    #LGBM
    start = time.time()
    preds_lgbm_01 = lgbm.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')
    with open(output_file_name, "a") as f: print('LGBM', file = f)
    pred_label = preds_lgbm_01

    # pred_label = ypred
    #pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    lgbm_acc_01 = Acc
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    lgbm_acc_01 = Acc
    lgbm_pre_01 = Precision
    lgbm_rec_01 = Recall
    lgbm_f1_01 = F1
    lgbm_bacc_01 = BACC
    lgbm_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)





---------------------------------------------------------------------------------
Defining LGBM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training LGBM
---------------------------------------------------------------------------------


Prediction LGBM
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0   4.0
0.0  11518.0    20.0    13.0    0.0   0.0
1.0     30.0  7976.0    12.0    0.0   0.0
2.0      3.0    12.0  2100.0    0.0   0.0
3.0      0.0     0.0     0.0  572.0   0.0
4.0      0.0     0.0     0.0    0.0  22.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9959601400484783
Precision total:  0.9962764789229521
Recall total:  0.9969625383786198
F1 total:  0.996618329132178
BACC total:  0.9969625383786198
MCC total:  0.9931771619500335


In [62]:
# lgbm_acc_01

In [63]:

#MLP
print('---------------------------------------------------------------------------------')
print('Defining MLP Model')
print('---------------------------------------------------------------------------------')


from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
import time

# create MLPClassifier instance
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)

if 1 == 1 and 0 == 0:


    print('---------------------------------------------------------------------------------')
    print('Training MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training MLP', file = f)
    print('---------------------------------------------------------------------------------')

    start = time.time()
    MLP = mlp.fit(X_train_01, y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(MLP, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(MLP, 'mlp_01.joblib')

if 1 == 1:
    MLP = joblib.load('mlp_01.joblib')


if 1 == 1:

    #MLP
    start = time.time()
    y_pred = MLP.predict_proba(X_test_01)
    preds_mlp_01 = np.argmax(y_pred,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#MLP
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('MLP 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_mlp_01
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)


    mlp_acc_01 = Acc
    mlp_pre_01 = Precision
    mlp_rec_01 = Recall
    mlp_f1_01 = F1
    mlp_bacc_01 = BACC
    mlp_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)








---------------------------------------------------------------------------------
Defining MLP Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training MLP
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0       1       2      3     4
0  10937.0   536.0    78.0    0.0   0.0
1   1353.0  6507.0   158.0    0.0   0.0
2     51.0    62.0  2002.0    0.0   0.0
3      0.0     0.0     0.0  572.0   0.0
4      0.0     0.0     0.0    0.0  22.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8995421492054942
Precision total:  0.9393231012474725
Recall total:  0.9409931095579122
F1 total:  0.9391811063990051
BACC total:  0.9409931095579122
MCC total:  0.8312311815290724


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


In [64]:
# mlp_acc_01

In [65]:
print('---------------------------------------------------------------------------------')
print('Defining ADA Model')
print('---------------------------------------------------------------------------------')
#ADA
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
import time
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)

if 1 == 1 and 0 == 0:

    print('---------------------------------------------------------------------------------')
    print('Training ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA


    start = time.time()
    ada = abc.fit(X_train_01, y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(ada, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)

    # Assuming 'model' is your trained model
    joblib.dump(ada, 'ada_01.joblib')




if 1 == 1:
    ada = joblib.load('ada_01.joblib')


if 1 == 1:

    print('Prediction ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA
    start = time.time()
    preds_ada_01 = ada.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('ADA 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_ada_01
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    ada_acc_01 = Acc
    ada_pre_01 = Precision
    ada_rec_01 = Recall
    ada_f1_01 = F1
    ada_bacc_01 = BACC
    ada_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









---------------------------------------------------------------------------------
Defining ADA Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training ADA
---------------------------------------------------------------------------------


Prediction ADA
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0    1.0   2.0    3.0  4.0
0.0  11530.0    7.0  14.0    0.0  0.0
1.0   7858.0  144.0  16.0    0.0  0.0
2.0   2041.0   32.0  42.0    0.0  0.0
3.0      0.0    0.0   0.0  572.0  0.0
4.0      0.0    0.0   0.0   22.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.5515755453810934
Precision total:  0.5742474895492983
Recall total:  0.4071999445070654
F1 total:  0.3507740333236951
BACC total:  0.4071999445070654
MCC total:  0.2336449609932628


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [66]:
#KNN
print('---------------------------------------------------------------------------------')
print('Defining KNN Model')
print('---------------------------------------------------------------------------------')
from sklearn.neighbors import KNeighborsClassifier
knn_clf_01=KNeighborsClassifier(n_neighbors = 5)

if 1 == 1 and 0 == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training KNN', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    knn_clf_01.fit(X_train_01,y_train_01)
    end = time.time()


    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(knn_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(knn_clf_01, 'knn_01.joblib')


if load_model_knn == 1:
    knn_clf_01 = joblib.load('knn_01.joblib')

if use_model_knn == 1:

    #KNN
    start = time.time()
    preds_knn =knn_clf_01.predict(X_test_01)
    preds_knn
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#MLP
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('KNN 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_knn
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)


    knn_acc_01 = Acc
    knn_pre_01 = Precision
    knn_rec_01 = Recall
    knn_f1_01 = F1
    knn_bacc_01 = BACC
    knn_mcc_01 = MCC    

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









---------------------------------------------------------------------------------
Defining KNN Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training KNN
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0   4.0
0.0  11281.0   192.0    78.0    0.0   0.0
1.0    298.0  7669.0    49.0    2.0   0.0
2.0     37.0    52.0  2014.0   12.0   0.0
3.0      0.0     3.0     3.0  566.0   0.0
4.0      0.0     0.0     2.0   10.0  10.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9668731483975223

In [67]:
from sklearn.linear_model import LogisticRegression

#Logistic Regression
print('---------------------------------------------------------------------------------')
print('Defining Logistic Regression Model')
print('---------------------------------------------------------------------------------')
logreg_01 = LogisticRegression()

if 1 == 1 and 0 == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training LR ')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LR', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    logreg_01.fit(X_train_01,y_train_01)
    end = time.time()


    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(knn_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(logreg_01, 'logreg_01.joblib')


if 1 == 1:
    logreg_01 = joblib.load('logreg_01.joblib')

if 1 == 1:

    #lR
    start = time.time()
    preds_logreg =logreg_01.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#LR
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('LR 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_logreg
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    lr_acc_01 = Acc
    lr_pre_01 = Precision
    lr_rec_01 = Recall
    lr_f1_01 = F1
    lr_bacc_01 = BACC
    lr_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









---------------------------------------------------------------------------------
Defining Logistic Regression Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training LR 
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
        0.0     1.0     2.0    3.0   4.0
0.0  8849.0  2460.0   242.0    0.0   0.0
1.0  4823.0  2842.0   351.0    0.0   2.0
2.0   230.0    54.0  1818.0    2.0  11.0
3.0     0.0     0.0     0.0  571.0   1.0
4.0     0.0     0.0     0.0   16.0   6.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.6322829697459377
Precision total:  0.6381261354968755
Recall total:  0.6502173659559842
F1 total:  0.6386063966146803
BACC total:  0.6502173659559842
MCC total:  0.3735069829432364


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [68]:
# #Voting
# from sklearn.ensemble import VotingClassifier
# # model1 = LogisticRegression(random_state=1)
# # model2 = tree.DecisionTreeClassifier(random_state=1)
# voting = VotingClassifier(estimators=[
#                                         ('ada', ada),
#                                        ('rf', rf),
#                                        ('svm', clf),
#                                        ('knn', knn_clf), 
#                                        ('lgbm', lgbm),
#                                       #  ('xgb', xgb_00),
#                                        ('cat', cat_00),

#                                          ('mlp', mlp)
#                                         #  ,('dnn', dnn_01)

#                                          ], voting='hard')
# voting.fit(X_train_01,y_train_01)
# # voring_acc = voting.score(X_test_01,y_test_01)

In [69]:
# preds_voting = voting(X_test_01,y_test_01)

In [70]:

# print('---------------------------------------------------------------------------------')
# print('CONFUSION MATRIX')
# print('---------------------------------------------------------------------------------')


# pred_label = preds_voting
# # pred_label = label[ypred]

# confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
# all_unique_values = sorted(set(pred_label) | set(y_test_01))
# z = np.zeros((len(all_unique_values), len(all_unique_values)))
# rows, cols = confusion_matrix.shape
# z[:rows, :cols] = confusion_matrix
# confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
# print(confusion_matrix)
# with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

# with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


# FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
# FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
# TP = np.diag(confusion_matrix)
# TN = confusion_matrix.values.sum() - (FP + FN + TP)
# TP_total = sum(TP)
# TN_total = sum(TN)
# FP_total = sum(FP)
# FN_total = sum(FN)

# TP_total = np.array(TP_total,dtype=np.float64)
# TN_total = np.array(TN_total,dtype=np.float64)
# FP_total = np.array(FP_total,dtype=np.float64)
# FN_total = np.array(FN_total,dtype=np.float64)



# #----------------------------------------------------------------#----------------------------------------------------------------

# print('---------------------------------------------------------------------------------')
# print('METRICS')
# print('---------------------------------------------------------------------------------')

# # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# # Precision = PRECISION(TP_total, FP_total)
# # Recall = RECALL(TP_total, FN_total)
# # F1 = F1(Recall,Precision)
# # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


# Acc = accuracy_score(y_test_01, pred_label)
# Precision = precision_score(y_test_01, pred_label, average='macro')
# Recall = recall_score(y_test_01, pred_label, average='macro')
# F1 =  f1_score(y_test_01, pred_label, average='macro')
# BACC = balanced_accuracy_score(y_test_01, pred_label)
# MCC = matthews_corrcoef(y_test_01, pred_label)


# voting_acc_01 = Acc
# voting_pre_01 = Precision
# voting_rec_01 = Recall
# voting_f1_01 = F1
# voting_bacc_01 = BACC
# voting_mcc_01 = MCC

# # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
# print('Accuracy total: ', Acc)
# print('Precision total: ', Precision )
# print('Recall total: ', Recall )
# print('F1 total: ', F1 )
# print('BACC total: ', BACC)
# print('MCC total: ', MCC)

# with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
# with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
# with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
# with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
# with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
# with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









In [71]:
# from sklearn.calibration import CalibratedClassifierCV
# with open(output_file_name, "a") as f: print('Generating Predictions', file = f)

# if use_model_rf == 1:

#     print('---------------------------------------------------------------------------------')
#     print('Prediction RF')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction RF', file = f)
#     print('---------------------------------------------------------------------------------')
#     #RF
#     start = time.time()
#     preds_rf = rf.predict(X_test)
#     preds_rf_prob = rf.predict_proba(X_test)
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_svm == 1:

#     print('Prediction SVM')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction SVM', file = f)
#     print('---------------------------------------------------------------------------------')
#     #SVM
#     start = time.time()
#     preds_svm = clf.predict(X_test)
#     # preds_svm_prob = clf.predict_proba(X_test)

#     #Since SVM does not deal with prob by nature we use a meta learner
#     # https://stackoverflow.com/questions/55250963/how-to-get-probabilities-for-sgdclassifier-linearsvm

#     model = CalibratedClassifierCV(clf)

#     model.fit(X, y)
#     preds_svm_prob = model.predict_proba(X)

#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_lgbm == 1:

#     print('Prediction LGBM')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
#     print('---------------------------------------------------------------------------------')
#     #LGBM
#     start = time.time()
#     preds_lgbm = lgbm.predict(X_test)
#     preds_lgbm_prob = lgbm.predict_proba(X_test)

#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_dnn == 1:

#     print('Prediction DNN')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction DNN', file = f)
#     print('---------------------------------------------------------------------------------')
#     #DNN
#     start = time.time()
#     pred_dnn = dnn.predict(X_test)
#     preds_dnn_prob = pred_dnn
#     preds_dnn = np.argmax(pred_dnn,axis = 1)
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_ada == 1:

#     print('Prediction ADA')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
#     print('---------------------------------------------------------------------------------')
#     #ADA
#     start = time.time()
#     preds_ada = ada.predict(X_test)
#     preds_ada_prob = ada.predict_proba(X_test)

#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')
#     print('Prediction MLP')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction MLP', file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_mlp == 1:

#     #MLP
#     start = time.time()
#     y_pred = MLP.predict_proba(X_test)
#     preds_mlp_prob = y_pred
#     preds_mlp = np.argmax(y_pred,axis = 1)
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')
#     print('Prediction KNN')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction KNN', file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_knn == 1:

#     #KNN
#     start = time.time()
#     preds_knn =knn_clf.predict(X_test)
#     preds_knn_prob =knn_clf.predict_proba(X_test)

#     preds_knn
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)


In [72]:
import catboost

cat_01 = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')

# Fit the model
cat_01.fit(X_train_01, y_train_01, eval_set=(X_test_01, y_test_01), verbose=10)

# Make predictions on the test set
preds_cat = cat_01.predict(X_test_01)
preds_cat = np.squeeze(preds_cat)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('catboost', file = f)


print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')


# pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)


cat_acc_01 = Acc
cat_pre_01 = Precision
cat_rec_01 = Recall
cat_f1_01 = F1
cat_bacc_01 = BACC
cat_mcc_01 = MCC

# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
cat_acc_01 = Acc
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



0:	learn: 1.3260412	test: 1.3261579	best: 1.3261579 (0)	total: 21.1ms	remaining: 2.09s
10:	learn: 0.4784802	test: 0.4785154	best: 0.4785154 (10)	total: 188ms	remaining: 1.52s
20:	learn: 0.2740512	test: 0.2751733	best: 0.2751733 (20)	total: 295ms	remaining: 1.11s
30:	learn: 0.1901725	test: 0.1919907	best: 0.1919907 (30)	total: 379ms	remaining: 844ms
40:	learn: 0.1487944	test: 0.1506366	best: 0.1506366 (40)	total: 463ms	remaining: 666ms
50:	learn: 0.1235563	test: 0.1257061	best: 0.1257061 (50)	total: 554ms	remaining: 533ms
60:	learn: 0.1065992	test: 0.1091684	best: 0.1091684 (60)	total: 637ms	remaining: 407ms
70:	learn: 0.0923120	test: 0.0951884	best: 0.0951884 (70)	total: 717ms	remaining: 293ms


80:	learn: 0.0824980	test: 0.0857693	best: 0.0857693 (80)	total: 802ms	remaining: 188ms
90:	learn: 0.0756322	test: 0.0790558	best: 0.0790558 (90)	total: 883ms	remaining: 87.3ms
99:	learn: 0.0693869	test: 0.0730510	best: 0.0730510 (99)	total: 952ms	remaining: 0us

bestTest = 0.07305102566
bestIteration = 99

---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
        0.0     1.0     2.0    3.0   4.0
0.0  8849.0  2460.0   242.0    0.0   0.0
1.0  4823.0  2842.0   351.0    0.0   2.0
2.0   230.0    54.0  1818.0    2.0  11.0
3.0     0.0     0.0     0.0  571.0   1.0
4.0     0.0     0.0     0.0   16.0   6.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.6322829697459377
Precision total:  0.6381261354968755
Recall total:  0.65

In [73]:

import xgboost as xgb

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_01, label=y_train_01)
dtest = xgb.DMatrix(X_test_01, label=y_test_01)

# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': 5,  # specify the number of classes
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # metric for multi-class classification
}

# Train the XGBoost model
num_round = 100
xgb_01 = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
preds_xgb_01 = xgb_01.predict(dtest)


if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('xgboost base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_xgb_01
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    xgb_acc_01 = Acc
    xgb_pre_01 = Precision
    xgb_rec_01 = Recall
    xgb_f1_01 = F1
    xgb_bacc_01 = BACC
    xgb_mcc_01 = MCC


    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0   4.0
0.0  11372.0    90.0    89.0    0.0   0.0
1.0    231.0  7728.0    59.0    0.0   0.0
2.0     18.0    15.0  2082.0    0.0   0.0
3.0      0.0     0.0     0.0  572.0   0.0
4.0      0.0     0.0     0.0    0.0  22.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9774665589370679
Precision total:  0.979760146756426
Recall total:  0.9865464097413732
F1 total:  0.9829905583077186
BACC total:  0.9865464097413732
MCC total:  0.9620764142463629


In [74]:
# model1 = tree.DecisionTreeClassifier()
# model2 = KNeighborsClassifier()
# model3= LogisticRegression()

# model1.fit(x_train,y_train)
# model2.fit(x_train,y_train)
# model3.fit(x_train,y_train)

# pred1=model1.predict_proba(x_test)
# pred2=model2.predict_proba(x_test)
# pred3=model3.predict_proba(x_test)

# finalpred=(preds_svm_prob +
#             preds_ada_prob +
#             preds_knn_prob +
#             preds_rf_prob +
#             preds_dnn_prob +
#             preds_lgbm_prob +
#             preds_mlp_prob
#             )/7

In [75]:
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Summary', file = f)
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 00', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_00, file = f)

# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 01', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy LR: ', lr_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy Voting: ', voring_acc, file = f)
# with open(output_file_name, "a") as f: print('Accuracy catboost: ', cat_acc_01, file = f)

In [76]:
with open(output_file_name, "a") as f: print('-----------------------', file = f)
with open(output_file_name, "a") as f: print('Summary', file = f)
with open(output_file_name, "a") as f: print('-----------------------', file = f)
with open(output_file_name, "a") as f: print('Level 00', file = f)

with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy cat: ', cat_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_00, file = f)


with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision cat: ', cat_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision xgb: ', xgb_pre_00, file = f)

with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall cat: ', cat_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall xgb: ', xgb_rec_00, file = f)

with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 cat: ', cat_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 xgb: ', xgb_f1_00, file = f)


with open(output_file_name, "a") as f: print('-----------------------', file = f)
with open(output_file_name, "a") as f: print('Level 01', file = f)

with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy LR: ', lr_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy Voting: ', voting_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy catboost: ', cat_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_01, file = f)

with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision LR: ', lr_pre_01, file = f)
# with open(output_file_name, "a") as f: print('Precision Voting: ', voting_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision catboosting: ', cat_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision xgboost: ', xgb_pre_01, file = f)

with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall LR: ', lr_rec_01, file = f)
# with open(output_file_name, "a") as f: print('Recall Voting: ', voting_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall catboosting: ', cat_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall xgboost: ', xgb_rec_01, file = f)

with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 LR: ', lr_f1_01, file = f)
# with open(output_file_name, "a") as f: print('F1 Voting: ', voting_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 catboosting: ', cat_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 xgboost: ', xgb_f1_01, file = f)




In [77]:
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Summary', file = f)
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 00', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy cat: ', cat_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_00, file = f)


# # with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision cat: ', cat_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision xgb: ', xgb_pre_00, file = f)

# # with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall cat: ', cat_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall xgb: ', xgb_rec_00, file = f)

# # with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 cat: ', cat_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 xgb: ', xgb_f1_00, file = f)


# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 01', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy LR: ', lr_acc_01, file = f)
# # with open(output_file_name, "a") as f: print('Accuracy Voting: ', voting_acc, file = f)
# with open(output_file_name, "a") as f: print('Accuracy catboost: ', cat_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_01, file = f)

# # with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision LR: ', lr_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision Voting: ', voting_pre, file = f)
# # with open(output_file_name, "a") as f: print('Precision catboosting: ', cat_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision xgboost: ', xgb_pre_01, file = f)

# # with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall LR: ', lr_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall Voting: ', voting_rec, file = f)
# # with open(output_file_name, "a") as f: print('Recall catboosting: ', cat_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall xgboost: ', xgb_rec_01, file = f)

# # with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 LR: ', lr_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 Voting: ', voting_f1, file = f)
# # with open(output_file_name, "a") as f: print('F1 catboosting: ', cat_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 xgboost: ', xgb_f1_01, file = f)




In [78]:

# import sklearn
# from sklearn.model_selection import train_test_split
# split = 0.7

# #AUC ROC
# #---------------------------------------------------------------------

# #AUCROC
# aucroc =[]
# y_array = [y_0,y_1,y_2,y_3,y_4]
# for j in range(0,len(y_array)):
#     # print(j)
#     #------------------------------------------------------------------------------------------------------------
#     X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y_array[j], train_size=split)
    
#     # evaluate the model

#     knn_clf.fit(X_train,y_train)
#     y_pred=knn_clf.predict(X_test) #These are the predicted output value
#     # y_pred = knn_clf.predict_proba(X_test)

    
#     y_scores = y_pred
#     y_true = y_test

#     # model = LGBMClassifier()
#     # model.fit(X_train, y_train)
#     # y_pred = model.predict(X_test)


#     y_scores = y_pred
#     y_true = y_test
    
#     # Calculate AUC-ROC score
#     auc_roc_score= roc_auc_score(y_true, y_scores,  average='weighted')  # Use 'micro' or 'macro' for different averaging strategies
#     # print("AUC-ROC Score class:", auc_roc_score)
#     aucroc.append(auc_roc_score)
#     #-------------------------------------------------------------------------------------------------------    -----
#     # Calculate the average
# average = sum(aucroc) / len(aucroc)

# # Display the result
# # with open(output_file_name, "a") as f:print("AUC ROC Average:", average, file = f)
# print("AUC ROC Average:", average)

# #End AUC ROC

In [79]:
lr_acc_00 = 0 
voting_acc_00 = 0

lr_pre_00 = 0 
voting_pre_00 = 0

lr_rec_00 = 0 
voting_rec_00 = 0

lr_f1_00 = 0 
voting_f1_00 = 0

In [80]:

voting_acc_01 = 0


voting_pre_01 = 0


voting_rec_01 = 0


voting_f1_01 = 0

In [81]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_acc = [ada_acc_00,
                svm_acc_00,
                dnn_acc_00,
                mlp_acc_00,
                knn_acc_00,
                cat_acc_00,
                xgb_acc_00,
                lgbm_acc_00,
                rf_acc_00,
                lr_acc_00,
                voting_acc_00]  
level_01_acc = [ada_acc_01,
                svm_acc_01,
                dnn_acc_01,
                mlp_acc_01,
                knn_acc_01,
                cat_acc_01,
                xgb_acc_01,
                lgbm_acc_01,
                rf_acc_01,
                lr_acc_01,
                voting_acc_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_acc[i]
    data[i][2] = level_01_acc[i]


 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Accuracy", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+------------+--------------------+--------------------+
| Accuracy   | Level 00           | Level 01           |
| ADA        | 0.7586959156465883 | 0.5515755453810934 |
+------------+--------------------+--------------------+
| SVM        | 0.8661441710769066 | 0.6067869647185564 |
+------------+--------------------+--------------------+
| DNN        | 0.5435435435435435 | 0.6091659933566748 |
+------------+--------------------+--------------------+
| MLP        | 0.9663340470515358 | 0.8995421492054942 |
+------------+--------------------+--------------------+
| KNN        | 0.974615871476858  | 0.9668731483975223 |
+------------+--------------------+--------------------+
| CAT        | 0.970562490741863  | 0.6322829697459377 |
+------------+--------------------+--------------------+
| XGB        | 0.9550357532420313 | 0.9774665589370679 |
+------------+--------------------+--------------------+
| LGBM       | 0.9810124025370662 | 0.9959601400484783 |
+------------+-----------------

In [82]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_pre = [ada_pre_00,
                svm_pre_00,
                dnn_pre_00,
                mlp_pre_00,
                knn_pre_00,
                cat_pre_00,
                xgb_pre_00,
                lgbm_pre_00,
                rf_pre_00,
                lr_pre_00,
                voting_pre_00]  
level_01_pre = [ada_pre_01,
                svm_pre_01,
                dnn_pre_01,
                mlp_pre_01,
                knn_pre_01,
                cat_pre_01,
                xgb_pre_01,
                lgbm_pre_01,
                rf_pre_01,
                lr_pre_01,
                voting_pre_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_pre[i]
    data[i][2] = level_01_pre[i]


 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Precision", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+-------------+---------------------+--------------------+
| Precision   | Level 00            | Level 01           |
| ADA         | 0.5148897595750748  | 0.5742474895492983 |
+-------------+---------------------+--------------------+
| SVM         | 0.4865043558587877  | 0.538852401791561  |
+-------------+---------------------+--------------------+
| DNN         | 0.23228438339215812 | 0.4561130829802892 |
+-------------+---------------------+--------------------+
| MLP         | 0.9367785393943964  | 0.9393231012474725 |
+-------------+---------------------+--------------------+
| KNN         | 0.784230172592528   | 0.9675540177960912 |
+-------------+---------------------+--------------------+
| CAT         | 0.7403722100119083  | 0.6381261354968755 |
+-------------+---------------------+--------------------+
| XGB         | 0.7177268205303429  | 0.979760146756426  |
+-------------+---------------------+--------------------+
| LGBM        | 0.7740801027644095  | 0.9962764789229521

In [83]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_rec = [ada_rec_00,
                svm_rec_00,
                dnn_rec_00,
                mlp_rec_00,
                knn_rec_00,
                cat_rec_00,
                xgb_rec_00,
                lgbm_rec_00,
                rf_rec_00,
                lr_rec_00,
                voting_rec_00]  
level_01_rec = [ada_rec_01,
                svm_rec_01,
                dnn_rec_01,
                mlp_rec_01,
                knn_rec_01,
                cat_rec_01,
                xgb_rec_01,
                lgbm_rec_01,
                rf_rec_01,
                lr_rec_01,
                voting_rec_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_rec[i]
    data[i][2] = level_01_rec[i]

 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Recall", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+----------+---------------------+--------------------+
| Recall   | Level 00            | Level 01           |
| ADA      | 0.5628118482439187  | 0.4071999445070654 |
+----------+---------------------+--------------------+
| SVM      | 0.48149153505386855 | 0.5575845535423665 |
+----------+---------------------+--------------------+
| DNN      | 0.224502924856971   | 0.5459240511334371 |
+----------+---------------------+--------------------+
| MLP      | 0.72674616229928    | 0.9409931095579122 |
+----------+---------------------+--------------------+
| KNN      | 0.7586551316929457  | 0.8658800286465154 |
+----------+---------------------+--------------------+
| CAT      | 0.7412066832896944  | 0.6502173659559842 |
+----------+---------------------+--------------------+
| XGB      | 0.7005618139133347  | 0.9865464097413732 |
+----------+---------------------+--------------------+
| LGBM     | 0.7900534539974579  | 0.9969625383786198 |
+----------+---------------------+--------------

In [84]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_f1 = [ada_f1_00,
                svm_f1_00,
                dnn_f1_00,
                mlp_f1_00,
                knn_f1_00,
                cat_f1_00,
                xgb_f1_00,
                lgbm_f1_00,
                rf_f1_00,
                lr_f1_00,
                voting_f1_00]  
level_01_f1 = [ada_f1_01,
                svm_f1_01,
                dnn_f1_01,
                mlp_f1_01,
                knn_f1_01,
                cat_f1_01,
                xgb_f1_01,
                lgbm_f1_01,
                rf_f1_01,
                lr_f1_01,
                voting_f1_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_f1[i]
    data[i][2] = level_01_f1[i]


 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["F1", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+--------+--------------------+---------------------+
| F1     | Level 00           | Level 01            |
| ADA    | 0.5232524495141136 | 0.3507740333236951  |
+--------+--------------------+---------------------+
| SVM    | 0.4829959148201005 | 0.5418726099415194  |
+--------+--------------------+---------------------+
| DNN    | 0.2013062027461246 | 0.48993146733213655 |
+--------+--------------------+---------------------+
| MLP    | 0.7366784048823265 | 0.9391811063990051  |
+--------+--------------------+---------------------+
| KNN    | 0.7597319520860626 | 0.8961962839323581  |
+--------+--------------------+---------------------+
| CAT    | 0.7407826385359696 | 0.6386063966146803  |
+--------+--------------------+---------------------+
| XGB    | 0.7084949959099308 | 0.9829905583077186  |
+--------+--------------------+---------------------+
| LGBM   | 0.7813739805540139 | 0.996618329132178   |
+--------+--------------------+---------------------+
| RF     | 0.558728563291694

In [85]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(9)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_f1 = [ada_f1_00,
                svm_f1_00,
                dnn_f1_00,
                mlp_f1_00,
                knn_f1_00,
                cat_f1_00,
                xgb_f1_00,
                lgbm_f1_00,
                rf_f1_00,
                lr_f1_00,
                voting_f1_00]  
level_01_f1 = [ada_f1_01,
                svm_f1_01,
                dnn_f1_01,
                mlp_f1_01,
                knn_f1_01,
                cat_f1_01,
                xgb_f1_01,
                lgbm_f1_01,
                rf_f1_01,
                lr_f1_01,
                voting_f1_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]

    data[i][1] = level_00_acc[i]
    data[i][2] = level_01_acc[i]

    data[i][3] = level_00_pre[i] 
    data[i][4] = level_01_pre[i]

    data[i][5] = level_00_rec[i] 
    data[i][6] = level_01_rec[i]

    data[i][7] = level_00_f1[i]
    data[i][8] = level_01_f1[i]




 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Models", "ACC-00", " ACC-01","PRE-00", " PRE-01","REC-00", " REC-01","F1-00", " F1-01",]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+----------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+--------------------+---------------------+
| Models   | ACC-00             |  ACC-01            | PRE-00              |  PRE-01            | REC-00              |  REC-01            | F1-00              |  F1-01              |
| ADA      | 0.7586959156465883 | 0.5515755453810934 | 0.5148897595750748  | 0.5742474895492983 | 0.5628118482439187  | 0.4071999445070654 | 0.5232524495141136 | 0.3507740333236951  |
+----------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+--------------------+---------------------+
| SVM      | 0.8661441710769066 | 0.6067869647185564 | 0.4865043558587877  | 0.538852401791561  | 0.48149153505386855 | 0.5575845535423665 | 0.4829959148201005 | 0.5418726099415194  |
+----------+--------------------+--------------------+---------------------+----