In [76]:
# First ensemble with NSL-KDD
# Parameters

#----------------------------------------------
# 0 for not using it as base learner
# 1 for using it as base learner

use_model_ada = 1 
use_model_dnn = 1 
use_model_mlp = 1 
use_model_lgbm = 1 
use_model_rf = 1 
use_model_svm = 1
use_model_knn = 1 
#----------------------------------------------
# 0 for training the model
# 1 for using the saved version of the model

load_model_ada = 0 
load_model_dnn = 0 
load_model_mlp = 0 
load_model_lgbm = 0 
load_model_rf = 0 
load_model_svm = 0
load_model_knn = 0 
#----------------------------------------------

# load_model_ada = 1
# load_model_dnn = 1 
# load_model_mlp = 1 
# load_model_lgbm = 1 
# load_model_rf = 1 
# load_model_svm = 1
# load_model_knn = 1 
#----------------------------------------------




In [77]:

# Specify the name of the output text file
output_file_name = "ensemble_prob_AF.txt"
with open(output_file_name, "w") as f: print('---------------------------------------------------------------------------------', file = f)
with open(output_file_name, "a") as f: print('---- Start Ensemble Model Info - v0 ----', file = f)


In [78]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:
# importing required libraries
import numpy as np
import pandas as pd
import pickle # saving and loading trained model
from os import path


# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder,LabelEncoder, MinMaxScaler, OneHotEncoder)
from sklearn.preprocessing import Normalizer, MaxAbsScaler , RobustScaler, PowerTransformer

# importing library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import accuracy_score # for calculating accuracy of model
from sklearn.model_selection import train_test_split # for splitting the dataset for training and testing
from sklearn.metrics import classification_report # for generating a classification report of model

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

from keras.layers import Dense # importing dense layer

from keras.layers import Input
from keras.models import Model
# representation of model layers
#from keras.utils import plot_model
from sklearn.metrics import confusion_matrix
import shap




# In[2]:


# #Defining metric functions
# def ACC(TP,TN,FP,FN):
#     Acc = (TP+TN)/(TP+FP+FN+TN)
#     return Acc
# def ACC_2 (TP, FN):
#     ac = (TP/(TP+FN))
#     return ac
# def PRECISION(TP,FP):
#     eps = 1e-7
#     Precision = TP/(TP+FP+eps)
    

#     return Precision
# def RECALL(TP,FN):
#     Recall = TP/(TP+FN)
#     return Recall
# def F1(Recall, Precision):
#     F1 = 2 * Recall * Precision / (Recall + Precision)
#     return F1
# def BACC(TP,TN,FP,FN):
#     BACC =(TP/(TP+FN)+ TN/(TN+FP))*0.5
#     return BACC
# def MCC(TP,TN,FP,FN):
#     eps = 1e-7
#     MCC = (TN*TP-FN*FP)/(((TP+FP+eps)*(TP+FN+eps)*(TN+FP+eps)*(TN+FN+eps))**.5)
#     return MCC
# def AUC_ROC(y_test_bin,y_score):
#     fpr = dict()
#     tpr = dict()
#     roc_auc = dict()
#     auc_avg = 0
#     counting = 0
#     for i in range(n_classes):
#       fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
#      # plt.plot(fpr[i], tpr[i], color='darkorange', lw=2)
#       #print('AUC for Class {}: {}'.format(i+1, auc(fpr[i], tpr[i])))
#       auc_avg += auc(fpr[i], tpr[i])
#       counting = i+1
#     return auc_avg/counting


# In[3]:


# attach the column names to the dataset
feature=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot",
          "num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells",
          "num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
          "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count", 
          "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
          "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"]
# KDDTrain+_2.csv & KDDTest+_2.csv are the datafiles without the last column about the difficulty score
# these have already been removed.

train='KDDTrain+.txt'
test='KDDTest+.txt'

df=pd.read_csv(train,names=feature)
df_test=pd.read_csv(test,names=feature)

In [79]:


# shape, this gives the dimensions of the dataset
print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)


df.drop(['difficulty'],axis=1,inplace=True)
df_test.drop(['difficulty'],axis=1,inplace=True)



print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())



# colums that are categorical and not binary yet: protocol_type (column 2), service (column 3), flag (column 4).
# explore categorical features
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

#see how distributed the feature service is, it is evenly distributed and therefore we need to make dummies for all.
print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())



# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']
# insert code to get a list of categorical columns into a variable, categorical_columns
categorical_columns=['protocol_type', 'service', 'flag'] 
 # Get the categorical values into a 2D numpy array
df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]
df_categorical_values.head()


# protocol type
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2
print(dumcols)

#do same for test set
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2




df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)
print(df_categorical_values_enc.head())
# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)



enc = OneHotEncoder()
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols)
# test set
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(),columns=testdumcols)

df_cat_data.head()


trainservice=df['service'].tolist()
testservice= df_test['service'].tolist()
difference=list(set(trainservice) - set(testservice))
string = 'service_'
difference=[string + x for x in difference]
difference

for col in difference:
    testdf_cat_data[col] = 0

testdf_cat_data.shape

newdf=df.join(df_cat_data)
newdf.drop('flag', axis=1, inplace=True)
newdf.drop('protocol_type', axis=1, inplace=True)
newdf.drop('service', axis=1, inplace=True)
# test data
newdf_test=df_test.join(testdf_cat_data)
newdf_test.drop('flag', axis=1, inplace=True)
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)
print(newdf.shape)
print(newdf_test.shape)


# take label column
labeldf=newdf['label']
labeldf_test=newdf_test['label']
# change the label column
newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
# put the new label column back
newdf['label'] = newlabeldf
newdf_test['label'] = newlabeldf_test
print(newdf['label'].head())


# Specify your selected features. Note that you'll need to modify this list according to your final processed dataframe
#Uncomment the below lines to use these top 20 features from shap analysis
#selected_features = ["root_shell","service_telnet","num_shells","service_uucp","dst_host_same_src_port_rate"
#                     ,"dst_host_rerror_rate","dst_host_srv_serror_rate","dst_host_srv_count","service_private","logged_in",
#                    "dst_host_serror_rate","serror_rate","srv_serror_rate","flag_S0","diff_srv_rate","dst_host_srv_diff_host_rate","num_file_creations","flag_RSTR"#,"dst_host_same_srv_rate","service_Idap","label"]
                     

# Select those features from your dataframe
#newdf = newdf[selected_features]
#newdf_test = newdf_test[selected_features]

# Now your dataframe only contains your selected features.

# creating a dataframe with multi-class labels (Dos,Probe,R2L,U2R,normal)
multi_data = newdf.copy()
multi_label = pd.DataFrame(multi_data.label)

multi_data_test=newdf_test.copy()
multi_label_test = pd.DataFrame(multi_data_test.label)


# using standard scaler for normalizing
std_scaler = StandardScaler()
def standardization(df,col):
    for i in col:
        arr = df[i]
        arr = np.array(arr)
        df[i] = std_scaler.fit_transform(arr.reshape(len(arr),1))
    return df

numeric_col = multi_data.select_dtypes(include='number').columns
data = standardization(multi_data,numeric_col)
numeric_col_test = multi_data_test.select_dtypes(include='number').columns
data_test = standardization(multi_data_test,numeric_col_test)

# label encoding (0,1,2,3,4) multi-class labels (Dos,normal,Probe,R2L,U2R)
le2 = preprocessing.LabelEncoder()
le2_test = preprocessing.LabelEncoder()
enc_label = multi_label.apply(le2.fit_transform)
enc_label_test = multi_label_test.apply(le2_test.fit_transform)
multi_data = multi_data.copy()
multi_data_test = multi_data_test.copy()

multi_data['intrusion'] = enc_label
multi_data_test['intrusion'] = enc_label_test

#y_mul = multi_data['intrusion']
multi_data
multi_data_test



multi_data.drop(labels= [ 'label'], axis=1, inplace=True)
multi_data
multi_data_test.drop(labels= [ 'label'], axis=1, inplace=True)
multi_data_test


y_train_multi= multi_data[['intrusion']]
X_train_multi= multi_data.drop(labels=['intrusion'], axis=1)

print('X_train has shape:',X_train_multi.shape,'\ny_train has shape:',y_train_multi.shape)

y_test_multi= multi_data_test[['intrusion']]
X_test_multi= multi_data_test.drop(labels=['intrusion'], axis=1)

print('X_test has shape:',X_test_multi.shape,'\ny_test has shape:',y_test_multi.shape)


from collections import Counter

label_counts = Counter(y_train_multi['intrusion'])
print(label_counts)


from sklearn.preprocessing import LabelBinarizer

y_train_multi = LabelBinarizer().fit_transform(y_train_multi)

y_test_multi = LabelBinarizer().fit_transform(y_test_multi)


Y_train=y_train_multi.copy()
X_train=X_train_multi.copy()

Y_test=y_test_multi.copy()
X_test=X_test_multi.copy()




Dimensions of the Training set: (125973, 43)
Dimensions of the Test set: (22544, 43)
Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess  

(125973, 123)
(22544, 123)
0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64
X_train has shape: (125973, 122) 
y_train has shape: (125973, 1)
X_test has shape: (22544, 122) 
y_test has shape: (22544, 1)
Counter({0: 67343, 1: 45927, 2: 11656, 3: 995, 4: 52})


In [80]:

# In[24]:

'''
from sklearn.feature_selection import SelectKBest, f_classif

# Number of best features you want to select
k = 15

# Initialize a dataframe to store the scores for each feature against each class
feature_scores = pd.DataFrame(index=X_train.columns)

# Loop through each class
for class_index in range(Y_train.shape[1]):
    
    # Get the current class labels
    y_train_current_class = Y_train[:, class_index]
    
    # Select K best features for the current class
    best_features = SelectKBest(score_func=f_classif, k='all')
    fit = best_features.fit(X_train, y_train_current_class)

    # Get the scores
    df_scores = pd.DataFrame(fit.scores_, index=X_train.columns, columns=[f"class_{class_index}"])
    
    # Concatenate the scores to the main dataframe
    feature_scores = pd.concat([feature_scores, df_scores],axis=1)

# Get the sum of the scores for each feature
feature_scores['total'] = feature_scores.sum(axis=1)

# Get the top k features in a list
top_k_features = feature_scores.nlargest(k, 'total').index.tolist()

print(top_k_features)

'''
# In[32]:

'\nfrom sklearn.feature_selection import SelectKBest, f_classif\n\n# Number of best features you want to select\nk = 15\n\n# Initialize a dataframe to store the scores for each feature against each class\nfeature_scores = pd.DataFrame(index=X_train.columns)\n\n# Loop through each class\nfor class_index in range(Y_train.shape[1]):\n    \n    # Get the current class labels\n    y_train_current_class = Y_train[:, class_index]\n    \n    # Select K best features for the current class\n    best_features = SelectKBest(score_func=f_classif, k=\'all\')\n    fit = best_features.fit(X_train, y_train_current_class)\n\n    # Get the scores\n    df_scores = pd.DataFrame(fit.scores_, index=X_train.columns, columns=[f"class_{class_index}"])\n    \n    # Concatenate the scores to the main dataframe\n    feature_scores = pd.concat([feature_scores, df_scores],axis=1)\n\n# Get the sum of the scores for each feature\nfeature_scores[\'total\'] = feature_scores.sum(axis=1)\n\n# Get the top k features in a l

In [81]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification

# Assuming you have features X and labels Y
# X, Y = make_classification()

ros = RandomOverSampler(sampling_strategy='minority', random_state=100)

X_train, Y_train = ros.fit_resample(X_train, Y_train)


# In[33]:


print(Y_test)


# In[34]:


X_train.values


# In[35]:



[[0 1 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 ...
 [0 1 0 0 0]
 [1 0 0 0 0]
 [0 0 1 0 0]]


array([[-1.10249223e-01, -7.67859947e-03, -4.91864438e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [-1.10249223e-01, -7.73736981e-03, -4.91864438e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [-1.10249223e-01, -7.76224074e-03, -4.91864438e-03, ...,
        -1.97262160e-02, -1.21190076e+00, -4.64315895e-02],
       ...,
       [-9.29714678e-02, -7.36430591e-03, -3.87394518e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [-8.68282658e-02, -7.36430591e-03, -3.87568593e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02],
       [ 1.61587463e-01, -7.46804833e-03,  1.06953862e-03, ...,
        -1.97262160e-02,  8.25150071e-01, -4.64315895e-02]])

In [82]:
single_class_train = np.argmax(y_train_multi, axis=1)
single_class_test = np.argmax(y_test_multi, axis=1)


df1 = X_train_multi.assign(Label = single_class_train)
df2 =  X_test_multi.assign(Label = single_class_test)

frames = [df1,  df2]

df = pd.concat(frames,ignore_index=True)

y = df.pop('Label')
X = df

y1, y2 = pd.factorize(y)

y_0 = pd.DataFrame(y1)
y_1 = pd.DataFrame(y1)
y_2 = pd.DataFrame(y1)
y_3 = pd.DataFrame(y1)
y_4 = pd.DataFrame(y1)


# y_0 = y_0.replace(0, 0)
# y_0 = y_0.replace(1, 1)
y_0 = y_0.replace(2, 1)
y_0 = y_0.replace(3, 1)
y_0 = y_0.replace(4, 1)


y_1 = y_1.replace(1, 999)
y_1 = y_1.replace(0, 1)
# y_1 = y_1.replace(1, 0)
y_1 = y_1.replace(2, 1)
y_1 = y_1.replace(3, 1)
y_1 = y_1.replace(4, 1)
y_1 = y_1.replace(999, 1)


y_2 = y_2.replace(0, 1)
y_2 = y_2.replace(1, 1)
y_2 = y_2.replace(2, 0)
y_2 = y_2.replace(3, 1)
y_2 = y_2.replace(4, 1)


y_3 = y_3.replace(0, 1)
# y_3 = y_3.replace(1, 1)
y_3 = y_3.replace(2, 1)
y_3 = y_3.replace(3, 0)
y_3 = y_3.replace(4, 1)


y_4 = y_4.replace(0, 1)
# y_4 = y_4.replace(1, 1)
y_4 = y_4.replace(2, 1)
y_4 = y_4.replace(3, 1)
y_4 = y_4.replace(4, 0)



df = df.assign(Label = y)

In [83]:
#Divide the dataset between level 00 and level 01
import sklearn
from sklearn.model_selection import train_test_split
split = 0.5 # 0.7

# X_00,X_01, y_00, y_01 = sklearn.model_selection.train_test_split(X, y, train_size=split)
X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=split)

In [84]:
from collections import Counter

label_counts2 = Counter(y)
print(label_counts2)


Counter({0: 77054, 1: 53387, 2: 14077, 3: 3880, 4: 119})


In [85]:
#Base learner Split
# split = 0.7

# X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_00, y_00, train_size=split)

In [86]:
X_train

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
37230,-0.110249,-0.007755,-0.004889,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,-0.618438,-0.053906,-0.031767,-0.019726,0.825150,-0.046432
17528,-0.109865,-0.007560,-0.004837,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,-0.618438,-0.053906,-0.031767,-0.019726,0.825150,-0.046432
143061,-0.155534,-0.021973,-0.083842,-0.017624,-0.059104,-0.019459,-0.113521,-0.143999,-0.890373,-0.016494,...,-0.453815,-0.18843,-0.009419,-0.174880,-0.313124,-0.030535,-0.025803,-0.105681,0.718027,-0.056997
87792,-0.104874,-0.007744,-0.004883,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,-0.618438,-0.053906,-0.031767,-0.019726,0.825150,-0.046432
115107,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,1.616978,-0.053906,-0.031767,-0.019726,-1.211901,-0.046432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11869,-0.110249,-0.007714,-0.004738,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,-0.618438,-0.053906,-0.031767,-0.019726,0.825150,-0.046432
75598,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,1.616978,-0.053906,-0.031767,-0.019726,-1.211901,-0.046432
97245,-0.110249,-0.007761,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,-0.618438,-0.053906,-0.031767,-0.019726,0.825150,-0.046432
125911,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,-0.312889,-0.11205,-0.028606,-0.139982,1.616978,-0.053906,-0.031767,-0.019726,-1.211901,-0.046432


In [87]:
y_train

37230     0
17528     0
143061    0
87792     0
115107    1
         ..
11869     0
75598     1
97245     2
125911    1
111713    1
Name: Label, Length: 74258, dtype: int64

## LEVEL 0 - Weak models - Base Learner

In [88]:
with open(output_file_name, "a") as f: print('------------START of WEAK LEARNERS (BASE MODELS) - STACK 00 -----------------', file = f)

#Defining Basemodels


print('---------------------------------------------------------------------------------')
print('Defining RF Model')
print('---------------------------------------------------------------------------------')
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)
#------------------------------------------------------------------------------


print('---------------------------------------------------------------------------------')
print('Defining ADA Model')
print('---------------------------------------------------------------------------------')
#ADA
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
import time
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)


print('---------------------------------------------------------------------------------')
print('Defining LGBM Model')
print('---------------------------------------------------------------------------------')
#LGBM
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()



#KNN
print('---------------------------------------------------------------------------------')
print('Defining KNN Model')
print('---------------------------------------------------------------------------------')
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier(n_neighbors = 5)


#SVM
print('---------------------------------------------------------------------------------')
print('Defining SVM Model')
print('---------------------------------------------------------------------------------')

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

# Instantiate the SGDClassifier with additional hyperparameters
clf = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)


#MLP
print('---------------------------------------------------------------------------------')
print('Defining MLP Model')
print('---------------------------------------------------------------------------------')


from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
import time

# create MLPClassifier instance
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)


#DNN
print('---------------------------------------------------------------------------------')
print('Defining DNN Model')
print('---------------------------------------------------------------------------------')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# #Model Parameters
# dropout_rate = 0.01
# nodes = 70
# out_layer = 5
# optimizer='adam'
# loss='sparse_categorical_crossentropy'
# epochs=1
# batch_size=2*256

#Model Parameters
dropout_rate = 0.2
nodes = 3
out_layer = 5
optimizer='adam'
loss='sparse_categorical_crossentropy'
epochs=100
batch_size=128


num_columns = X_train.shape[1]

dnn = tf.keras.Sequential()

# Input layer
dnn.add(tf.keras.Input(shape=(num_columns,)))

# Dense layers with dropout
dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

# Output layer
dnn.add(tf.keras.layers.Dense(out_layer))



dnn.compile(optimizer=optimizer, loss=loss,metrics=['accuracy'])

dnn.summary()



# dnn = Sequential()
# dnn.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # Input layer
# dnn.add(Dense(64, activation='relu'))  # Hidden layer
# dnn.add(Dense(5))  # Output layer

# dnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# # summary of model layers
# dnn.summary()

---------------------------------------------------------------------------------
Defining RF Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining ADA Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining LGBM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining KNN Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Defining SVM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Def

In [89]:
# #SVM
# # Wrap SGDClassifier with MultiOutputClassifier
# multi_target_clf = MultiOutputClassifier(clf)

# # Fit the model on the training data
# multi_target_clf.fit(X_train, y_train)

# Make predictions on the test data
# y_pred = clf.predict(X_test)



In [90]:
#Training Basemodels
import joblib
from sklearn.model_selection import StratifiedKFold, cross_val_score
n_splits = 5  # You can adjust the number of folds as needed



print('---------------------------------------------------------------------------------')
print('Training Model')
with open(output_file_name, "a") as f: print('Training weak models - level 0', file = f)

print('---------------------------------------------------------------------------------')

if use_model_ada == 1 and load_model_ada == 0:

    print('---------------------------------------------------------------------------------')
    print('Training ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA


    start = time.time()
    ada = abc.fit(X_train, y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(ada, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)

    # Assuming 'model' is your trained model
    joblib.dump(ada, 'ada_base_model.joblib')


if use_model_rf == 1 and load_model_rf == 0:

    print('---------------------------------------------------------------------------------')
    print('Training RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Training RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    model_rf = rf.fit(X_train,y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(model_rf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(model_rf, 'rf_base_model.joblib')

if use_model_svm == 1 and load_model_svm == 0:

    print('---------------------------------------------------------------------------------')
    print('Training SVM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training SVM', file = f)
    print('---------------------------------------------------------------------------------')
    #SVM

    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    # clf.score(X_train, y_train)
    time_taken = end - start

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(clf, 'svm_base_model.joblib')


if use_model_knn == 1 and load_model_knn == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training KNN', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    knn_clf.fit(X_train,y_train)
    end = time.time()


    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(knn_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(knn_clf, 'knn_base_model.joblib')


if use_model_lgbm == 1 and load_model_lgbm == 0:


    print('---------------------------------------------------------------------------------')
    print('Training LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    lgbm.fit(X_train, y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(lgbm, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(lgbm, 'lgbm_base_model.joblib')

if use_model_mlp == 1 and load_model_mlp == 0:


    print('---------------------------------------------------------------------------------')
    print('Training MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training MLP', file = f)
    print('---------------------------------------------------------------------------------')

    start = time.time()
    MLP = mlp.fit(X_train, y_train)
    end = time.time()

    # Create the StratifiedKFold object
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Perform cross-validation
    cv_scores = cross_val_score(MLP, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # Print the cross-validation scores
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(MLP, 'mlp_base_model.joblib')


if use_model_dnn == 1 and load_model_dnn == 0:
    from keras.callbacks import EarlyStopping

    # Define EarlyStopping callback
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
    print('---------------------------------------------------------------------------------')
    print('Training DNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training DNN', file = f)
    print('---------------------------------------------------------------------------------')
    # Convert Y_test back to its original format
    # y_test = np.argmax(Y_test, axis=1)

    # Start the timer
    start = time.time()
    # dnn.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    dnn.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[early_stopping])

    # End the timer
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(dnn, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    dnn.save("DNN_base_model.h5")

    # Calculate the time taken and print it out
    # print(f'Time taken for training: {time_taken} seconds')


with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)



---------------------------------------------------------------------------------
Training Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training ADA
---------------------------------------------------------------------------------
Cross-validation scores: [0.76373552 0.89489631 0.51642876 0.79139452 0.91717729]
Mean accuracy: 0.7767264822490351
---------------------------------------------------------------------------------
Training RF
---------------------------------------------------------------------------------
Cross-validation scores: [0.94647186 0.94694317 0.94431726 0.94680493 0.93656993]
Mean accuracy: 0.944221429771841
---------------------------------------------------------------------------------
Training SVM
---------------------------------------------------------------------------------
Cross-validation scores: [0.9672098  0.96848909 0.96949906 0

In [91]:
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedKFold

# # Define your Keras model as a function
# def create_model(optimizer='adam', hidden_layer_size=16):
#     # model = Sequential()
#     # model.add(Dense(hidden_layer_size, input_dim=input_size, activation='relu'))
#     # model.add(Dense(1, activation='sigmoid'))
#     # model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        
#     dnn = tf.keras.Sequential()

#     # Input layer
#     dnn.add(tf.keras.Input(shape=(num_columns,)))

#     # Dense layers with dropout
#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     dnn.add(tf.keras.layers.Dense(nodes))
#     dnn.add(tf.keras.layers.Dropout(dropout_rate))

#     # Output layer
#     dnn.add(tf.keras.layers.Dense(out_layer))



#     dnn.compile(optimizer=optimizer, loss=loss)

#     dnn.summary()
#     return dnn

# # Create a KerasClassifier
# dnn = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'optimizer': ['adam', 'sgd'],
#     'hidden_layer_size': [8, 16, 32]
# }

# # Create the StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Create GridSearchCV
# grid = GridSearchCV(estimator=dnn, param_grid=param_grid, cv=cv, scoring='accuracy')
# grid_result = grid.fit(X_train, y_train)

# # Print the best parameters and best accuracy
# print("Best Parameters: ", grid_result.best_params_)
# print("Best Accuracy: ", grid_result.best_score_)



In [92]:
# stratified_kfold

In [93]:
# Loading Models
from tensorflow.keras.models import load_model

if load_model_ada == 1:
    ada = joblib.load('ada_base_model.joblib')

if load_model_svm == 1:
    clf =  joblib.load('svm_base_model.joblib')

if load_model_dnn == 1:
    dnn = load_model("DNN_base_model.h5")

if load_model_knn == 1:
    knn_clf = joblib.load('knn_base_model.joblib')

if load_model_mlp == 1:
    MLP = joblib.load('mlp_base_model.joblib')

if load_model_rf == 1:
    rf = joblib.load('rf_base_model.joblib')

if load_model_lgbm == 1:
    lgbm = joblib.load('lgbm_base_model.joblib')







In [94]:
# Make predictions on the test data
# preds_svm = clf.predict(X_test)



# y_scores = y_pred
# y_true = y_test



### Base leaners predictions

In [95]:
from sklearn.calibration import CalibratedClassifierCV
with open(output_file_name, "a") as f: print('Generating Predictions', file = f)

if use_model_rf == 1:

    print('---------------------------------------------------------------------------------')
    print('Prediction RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    preds_rf = rf.predict(X_test)
    preds_rf_prob = rf.predict_proba(X_test)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_svm == 1:

    print('Prediction SVM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction SVM', file = f)
    print('---------------------------------------------------------------------------------')
    #SVM
    start = time.time()
    preds_svm = clf.predict(X_test)
    # preds_svm_prob = clf.predict_proba(X_test)

    #Since SVM does not deal with prob by nature we use a meta learner
    # https://stackoverflow.com/questions/55250963/how-to-get-probabilities-for-sgdclassifier-linearsvm

    model = CalibratedClassifierCV(clf)

    model.fit(X, y)
    preds_svm_prob = model.predict_proba(X)

    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_lgbm == 1:

    print('Prediction LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    #LGBM
    start = time.time()
    preds_lgbm = lgbm.predict(X_test)
    preds_lgbm_prob = lgbm.predict_proba(X_test)

    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_dnn == 1:

    print('Prediction DNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction DNN', file = f)
    print('---------------------------------------------------------------------------------')
    #DNN
    start = time.time()
    pred_dnn = dnn.predict(X_test)
    preds_dnn_prob = pred_dnn
    preds_dnn = np.argmax(pred_dnn,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if use_model_ada == 1:

    print('Prediction ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA
    start = time.time()
    preds_ada = ada.predict(X_test)
    preds_ada_prob = ada.predict_proba(X_test)

    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    print('Prediction MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction MLP', file = f)
    print('---------------------------------------------------------------------------------')

if use_model_mlp == 1:

    #MLP
    start = time.time()
    y_pred = MLP.predict_proba(X_test)
    preds_mlp_prob = y_pred
    preds_mlp = np.argmax(y_pred,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    print('Prediction KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction KNN', file = f)
    print('---------------------------------------------------------------------------------')

if use_model_knn == 1:

    #KNN
    start = time.time()
    preds_knn =knn_clf.predict(X_test)
    preds_knn_prob =knn_clf.predict_proba(X_test)

    preds_knn
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)


---------------------------------------------------------------------------------
Prediction RF
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
Prediction SVM
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction LGBM
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction DNN
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction ADA
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction MLP
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction KNN
-------

In [96]:
# from sklearn.calibration import CalibratedClassifierCV
# model = CalibratedClassifierCV(clf)

# model.fit(X, y)
# preds_svm_prob = model.predict_proba(X)

# print(preds_ada_prob)
# print(preds_knn_prob)
# print(preds_dnn_prob)
# print(preds_mlp_prob)
# print(preds_rf_prob)
# print(preds_svm_prob)


In [97]:
print(preds_svm_prob)
preds_3 = np.argmax(preds_svm_prob,axis = 1)
print(preds_3)

print(preds_svm)
# print(y_train)

[[8.72093733e-01 7.45689618e-03 2.85238807e-02 9.04090624e-02
  1.51642739e-03]
 [8.58613008e-01 2.04807623e-04 1.09593308e-01 3.09428029e-02
  6.46073102e-04]
 [1.25468259e-01 8.27396976e-01 3.85425919e-02 8.41942724e-03
  1.72745838e-04]
 ...
 [3.93095065e-01 6.06350746e-01 2.88756006e-04 8.55018688e-07
  2.64577935e-04]
 [9.47981386e-01 2.33621240e-03 9.44433097e-03 3.99189637e-02
  3.19107346e-04]
 [3.21028204e-01 3.57404714e-02 6.43035240e-01 5.19631068e-08
  1.96033090e-04]]
[0 0 1 ... 1 0 2]
[0 1 1 ... 0 0 1]


### METRICS - Base Learners

In [98]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score



# >>> 
# >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])
# 0.99...
# >>> roc_auc_score(y, clf.decision_function(X))

In [99]:
import catboost

cat_00 = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')

# Fit the model
cat_00.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=10)

# Make predictions on the test set
preds_cat = cat_00.predict(X_test)
preds_cat_prob = cat_00.predict_proba(X_test)
preds_cat = np.squeeze(preds_cat)


if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Catboost base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_cat
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    cat_acc_00 = Acc
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    cat_acc_00 = Acc
    cat_pre_00 = Precision
    cat_rec_00 = Recall
    cat_f1_00 = F1
    cat_bacc_00 = BACC
    cat_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



0:	learn: 1.2936113	test: 1.2929301	best: 1.2929301 (0)	total: 27.4ms	remaining: 2.71s


10:	learn: 0.4169585	test: 0.4165954	best: 0.4165954 (10)	total: 229ms	remaining: 1.85s
20:	learn: 0.2087191	test: 0.2088132	best: 0.2088132 (20)	total: 431ms	remaining: 1.62s
30:	learn: 0.1274849	test: 0.1280464	best: 0.1280464 (30)	total: 622ms	remaining: 1.38s
40:	learn: 0.0911167	test: 0.0919316	best: 0.0919316 (40)	total: 800ms	remaining: 1.15s
50:	learn: 0.0723743	test: 0.0734236	best: 0.0734236 (50)	total: 964ms	remaining: 926ms
60:	learn: 0.0608892	test: 0.0620483	best: 0.0620483 (60)	total: 1.12s	remaining: 718ms
70:	learn: 0.0529493	test: 0.0543260	best: 0.0543260 (70)	total: 1.28s	remaining: 524ms
80:	learn: 0.0478477	test: 0.0492849	best: 0.0492849 (80)	total: 1.44s	remaining: 337ms
90:	learn: 0.0434513	test: 0.0449536	best: 0.0449536 (90)	total: 1.59s	remaining: 158ms
99:	learn: 0.0410736	test: 0.0426801	best: 0.0426801 (99)	total: 1.74s	remaining: 0us

bestTest = 0.04268008494
bestIteration = 99

----------------------------------------------------------------------------

In [102]:
import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# Assuming you have your features and labels as X and y
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': 5,  # specify the number of classes
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # metric for multi-class classification
}

# Train the XGBoost model
num_round = 100
xgb_00 = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
preds_xgb = xgb_00.predict(dtest)
# preds_xgb_prob = xgb_00.predict_proba(dtest)


# Get class probabilities
# Assuming binary classification, get the probability for the positive class (class 1)
preds_xgb_margin = xgb_00.predict(dtest, output_margin=True)
preds_xgb_prob = 1 / (1 + np.exp(-preds_xgb_margin))

# Print or use positive_class_probabilities as needed
# print(positive_class_probabilities)


# Convert predicted probabilities to class labels (if necessary)
# y_pred_labels = [round(value) for value in y_pred]

# Evaluate the accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))


In [103]:

if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('xgboost base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_xgb
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    xgb_acc_00 = Acc
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)
    xgb_acc_00 = Acc
    xgb_pre_00 = Precision
    xgb_rec_00 = Recall
    xgb_f1_00 = F1
    xgb_bacc_00 = BACC
    xgb_mcc_00 = MCC
    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0      1.0     2.0     3.0   4.0
0.0  38253.0     34.0   109.0   116.0   6.0
1.0     33.0  26656.0     7.0     0.0   0.0
2.0     81.0     10.0  6924.0    16.0   0.0
3.0    146.0      0.0     2.0  1797.0  10.0
4.0     18.0      0.0     0.0     7.0  34.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9919875031982656
Precision total:  0.916516593252323
Recall total:  0.8943712402251023
F1 total:  0.9045829083245825
BACC total:  0.8943712402251023
MCC total:  0.9864640766078165


#### RF

In [104]:
# y_test
# pred_label

In [105]:
#RF
if use_model_rf == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('RF base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_rf
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    rf_acc_00 = Acc
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    rf_acc_00 = Acc
    rf_pre_00 = Precision
    rf_rec_00 = Recall
    rf_f1_00 = F1
    rf_bacc_00 = BACC
    rf_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2      3    4
0  38438.0     19.0    53.0    8.0  0.0
1    896.0  25784.0    16.0    0.0  0.0
2    657.0    185.0  6186.0    3.0  0.0
3   1616.0      6.0    64.0  269.0  0.0
4     59.0      0.0     0.0    0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9517634226154406
Precision total:  0.7708229249221301
Recall total:  0.5962348969277227
F1 total:  0.6209815960412555
BACC total:  0.5962348969277227
MCC total:  0.9185607045845918


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [106]:
#DNN
if use_model_dnn == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('DNN base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_dnn
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    dnn_acc_00 = Acc

    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    dnn_acc_00 = Acc
    dnn_pre_00 = Precision
    dnn_rec_00 = Recall
    dnn_f1_00 = F1
    dnn_bacc_00 = BACC
    dnn_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


         0        1     2    3    4
0  38118.0    375.0  25.0  0.0  0.0
1   1058.0  25638.0   0.0  0.0  0.0
2   6412.0    619.0   0.0  0.0  0.0
3   1943.0     11.0   1.0  0.0  0.0
4     54.0      2.0   3.0  0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8585760648540918
Precision total:  0.35954806058543765
Recall total:  0.3900990696636019
F1 total:  0.369539913853707
BACC total:  0.3900990696636019
MCC total:  0.7605774539729424


In [107]:
#ADA
if use_model_ada == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('ADA base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_ada
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    ada_acc_00 = Acc

    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)


    ada_acc_00 = Acc
    ada_pre_00 = Precision
    ada_rec_00 = Recall
    ada_f1_00 = F1
    ada_bacc_00 = BACC
    ada_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0       1        2       3     4
0  36891.0   338.0    625.0   656.0   8.0
1   3152.0  3451.0  20090.0     3.0   0.0
2   2588.0   251.0   4192.0     0.0   0.0
3    777.0    16.0     14.0  1143.0   5.0
4     19.0     0.0      0.0    14.0  26.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.615454019041463
Precision total:  0.632923263976516
Recall total:  0.541715952581571
F1 total:  0.5048072353676654
BACC total:  0.541715952581571
MCC total:  0.4591745406397964


In [108]:
#SVM
if use_model_svm == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('SVM base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_svm
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    svm_acc_00 = Acc
    svm_pre_00 = Precision
    svm_rec_00 = Recall
    svm_f1_00 = F1
    svm_bacc_00 = BACC
    svm_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3     4
0  37851.0    195.0   316.0   149.0   7.0
1    485.0  26139.0    22.0    50.0   0.0
2    274.0    117.0  6616.0    24.0   0.0
3    573.0      9.0     9.0  1361.0   3.0
4     32.0      1.0     0.0     7.0  19.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9693909155792564
Precision total:  0.8827645887402553
Recall total:  0.784198426409409
F1 total:  0.8204655892263848
BACC total:  0.784198426409409
MCC total:  0.9481081827724333


In [109]:
#KNN
if use_model_knn == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('KNN base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_knn
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)


    knn_acc_00 = Acc
    knn_pre_00 = Precision
    knn_rec_00 = Recall
    knn_f1_00 = F1
    knn_bacc_00 = BACC
    knn_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3     4
0  38242.0     47.0   100.0   124.0   5.0
1     39.0  26654.0     3.0     0.0   0.0
2    110.0    177.0  6738.0     6.0   0.0
3    162.0      8.0     6.0  1775.0   4.0
4     21.0      0.0     1.0     8.0  29.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9889441010517244
Precision total:  0.9315441895221477
Recall total:  0.8698084938389755
F1 total:  0.8947376203615685
BACC total:  0.8698084938389755
MCC total:  0.9813055345538203


In [110]:
#MLP
if use_model_mlp == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('MLP base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_mlp
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)

    mlp_acc_00 = Acc
    mlp_pre_00 = Precision
    mlp_rec_00 = Recall
    mlp_f1_00 = F1
    mlp_bacc_00 = BACC
    mlp_mcc_00 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3     4
0  38235.0     37.0    98.0   143.0   5.0
1     35.0  26653.0     5.0     3.0   0.0
2     71.0     22.0  6931.0     7.0   0.0
3    102.0      1.0     4.0  1844.0   4.0
4     15.0      0.0     1.0    11.0  32.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9924049610148264
Precision total:  0.9350853557454828
Recall total:  0.8924829434684349
F1 total:  0.9094645814131586
BACC total:  0.8924829434684349
MCC total:  0.9871796012439795


In [111]:
#lgbm

if use_model_lgbm == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('LGBM base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_lgbm
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test, pred_label)
    Precision = precision_score(y_test, pred_label, average='macro')
    Recall = recall_score(y_test, pred_label, average='macro')
    F1 =  f1_score(y_test, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test, pred_label)
    MCC = matthews_corrcoef(y_test, pred_label)
    lgbm_acc_00 = Acc
    lgbm_pre_00 = Precision
    lgbm_rec_00 = Recall
    lgbm_f1_00 = F1
    lgbm_bacc_00 = BACC
    lgbm_mcc_00 = MCC
    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0        1       2       3     4
0  38019.0    199.0   128.0   117.0  55.0
1     79.0  26361.0   118.0   132.0   6.0
2     98.0    270.0  6627.0    22.0  14.0
3    108.0     11.0    25.0  1791.0  20.0
4     30.0      3.0     0.0     9.0  17.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9805545455769671
Precision total:  0.7902176964513938
Recall total:  0.82425692517516
F1 total:  0.8028434238196789
BACC total:  0.82425692517516
MCC total:  0.9672085090622727


## Training the stronger model - STACK level 01

In [112]:
print(len(preds_dnn_prob), len(y_test))

74259 74259


In [113]:
print(y_test)

54891     0
32459     1
82084     1
115832    0
88729     1
         ..
146686    0
77911     2
139901    0
39294     0
130461    1
Name: Label, Length: 74259, dtype: int64


In [114]:
df_from_series = y_test.to_frame()
y_test_reset_index = df_from_series.reset_index()
# y_test2 = y_test.reset_index(inplace=True)
print(y_test_reset_index)
y_test_reset_index.pop('index')

        index  Label
0       54891      0
1       32459      1
2       82084      1
3      115832      0
4       88729      1
...       ...    ...
74254  146686      0
74255   77911      2
74256  139901      0
74257   39294      0
74258  130461      1

[74259 rows x 2 columns]


0         54891
1         32459
2         82084
3        115832
4         88729
          ...  
74254    146686
74255     77911
74256    139901
74257     39294
74258    130461
Name: index, Length: 74259, dtype: int64

In [115]:
y_test_reset_index.values[0][0]

0

In [116]:
preds_dnn_2 = []
preds_svm_2 = []
preds_rf_2 = []
preds_mlp_2 = []
preds_ada_2 = []
preds_knn_2 = []
preds_lgbm_2 = []
preds_cat_2 = []
preds_xgb_2 = []

for i in range(0,len(preds_dnn_prob)):  
    # print(i)
    # print(preds_dnn_prob[i][y_test_reset_index.values[i][0]])
    preds_dnn_2.append(preds_dnn_prob[i][y_test_reset_index.values[i][0]])
    preds_svm_2.append(preds_svm_prob[i][y_test_reset_index.values[i][0]])
    preds_rf_2.append(preds_rf_prob[i][y_test_reset_index.values[i][0]])
    preds_mlp_2.append(preds_mlp_prob[i][y_test_reset_index.values[i][0]])
    preds_ada_2.append(preds_ada_prob[i][y_test_reset_index.values[i][0]])
    preds_knn_2.append(preds_knn_prob[i][y_test_reset_index.values[i][0]])
    preds_lgbm_2.append(preds_lgbm_prob[i][y_test_reset_index.values[i][0]])
    preds_cat_2.append(preds_cat_prob[i][y_test_reset_index.values[i][0]])
    preds_xgb_2.append(preds_xgb_prob[i][y_test_reset_index.values[i][0]])

    

In [117]:
with open(output_file_name, "a") as f: print('------------------------------------------------------------------', file = f)
with open(output_file_name, "a") as f: print('------------------------------------------------------------------', file = f)
with open(output_file_name, "a") as f: print('------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('------------START of STRONGER LEARNER - STACK 01 -----------------', file = f)


# Stack the vectors horizontally to create a matrix
column_features = ['dnn','rf','lgbm','ada','knn','mlp','svm','cat','xgb','label']
training_matrix = np.column_stack((
                          preds_dnn_2,
                          preds_rf_2,
                          preds_lgbm_2,
                          preds_ada_2,
                          preds_knn_2, 
                          preds_mlp_2,
                          preds_svm_2,
                          preds_cat_2,
                          preds_xgb_2,
                          y_test
                          ))

# Print the resulting matrix
print(training_matrix)

[[0.2974636  0.83474243 0.99985337 ... 0.98826399 0.98765093 0.        ]
 [0.72982407 0.99633156 1.         ... 0.99718377 0.99776554 1.        ]
 [0.70787787 0.76533758 1.         ... 0.97737626 0.99565542 1.        ]
 ...
 [0.3834421  0.81693161 0.99999204 ... 0.98769927 0.98844504 0.        ]
 [0.62206382 0.95777388 1.         ... 0.99638074 0.99334705 0.        ]
 [0.9766444  0.94678884 1.         ... 0.99247126 0.99238926 1.        ]]


In [118]:
df_level_01 = pd.DataFrame(training_matrix, columns=column_features)

In [119]:

# Assuming df is your DataFrame
df_level_01.to_csv('models7dataset_prob.csv', index=False)


In [120]:
y_01 = df_level_01.pop('label')
X_01 = df_level_01
df_level_01 = df_level_01.assign(label = y_01)

In [121]:
X_01

Unnamed: 0,dnn,rf,lgbm,ada,knn,mlp,svm,cat,xgb
0,0.297464,0.834742,0.999853,0.263048,1.0,1.000000,0.872094,0.988264,0.987651
1,0.729824,0.996332,1.000000,0.282531,1.0,1.000000,0.000205,0.997184,0.997766
2,0.707878,0.765338,1.000000,0.237124,1.0,1.000000,0.827397,0.977376,0.995655
3,0.397967,0.922330,1.000000,0.277063,1.0,1.000000,0.970162,0.992576,0.997013
4,0.695965,0.996332,1.000000,0.293208,1.0,1.000000,0.082876,0.996701,0.997390
...,...,...,...,...,...,...,...,...,...
74254,0.360355,0.866679,0.999983,0.266256,1.0,1.000000,0.981861,0.995916,0.995133
74255,0.282128,0.803362,1.000000,0.328061,1.0,0.999999,0.038218,0.979811,0.996661
74256,0.383442,0.816932,0.999992,0.256380,1.0,1.000000,0.008064,0.987699,0.988445
74257,0.622064,0.957774,1.000000,0.264476,1.0,1.000000,0.981942,0.996381,0.993347


In [122]:
y_01

0        0.0
1        1.0
2        1.0
3        0.0
4        1.0
        ... 
74254    0.0
74255    2.0
74256    0.0
74257    0.0
74258    1.0
Name: label, Length: 74259, dtype: float64

In [123]:
df_level_01

Unnamed: 0,dnn,rf,lgbm,ada,knn,mlp,svm,cat,xgb,label
0,0.297464,0.834742,0.999853,0.263048,1.0,1.000000,0.872094,0.988264,0.987651,0.0
1,0.729824,0.996332,1.000000,0.282531,1.0,1.000000,0.000205,0.997184,0.997766,1.0
2,0.707878,0.765338,1.000000,0.237124,1.0,1.000000,0.827397,0.977376,0.995655,1.0
3,0.397967,0.922330,1.000000,0.277063,1.0,1.000000,0.970162,0.992576,0.997013,0.0
4,0.695965,0.996332,1.000000,0.293208,1.0,1.000000,0.082876,0.996701,0.997390,1.0
...,...,...,...,...,...,...,...,...,...,...
74254,0.360355,0.866679,0.999983,0.266256,1.0,1.000000,0.981861,0.995916,0.995133,0.0
74255,0.282128,0.803362,1.000000,0.328061,1.0,0.999999,0.038218,0.979811,0.996661,2.0
74256,0.383442,0.816932,0.999992,0.256380,1.0,1.000000,0.008064,0.987699,0.988445,0.0
74257,0.622064,0.957774,1.000000,0.264476,1.0,1.000000,0.981942,0.996381,0.993347,0.0


In [124]:
split = 0.7

X_train_01,X_test_01, y_train_01, y_test_01 = sklearn.model_selection.train_test_split(X_01, y_01, train_size=split)

In [125]:
# from keras.callbacks import EarlyStopping

# # Define EarlyStopping callback
# early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

# # Compile the model
# # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# # Train the model with EarlyStopping callback
# model.fit(x_train, Y_train, epochs=100, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# # Save the trained model
# # model.save("CNN_CIC_1.h5")
# model = load_model("CNN_CIC_1.h5")

In [126]:
print('---------------------------------------------------------------------------------')
print('Defining DNN Model')
print('---------------------------------------------------------------------------------')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#Model Parameters
dropout_rate = 0.2
nodes = 3
out_layer = 5
optimizer='adam'
loss='sparse_categorical_crossentropy'
epochs=100
batch_size=128


num_columns = X_train_01.shape[1]

dnn_01 = tf.keras.Sequential()

# Input layer
dnn_01.add(tf.keras.Input(shape=(num_columns,)))

# # Dense layers with dropout
# dnn_01.add(tf.keras.layers.Dense(nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn_01.add(tf.keras.layers.Dense(2*nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn_01.add(tf.keras.layers.Dense(3*nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn_01.add(tf.keras.layers.Dense(2*nodes))
# dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# dnn.add(tf.keras.layers.Dense(nodes))
# dnn.add(tf.keras.layers.Dropout(dropout_rate))



# Dense layers with dropout
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn.add(tf.keras.layers.Dense(nodes))
dnn.add(tf.keras.layers.Dropout(dropout_rate))

# Output layer
# dnn_01.add(tf.keras.layers.Dense(out_layer))

dnn_01.add(tf.keras.layers.Dense(out_layer, activation='softmax'))


dnn_01.compile(optimizer=optimizer, loss=loss,metrics=['accuracy'])

dnn_01.summary()

---------------------------------------------------------------------------------
Defining DNN Model
---------------------------------------------------------------------------------
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 3)                 30        
_________________________________________________________________
dropout_15 (Dropout)         (None, 3)                 0         
_________________________________________________________________
dense_19 (Dense)             (None, 3)                 12        
_________________________________________________________________
dropout_16 (Dropout)         (None, 3)                 0         
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 12        
_________________________________________________________________
dro

In [127]:
#DNN
from keras.callbacks import EarlyStopping

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

print('---------------------------------------------------------------------------------')
print('Training DNN')
with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('Training DNN', file = f)
print('---------------------------------------------------------------------------------')
# Convert Y_test back to its original format
# y_test = np.argmax(Y_test, axis=1)

# Start the timer
start = time.time()
# dnn_01.fit(X_train_01, y_train_01, epochs=epochs, batch_size=batch_size)
dnn_01.fit(X_train_01, y_train_01, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[early_stopping])

# model.fit(x_train, Y_train, epochs=100, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# End the timer
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
# joblib.dump(dnn_01, 'dnn_level_01.joblib')
dnn_01.save("dnn_level_01.h5")

# Calculate the time taken and print it out
# print(f'Time taken for training: {time_taken} seconds')


---------------------------------------------------------------------------------
Training DNN
---------------------------------------------------------------------------------
Epoch 1/100

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


In [128]:
dnn_01 = load_model("dnn_level_01.h5")


In [129]:
#DNN
start = time.time()
pred_dnn = dnn_01.predict(X_test_01)
preds_dnn_01 = np.argmax(pred_dnn,axis = 1)
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)

In [130]:
# y_test = y_test_01

In [131]:
#----------------------------------------------------------------
with open(output_file_name, "a") as f: print('Stack model - Strong learner - level 01', file = f)
with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)

In [132]:

print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')
with open(output_file_name, "a") as f: print('DNN', file = f)
pred_label = preds_dnn_01

# pred_label = ypred
#pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')

# Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# Precision = PRECISION(TP_total, FP_total)
# Recall = RECALL(TP_total, FN_total)
# F1 = F1(Recall,Precision)
# BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# MCC = MCC(TP_total,TN_total, FP_total, FN_total)


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)


dnn_acc_01 = Acc
dnn_pre_01 = Precision
dnn_rec_01 = Recall
dnn_f1_01 = F1
dnn_bacc_01 = BACC
dnn_mcc_01 = MCC

# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0   3.0  4.0
0.0  10612.0  1004.0    43.0  11.0  0.0
1.0   2446.0  5246.0   266.0   3.0  0.0
2.0    619.0     2.0  1456.0   1.0  0.0
3.0      0.0     3.0   549.0   0.0  0.0
4.0      0.0     0.0     6.0  11.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.777179280007182
Precision total:  0.4484352807140194
Recall total:  0.45379527103143447
F1 total:  0.44749970805741135
BACC total:  0.45379527103143447
MCC total:  0.6166245634393946


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [133]:
#SVM
print('---------------------------------------------------------------------------------')
print('Defining SVM Model')
print('---------------------------------------------------------------------------------')

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

# Instantiate the SGDClassifier with additional hyperparameters
clf = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)

#SVM
start = time.time()
clf.fit(X_train_01, y_train_01)
end = time.time()
clf.score(X_train_01, y_train_01)
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
joblib.dump(clf, 'svm_level_01.joblib')


clf = loaded_model = joblib.load('svm_level_01.joblib')


#SVM
start = time.time()
preds_svm_01 = clf.predict(X_test_01)
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
print('---------------------------------------------------------------------------------')



---------------------------------------------------------------------------------
Defining SVM Model
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------


In [134]:
with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)
print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')
with open(output_file_name, "a") as f: print('SVM', file = f)
pred_label = preds_svm_01

# pred_label = ypred
#pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')

# Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# Precision = PRECISION(TP_total, FP_total)
# Recall = RECALL(TP_total, FN_total)
# F1 = F1(Recall,Precision)
# BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# MCC = MCC(TP_total,TN_total, FP_total, FN_total)


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)



svm_acc_01 = Acc
svm_pre_01 = Precision
svm_rec_01 = Recall
svm_f1_01 = F1
svm_bacc_01 = BACC
svm_mcc_01 = MCC

# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0   4.0
0.0  10530.0  1032.0    97.0    0.0  11.0
1.0   2076.0  5677.0   174.0   30.0   4.0
2.0    201.0     1.0  1825.0   50.0   1.0
3.0     15.0     0.0   311.0  226.0   0.0
4.0      4.0     0.0     9.0    4.0   0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8195529221653649
Precision total:  0.6302904680558632
Recall total:  0.5806167198793248
F1 total:  0.5940391791421344
BACC total:  0.5806167198793248
MCC total:  0.6928887053722315


In [135]:

print('---------------------------------------------------------------------------------')
print('Defining RF Model')
print('---------------------------------------------------------------------------------')
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)
#------------------------------------------------------------------------------

if True == True:

    print('---------------------------------------------------------------------------------')
    print('Training RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Training RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    model_rf_01 = rf.fit(X_train_01,y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(model_rf_01, X_train_01, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(model_rf_01, 'rf_base_model_01.joblib')

if 1 == 1:
    model_rf_01  = joblib.load('rf_base_model_01.joblib')

if 1 == 1:

    print('---------------------------------------------------------------------------------')
    print('Prediction RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    preds_rf_01 = model_rf_01.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

    with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)
print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')
with open(output_file_name, "a") as f: print('RF', file = f)
pred_label = preds_rf_01

# pred_label = ypred
#pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')

# Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# Precision = PRECISION(TP_total, FP_total)
# Recall = RECALL(TP_total, FN_total)
# F1 = F1(Recall,Precision)
# BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# MCC = MCC(TP_total,TN_total, FP_total, FN_total)


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)

rf_acc_01 = Acc
rf_pre_01 = Precision
rf_rec_01 = Recall
rf_f1_01 = F1
rf_bacc_01 = BACC
rf_mcc_01 = MCC


# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)




---------------------------------------------------------------------------------
Defining RF Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training RF
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
Prediction RF
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0  4.0
0.0  11563.0    91.0    16.0    0.0  0.0
1.0    842.0  6895.0   189.0   35.0  0.0
2.0     54.0    21.0  1962.0   41.0  0.0
3.0     12.0     2.0    53.0  485.0  0.0
4.0      0.0     0.0     1.0   16.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.938369692072897
Precision total:  0.7269733471723125
Recall total:  0.735945739363979
F1 total:  0.7302137693773736
BACC t

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [136]:
rf_acc_01

0.938369692072897

In [137]:
print('---------------------------------------------------------------------------------')
print('Defining LGBM Model')
print('---------------------------------------------------------------------------------')
#LGBM
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()



if 1 == 1 and 0 == 0:


    print('---------------------------------------------------------------------------------')
    print('Training LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    lgbm.fit(X_train_01, y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(lgbm, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(lgbm, 'lgbm_01.joblib')

if 1 == 1:
    lgbm = joblib.load('lgbm_01.joblib')


if 1 == 1:

    print('Prediction LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    #LGBM
    start = time.time()
    preds_lgbm_01 = lgbm.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')
    with open(output_file_name, "a") as f: print('LGBM', file = f)
    pred_label = preds_lgbm_01

    # pred_label = ypred
    #pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    lgbm_acc_01 = Acc
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    lgbm_acc_01 = Acc
    lgbm_pre_01 = Precision
    lgbm_rec_01 = Recall
    lgbm_f1_01 = F1
    lgbm_bacc_01 = BACC
    lgbm_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)





---------------------------------------------------------------------------------
Defining LGBM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training LGBM
---------------------------------------------------------------------------------


Prediction LGBM
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0  4.0
0.0  11522.0    87.0    53.0    6.0  2.0
1.0    105.0  7755.0    67.0   25.0  9.0
2.0     15.0    66.0  1958.0   32.0  7.0
3.0      2.0     5.0     9.0  528.0  8.0
4.0      0.0     2.0     1.0   11.0  3.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9770176856091211
Precision total:  0.777514374044633
Recall total:  0.8073372511731085
F1 total:  0.7901736960971538
BACC total:  0.8073372511731085
MCC total:  0.9610472908008874


In [138]:
# lgbm_acc_01

In [139]:

#MLP
print('---------------------------------------------------------------------------------')
print('Defining MLP Model')
print('---------------------------------------------------------------------------------')


from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
import time

# create MLPClassifier instance
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)

if 1 == 1 and 0 == 0:


    print('---------------------------------------------------------------------------------')
    print('Training MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training MLP', file = f)
    print('---------------------------------------------------------------------------------')

    start = time.time()
    MLP = mlp.fit(X_train_01, y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(MLP, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(MLP, 'mlp_01.joblib')

if 1 == 1:
    MLP = joblib.load('mlp_01.joblib')


if 1 == 1:

    #MLP
    start = time.time()
    y_pred = MLP.predict_proba(X_test_01)
    preds_mlp_01 = np.argmax(y_pred,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#MLP
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('MLP 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_mlp_01
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)


    mlp_acc_01 = Acc
    mlp_pre_01 = Precision
    mlp_rec_01 = Recall
    mlp_f1_01 = F1
    mlp_bacc_01 = BACC
    mlp_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)








---------------------------------------------------------------------------------
Defining MLP Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training MLP
---------------------------------------------------------------------------------


---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0       1       2      3     4
0  11267.0   377.0    23.0    2.0   1.0
1    729.0  7065.0   160.0    7.0   0.0
2      2.0    43.0  2007.0   26.0   0.0
3      2.0     0.0    15.0  535.0   0.0
4      0.0     0.0     0.0    0.0  17.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9377412694137714
Precision total:  0.9352098919045874
Recall total:  0.9575907528934161
F1 total:  0.9458166048227209
BACC total:  0.9575907528934161
MCC total:  0.8946421624803115


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


In [140]:
# mlp_acc_01

In [141]:
print('---------------------------------------------------------------------------------')
print('Defining ADA Model')
print('---------------------------------------------------------------------------------')
#ADA
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
import time
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)

if 1 == 1 and 0 == 0:

    print('---------------------------------------------------------------------------------')
    print('Training ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA


    start = time.time()
    ada = abc.fit(X_train_01, y_train_01)
    end = time.time()

    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(ada, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)

    # Assuming 'model' is your trained model
    joblib.dump(ada, 'ada_01.joblib')




if 1 == 1:
    ada = joblib.load('ada_01.joblib')


if 1 == 1:

    print('Prediction ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA
    start = time.time()
    preds_ada_01 = ada.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('ADA 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_ada_01
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    ada_acc_01 = Acc
    ada_pre_01 = Precision
    ada_rec_01 = Recall
    ada_f1_01 = F1
    ada_bacc_01 = BACC
    ada_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









---------------------------------------------------------------------------------
Defining ADA Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training ADA
---------------------------------------------------------------------------------


Prediction ADA
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
        0.0     1.0   2.0    3.0   4.0
0.0  3804.0  7859.0   7.0    0.0   0.0
1.0  6916.0   991.0  46.0    8.0   0.0
2.0     3.0  1997.0  43.0   35.0   0.0
3.0     0.0   265.0  32.0  255.0   0.0
4.0     0.0     0.0   2.0    0.0  15.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.2292844959152527
Precision total:  0.5260816525863901
Recall total:  0.36308965924515896
F1 total:  0.40402296705975915
BACC total:  0.36308965924515896
MCC total:  -0.36530994306887465


In [142]:
#KNN
print('---------------------------------------------------------------------------------')
print('Defining KNN Model')
print('---------------------------------------------------------------------------------')
from sklearn.neighbors import KNeighborsClassifier
knn_clf_01=KNeighborsClassifier(n_neighbors = 5)

if 1 == 1 and 0 == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training KNN', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    knn_clf_01.fit(X_train_01,y_train_01)
    end = time.time()


    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(knn_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(knn_clf_01, 'knn_01.joblib')


if load_model_knn == 1:
    knn_clf_01 = joblib.load('knn_01.joblib')

if use_model_knn == 1:

    #KNN
    start = time.time()
    preds_knn =knn_clf_01.predict(X_test_01)
    preds_knn
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#MLP
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('KNN 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_knn
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)


    knn_acc_01 = Acc
    knn_pre_01 = Precision
    knn_rec_01 = Recall
    knn_f1_01 = F1
    knn_bacc_01 = BACC
    knn_mcc_01 = MCC    

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









---------------------------------------------------------------------------------
Defining KNN Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training KNN
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0   4.0
0.0  11346.0   301.0    22.0    0.0   1.0
1.0    276.0  7628.0    53.0    4.0   0.0
2.0     10.0    35.0  2007.0   26.0   0.0
3.0      0.0     1.0    13.0  538.0   0.0
4.0      0.0     0.0     0.0    7.0  10.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9663793877367807

In [143]:
from sklearn.linear_model import LogisticRegression

#Logistic Regression
print('---------------------------------------------------------------------------------')
print('Defining Logistic Regression Model')
print('---------------------------------------------------------------------------------')
logreg_01 = LogisticRegression()

if 1 == 1 and 0 == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training LR ')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LR', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    logreg_01.fit(X_train_01,y_train_01)
    end = time.time()


    # # Create the StratifiedKFold object
    # stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # # Perform cross-validation
    # cv_scores = cross_val_score(knn_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
    # # Print the cross-validation scores
    # print("Cross-validation scores:", cv_scores)
    # print("Mean accuracy:", cv_scores.mean())
    # with open(output_file_name, "a") as f: print('mean accuracy', cv_scores.mean() , file = f)


    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(logreg_01, 'logreg_01.joblib')


if 1 == 1:
    logreg_01 = joblib.load('logreg_01.joblib')

if 1 == 1:

    #lR
    start = time.time()
    preds_logreg =logreg_01.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#LR
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('LR 01 model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_logreg
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    lr_acc_01 = Acc
    lr_pre_01 = Precision
    lr_rec_01 = Recall
    lr_f1_01 = F1
    lr_bacc_01 = BACC
    lr_mcc_01 = MCC

    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









---------------------------------------------------------------------------------
Defining Logistic Regression Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training LR 
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0  4.0
0.0  10518.0  1076.0    73.0    2.0  1.0
1.0   1833.0  5912.0   185.0   31.0  0.0
2.0    142.0     6.0  1832.0   98.0  0.0
3.0      7.0     0.0   104.0  437.0  4.0
4.0      2.0     0.0     0.0   14.0  1.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.839393

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [145]:
# #Voting
# from sklearn.ensemble import VotingClassifier
# # model1 = LogisticRegression(random_state=1)
# # model2 = tree.DecisionTreeClassifier(random_state=1)
# voting = VotingClassifier(estimators=[
#                                         ('ada', ada),
#                                        ('rf', rf),
#                                        ('svm', clf),
#                                        ('knn', knn_clf), 
#                                        ('lgbm', lgbm),
#                                       #  ('xgb', xgb_00),
#                                        ('cat', cat_00),

#                                          ('mlp', mlp)
#                                         #  ,('dnn', dnn_01)

#                                          ], voting='hard')
# voting.fit(X_train_01,y_train_01)
# # voring_acc = voting.score(X_test_01,y_test_01)

0:	learn: 1.3159672	total: 18.8ms	remaining: 1.86s
1:	learn: 1.1257824	total: 35ms	remaining: 1.71s
2:	learn: 0.9889719	total: 51.4ms	remaining: 1.66s
3:	learn: 0.8795065	total: 67.8ms	remaining: 1.63s
4:	learn: 0.7832594	total: 84.2ms	remaining: 1.6s
5:	learn: 0.7054163	total: 100ms	remaining: 1.57s
6:	learn: 0.6392608	total: 116ms	remaining: 1.54s
7:	learn: 0.5839469	total: 133ms	remaining: 1.52s
8:	learn: 0.5358800	total: 150ms	remaining: 1.52s
9:	learn: 0.4963636	total: 167ms	remaining: 1.51s
10:	learn: 0.4598019	total: 183ms	remaining: 1.48s
11:	learn: 0.4258480	total: 199ms	remaining: 1.46s
12:	learn: 0.3978712	total: 216ms	remaining: 1.45s
13:	learn: 0.3728625	total: 231ms	remaining: 1.42s
14:	learn: 0.3496802	total: 247ms	remaining: 1.4s
15:	learn: 0.3285943	total: 265ms	remaining: 1.39s
16:	learn: 0.3092808	total: 281ms	remaining: 1.37s
17:	learn: 0.2930996	total: 298ms	remaining: 1.35s
18:	learn: 0.2774927	total: 314ms	remaining: 1.34s
19:	learn: 0.2635279	total: 330ms	remain

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


ValueError: could not broadcast input array from shape (22278,1) into shape (22278)

In [146]:
# preds_voting = voting(X_test_01,y_test_01)

TypeError: 'VotingClassifier' object is not callable

In [None]:

# print('---------------------------------------------------------------------------------')
# print('CONFUSION MATRIX')
# print('---------------------------------------------------------------------------------')


# pred_label = preds_voting
# # pred_label = label[ypred]

# confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
# all_unique_values = sorted(set(pred_label) | set(y_test_01))
# z = np.zeros((len(all_unique_values), len(all_unique_values)))
# rows, cols = confusion_matrix.shape
# z[:rows, :cols] = confusion_matrix
# confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
# print(confusion_matrix)
# with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

# with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


# FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
# FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
# TP = np.diag(confusion_matrix)
# TN = confusion_matrix.values.sum() - (FP + FN + TP)
# TP_total = sum(TP)
# TN_total = sum(TN)
# FP_total = sum(FP)
# FN_total = sum(FN)

# TP_total = np.array(TP_total,dtype=np.float64)
# TN_total = np.array(TN_total,dtype=np.float64)
# FP_total = np.array(FP_total,dtype=np.float64)
# FN_total = np.array(FN_total,dtype=np.float64)



# #----------------------------------------------------------------#----------------------------------------------------------------

# print('---------------------------------------------------------------------------------')
# print('METRICS')
# print('---------------------------------------------------------------------------------')

# # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
# # Precision = PRECISION(TP_total, FP_total)
# # Recall = RECALL(TP_total, FN_total)
# # F1 = F1(Recall,Precision)
# # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
# # MCC = MCC(TP_total,TN_total, FP_total, FN_total)


# Acc = accuracy_score(y_test_01, pred_label)
# Precision = precision_score(y_test_01, pred_label, average='macro')
# Recall = recall_score(y_test_01, pred_label, average='macro')
# F1 =  f1_score(y_test_01, pred_label, average='macro')
# BACC = balanced_accuracy_score(y_test_01, pred_label)
# MCC = matthews_corrcoef(y_test_01, pred_label)


# voting_acc_01 = Acc
# voting_pre_01 = Precision
# voting_rec_01 = Recall
# voting_f1_01 = F1
# voting_bacc_01 = BACC
# voting_mcc_01 = MCC

# # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
# print('Accuracy total: ', Acc)
# print('Precision total: ', Precision )
# print('Recall total: ', Recall )
# print('F1 total: ', F1 )
# print('BACC total: ', BACC)
# print('MCC total: ', MCC)

# with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
# with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
# with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
# with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
# with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
# with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)









In [None]:
# from sklearn.calibration import CalibratedClassifierCV
# with open(output_file_name, "a") as f: print('Generating Predictions', file = f)

# if use_model_rf == 1:

#     print('---------------------------------------------------------------------------------')
#     print('Prediction RF')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction RF', file = f)
#     print('---------------------------------------------------------------------------------')
#     #RF
#     start = time.time()
#     preds_rf = rf.predict(X_test)
#     preds_rf_prob = rf.predict_proba(X_test)
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_svm == 1:

#     print('Prediction SVM')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction SVM', file = f)
#     print('---------------------------------------------------------------------------------')
#     #SVM
#     start = time.time()
#     preds_svm = clf.predict(X_test)
#     # preds_svm_prob = clf.predict_proba(X_test)

#     #Since SVM does not deal with prob by nature we use a meta learner
#     # https://stackoverflow.com/questions/55250963/how-to-get-probabilities-for-sgdclassifier-linearsvm

#     model = CalibratedClassifierCV(clf)

#     model.fit(X, y)
#     preds_svm_prob = model.predict_proba(X)

#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_lgbm == 1:

#     print('Prediction LGBM')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
#     print('---------------------------------------------------------------------------------')
#     #LGBM
#     start = time.time()
#     preds_lgbm = lgbm.predict(X_test)
#     preds_lgbm_prob = lgbm.predict_proba(X_test)

#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_dnn == 1:

#     print('Prediction DNN')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction DNN', file = f)
#     print('---------------------------------------------------------------------------------')
#     #DNN
#     start = time.time()
#     pred_dnn = dnn.predict(X_test)
#     preds_dnn_prob = pred_dnn
#     preds_dnn = np.argmax(pred_dnn,axis = 1)
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_ada == 1:

#     print('Prediction ADA')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
#     print('---------------------------------------------------------------------------------')
#     #ADA
#     start = time.time()
#     preds_ada = ada.predict(X_test)
#     preds_ada_prob = ada.predict_proba(X_test)

#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')
#     print('Prediction MLP')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction MLP', file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_mlp == 1:

#     #MLP
#     start = time.time()
#     y_pred = MLP.predict_proba(X_test)
#     preds_mlp_prob = y_pred
#     preds_mlp = np.argmax(y_pred,axis = 1)
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     print('---------------------------------------------------------------------------------')
#     print('Prediction KNN')
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#     with open(output_file_name, "a") as f: print('Prediction KNN', file = f)
#     print('---------------------------------------------------------------------------------')

# if use_model_knn == 1:

#     #KNN
#     start = time.time()
#     preds_knn =knn_clf.predict(X_test)
#     preds_knn_prob =knn_clf.predict_proba(X_test)

#     preds_knn
#     end = time.time()
#     time_taken = end - start
#     with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
#     with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)


In [147]:
import catboost

cat_01 = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')

# Fit the model
cat_01.fit(X_train_01, y_train_01, eval_set=(X_test_01, y_test_01), verbose=10)

# Make predictions on the test set
preds_cat = cat_01.predict(X_test_01)
preds_cat = np.squeeze(preds_cat)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('catboost', file = f)


print('---------------------------------------------------------------------------------')
print('CONFUSION MATRIX')
print('---------------------------------------------------------------------------------')


# pred_label = label[ypred]

confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
all_unique_values = sorted(set(pred_label) | set(y_test_01))
z = np.zeros((len(all_unique_values), len(all_unique_values)))
rows, cols = confusion_matrix.shape
z[:rows, :cols] = confusion_matrix
confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
# confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
# with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
print(confusion_matrix)
with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)
TP_total = sum(TP)
TN_total = sum(TN)
FP_total = sum(FP)
FN_total = sum(FN)

TP_total = np.array(TP_total,dtype=np.float64)
TN_total = np.array(TN_total,dtype=np.float64)
FP_total = np.array(FP_total,dtype=np.float64)
FN_total = np.array(FN_total,dtype=np.float64)



#----------------------------------------------------------------#----------------------------------------------------------------

print('---------------------------------------------------------------------------------')
print('METRICS')
print('---------------------------------------------------------------------------------')


Acc = accuracy_score(y_test_01, pred_label)
Precision = precision_score(y_test_01, pred_label, average='macro')
Recall = recall_score(y_test_01, pred_label, average='macro')
F1 =  f1_score(y_test_01, pred_label, average='macro')
BACC = balanced_accuracy_score(y_test_01, pred_label)
MCC = matthews_corrcoef(y_test_01, pred_label)


cat_acc_01 = Acc
cat_pre_01 = Precision
cat_rec_01 = Recall
cat_f1_01 = F1
cat_bacc_01 = BACC
cat_mcc_01 = MCC

# with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
print('Accuracy total: ', Acc)
cat_acc_01 = Acc
print('Precision total: ', Precision )
print('Recall total: ', Recall )
print('F1 total: ', F1 )
print('BACC total: ', BACC)
print('MCC total: ', MCC)

with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)



0:	learn: 1.3159672	test: 1.3145653	best: 1.3145653 (0)	total: 21.3ms	remaining: 2.11s
10:	learn: 0.4598019	test: 0.4551612	best: 0.4551612 (10)	total: 183ms	remaining: 1.48s
20:	learn: 0.2506635	test: 0.2463249	best: 0.2463249 (20)	total: 315ms	remaining: 1.18s
30:	learn: 0.1680234	test: 0.1644089	best: 0.1644089 (30)	total: 472ms	remaining: 1.05s
40:	learn: 0.1285191	test: 0.1255907	best: 0.1255907 (40)	total: 604ms	remaining: 870ms
50:	learn: 0.1063464	test: 0.1038684	best: 0.1038684 (50)	total: 733ms	remaining: 704ms
60:	learn: 0.0906806	test: 0.0885082	best: 0.0885082 (60)	total: 868ms	remaining: 555ms
70:	learn: 0.0815365	test: 0.0798485	best: 0.0798485 (70)	total: 992ms	remaining: 405ms
80:	learn: 0.0731660	test: 0.0720154	best: 0.0720154 (80)	total: 1.12s	remaining: 263ms
90:	learn: 0.0663771	test: 0.0655500	best: 0.0655500 (90)	total: 1.2s	remaining: 119ms
99:	learn: 0.0616477	test: 0.0610695	best: 0.0610695 (99)	total: 1.26s	remaining: 0us

bestTest = 0.06106954033
bestIterat

In [148]:

import xgboost as xgb

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_01, label=y_train_01)
dtest = xgb.DMatrix(X_test_01, label=y_test_01)

# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': 5,  # specify the number of classes
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # metric for multi-class classification
}

# Train the XGBoost model
num_round = 100
xgb_01 = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
preds_xgb_01 = xgb_01.predict(dtest)


if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('xgboost base model', file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    pred_label = preds_xgb_01
    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')

    # Acc = ACC(TP_total,TN_total, FP_total, FN_total)
    # Precision = PRECISION(TP_total, FP_total)
    # Recall = RECALL(TP_total, FN_total)
    # F1 = F1(Recall,Precision)
    # BACC = BACC(TP_total,TN_total, FP_total, FN_total)
    # MCC = MCC(TP_total,TN_total, FP_total, FN_total)

    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    xgb_acc_01 = Acc
    xgb_pre_01 = Precision
    xgb_rec_01 = Recall
    xgb_f1_01 = F1
    xgb_bacc_01 = BACC
    xgb_mcc_01 = MCC


    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
         0.0     1.0     2.0    3.0   4.0
0.0  11579.0    77.0    12.0    2.0   0.0
1.0    157.0  7732.0    66.0    6.0   0.0
2.0      3.0    11.0  2047.0   17.0   0.0
3.0      0.0     1.0     6.0  545.0   0.0
4.0      0.0     0.0     0.0    1.0  16.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9838854475267079
Precision total:  0.978007667155827
Recall total:  0.9754028236072573
F1 total:  0.9764234163571771
BACC total:  0.9754028236072573
MCC total:  0.9726851550972443


In [149]:
# model1 = tree.DecisionTreeClassifier()
# model2 = KNeighborsClassifier()
# model3= LogisticRegression()

# model1.fit(x_train,y_train)
# model2.fit(x_train,y_train)
# model3.fit(x_train,y_train)

# pred1=model1.predict_proba(x_test)
# pred2=model2.predict_proba(x_test)
# pred3=model3.predict_proba(x_test)

# finalpred=(preds_svm_prob +
#             preds_ada_prob +
#             preds_knn_prob +
#             preds_rf_prob +
#             preds_dnn_prob +
#             preds_lgbm_prob +
#             preds_mlp_prob
#             )/7

In [None]:
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Summary', file = f)
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 00', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_00, file = f)

# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 01', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy LR: ', lr_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy Voting: ', voring_acc, file = f)
# with open(output_file_name, "a") as f: print('Accuracy catboost: ', cat_acc_01, file = f)

In [151]:
with open(output_file_name, "a") as f: print('-----------------------', file = f)
with open(output_file_name, "a") as f: print('Summary', file = f)
with open(output_file_name, "a") as f: print('-----------------------', file = f)
with open(output_file_name, "a") as f: print('Level 00', file = f)

with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy cat: ', cat_acc_00, file = f)
with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_00, file = f)


with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision cat: ', cat_pre_00, file = f)
with open(output_file_name, "a") as f: print('Precision xgb: ', xgb_pre_00, file = f)

with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall cat: ', cat_rec_00, file = f)
with open(output_file_name, "a") as f: print('Recall xgb: ', xgb_rec_00, file = f)

with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 cat: ', cat_f1_00, file = f)
with open(output_file_name, "a") as f: print('F1 xgb: ', xgb_f1_00, file = f)


with open(output_file_name, "a") as f: print('-----------------------', file = f)
with open(output_file_name, "a") as f: print('Level 01', file = f)

with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy LR: ', lr_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy Voting: ', voting_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy catboost: ', cat_acc_01, file = f)
with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_01, file = f)

with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision LR: ', lr_pre_01, file = f)
# with open(output_file_name, "a") as f: print('Precision Voting: ', voting_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision catboosting: ', cat_pre_01, file = f)
with open(output_file_name, "a") as f: print('Precision xgboost: ', xgb_pre_01, file = f)

with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall LR: ', lr_rec_01, file = f)
# with open(output_file_name, "a") as f: print('Recall Voting: ', voting_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall catboosting: ', cat_rec_01, file = f)
with open(output_file_name, "a") as f: print('Recall xgboost: ', xgb_rec_01, file = f)

with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 LR: ', lr_f1_01, file = f)
# with open(output_file_name, "a") as f: print('F1 Voting: ', voting_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 catboosting: ', cat_f1_01, file = f)
with open(output_file_name, "a") as f: print('F1 xgboost: ', xgb_f1_01, file = f)




In [None]:
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Summary', file = f)
# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 00', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy cat: ', cat_acc_00, file = f)
# with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_00, file = f)


# # with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision cat: ', cat_pre_00, file = f)
# # with open(output_file_name, "a") as f: print('Precision xgb: ', xgb_pre_00, file = f)

# # with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall cat: ', cat_rec_00, file = f)
# # with open(output_file_name, "a") as f: print('Recall xgb: ', xgb_rec_00, file = f)

# # with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 cat: ', cat_f1_00, file = f)
# # with open(output_file_name, "a") as f: print('F1 xgb: ', xgb_f1_00, file = f)


# with open(output_file_name, "a") as f: print('-----------------------', file = f)
# with open(output_file_name, "a") as f: print('Level 01', file = f)

# with open(output_file_name, "a") as f: print('Accuracy ada: ', ada_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy dnn: ', dnn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy svm: ', svm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy knn: ', knn_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy mlp: ', mlp_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy lgbm: ', lgbm_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy rf: ', rf_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy LR: ', lr_acc_01, file = f)
# # with open(output_file_name, "a") as f: print('Accuracy Voting: ', voting_acc, file = f)
# with open(output_file_name, "a") as f: print('Accuracy catboost: ', cat_acc_01, file = f)
# with open(output_file_name, "a") as f: print('Accuracy xgb: ', xgb_acc_01, file = f)

# # with open(output_file_name, "a") as f: print('Precision ada: ', ada_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision dnn: ', dnn_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision svm: ', svm_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision knn: ', knn_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision mlp: ', mlp_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision lgbm: ', lgbm_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision rf: ', rf_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision LR: ', lr_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision Voting: ', voting_pre, file = f)
# # with open(output_file_name, "a") as f: print('Precision catboosting: ', cat_pre_01, file = f)
# # with open(output_file_name, "a") as f: print('Precision xgboost: ', xgb_pre_01, file = f)

# # with open(output_file_name, "a") as f: print('Recall ada: ', ada_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall dnn: ', dnn_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall svm: ', svm_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall knn: ', knn_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall mlp: ', mlp_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall lgbm: ', lgbm_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall rf: ', rf_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall LR: ', lr_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall Voting: ', voting_rec, file = f)
# # with open(output_file_name, "a") as f: print('Recall catboosting: ', cat_rec_01, file = f)
# # with open(output_file_name, "a") as f: print('Recall xgboost: ', xgb_rec_01, file = f)

# # with open(output_file_name, "a") as f: print('F1 ada: ', ada_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 dnn: ', dnn_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 svm: ', svm_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 knn: ', knn_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 mlp: ', mlp_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 lgbm: ', lgbm_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 rf: ', rf_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 LR: ', lr_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 Voting: ', voting_f1, file = f)
# # with open(output_file_name, "a") as f: print('F1 catboosting: ', cat_f1_01, file = f)
# # with open(output_file_name, "a") as f: print('F1 xgboost: ', xgb_f1_01, file = f)




In [None]:

# import sklearn
# from sklearn.model_selection import train_test_split
# split = 0.7

# #AUC ROC
# #---------------------------------------------------------------------

# #AUCROC
# aucroc =[]
# y_array = [y_0,y_1,y_2,y_3,y_4]
# for j in range(0,len(y_array)):
#     # print(j)
#     #------------------------------------------------------------------------------------------------------------
#     X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y_array[j], train_size=split)
    
#     # evaluate the model

#     knn_clf.fit(X_train,y_train)
#     y_pred=knn_clf.predict(X_test) #These are the predicted output value
#     # y_pred = knn_clf.predict_proba(X_test)

    
#     y_scores = y_pred
#     y_true = y_test

#     # model = LGBMClassifier()
#     # model.fit(X_train, y_train)
#     # y_pred = model.predict(X_test)


#     y_scores = y_pred
#     y_true = y_test
    
#     # Calculate AUC-ROC score
#     auc_roc_score= roc_auc_score(y_true, y_scores,  average='weighted')  # Use 'micro' or 'macro' for different averaging strategies
#     # print("AUC-ROC Score class:", auc_roc_score)
#     aucroc.append(auc_roc_score)
#     #-------------------------------------------------------------------------------------------------------    -----
#     # Calculate the average
# average = sum(aucroc) / len(aucroc)

# # Display the result
# # with open(output_file_name, "a") as f:print("AUC ROC Average:", average, file = f)
# print("AUC ROC Average:", average)

# #End AUC ROC

In [152]:
lr_acc_00 = 0 
voting_acc_00 = 0

lr_pre_00 = 0 
voting_pre_00 = 0

lr_rec_00 = 0 
voting_rec_00 = 0

lr_f1_00 = 0 
voting_f1_00 = 0

In [153]:

voting_acc_01 = 0


voting_pre_01 = 0


voting_rec_01 = 0


voting_f1_01 = 0

In [154]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_acc = [ada_acc_00,
                svm_acc_00,
                dnn_acc_00,
                mlp_acc_00,
                knn_acc_00,
                cat_acc_00,
                xgb_acc_00,
                lgbm_acc_00,
                rf_acc_00,
                lr_acc_00,
                voting_acc_00]  
level_01_acc = [ada_acc_01,
                svm_acc_01,
                dnn_acc_01,
                mlp_acc_01,
                knn_acc_01,
                cat_acc_01,
                xgb_acc_01,
                lgbm_acc_01,
                rf_acc_01,
                lr_acc_01,
                voting_acc_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_acc[i]
    data[i][2] = level_01_acc[i]


 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Accuracy", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+------------+--------------------+--------------------+
| Accuracy   | Level 00           | Level 01           |
| ADA        | 0.615454019041463  | 0.2292844959152527 |
+------------+--------------------+--------------------+
| SVM        | 0.9693909155792564 | 0.8195529221653649 |
+------------+--------------------+--------------------+
| DNN        | 0.8585760648540918 | 0.777179280007182  |
+------------+--------------------+--------------------+
| MLP        | 0.9924049610148264 | 0.9377412694137714 |
+------------+--------------------+--------------------+
| KNN        | 0.9889441010517244 | 0.9663793877367807 |
+------------+--------------------+--------------------+
| CAT        | 0.9920144359606243 | 0.8393931232606159 |
+------------+--------------------+--------------------+
| XGB        | 0.9919875031982656 | 0.9838854475267079 |
+------------+--------------------+--------------------+
| LGBM       | 0.9805545455769671 | 0.9770176856091211 |
+------------+-----------------

In [155]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_pre = [ada_pre_00,
                svm_pre_00,
                dnn_pre_00,
                mlp_pre_00,
                knn_pre_00,
                cat_pre_00,
                xgb_pre_00,
                lgbm_pre_00,
                rf_pre_00,
                lr_pre_00,
                voting_pre_00]  
level_01_pre = [ada_pre_01,
                svm_pre_01,
                dnn_pre_01,
                mlp_pre_01,
                knn_pre_01,
                cat_pre_01,
                xgb_pre_01,
                lgbm_pre_01,
                rf_pre_01,
                lr_pre_01,
                voting_pre_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_pre[i]
    data[i][2] = level_01_pre[i]


 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Precision", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+-------------+---------------------+--------------------+
| Precision   | Level 00            | Level 01           |
| ADA         | 0.632923263976516   | 0.5260816525863901 |
+-------------+---------------------+--------------------+
| SVM         | 0.8827645887402553  | 0.6302904680558632 |
+-------------+---------------------+--------------------+
| DNN         | 0.35954806058543765 | 0.4484352807140194 |
+-------------+---------------------+--------------------+
| MLP         | 0.9350853557454828  | 0.9352098919045874 |
+-------------+---------------------+--------------------+
| KNN         | 0.9315441895221477  | 0.9471681715525812 |
+-------------+---------------------+--------------------+
| CAT         | 0.9317815323212976  | 0.6878263380381966 |
+-------------+---------------------+--------------------+
| XGB         | 0.916516593252323   | 0.978007667155827  |
+-------------+---------------------+--------------------+
| LGBM        | 0.7902176964513938  | 0.777514374044633 

In [156]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_rec = [ada_rec_00,
                svm_rec_00,
                dnn_rec_00,
                mlp_rec_00,
                knn_rec_00,
                cat_rec_00,
                xgb_rec_00,
                lgbm_rec_00,
                rf_rec_00,
                lr_rec_00,
                voting_rec_00]  
level_01_rec = [ada_rec_01,
                svm_rec_01,
                dnn_rec_01,
                mlp_rec_01,
                knn_rec_01,
                cat_rec_01,
                xgb_rec_01,
                lgbm_rec_01,
                rf_rec_01,
                lr_rec_01,
                voting_rec_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_rec[i]
    data[i][2] = level_01_rec[i]

 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Recall", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+----------+--------------------+---------------------+
| Recall   | Level 00           | Level 01            |
| ADA      | 0.541715952581571  | 0.36308965924515896 |
+----------+--------------------+---------------------+
| SVM      | 0.784198426409409  | 0.5806167198793248  |
+----------+--------------------+---------------------+
| DNN      | 0.3900990696636019 | 0.45379527103143447 |
+----------+--------------------+---------------------+
| MLP      | 0.8924829434684349 | 0.9575907528934161  |
+----------+--------------------+---------------------+
| KNN      | 0.8698084938389755 | 0.8918226188895708  |
+----------+--------------------+---------------------+
| CAT      | 0.8334771923068951 | 0.6752025512643705  |
+----------+--------------------+---------------------+
| XGB      | 0.8943712402251023 | 0.9754028236072573  |
+----------+--------------------+---------------------+
| LGBM     | 0.82425692517516   | 0.8073372511731085  |
+----------+--------------------+---------------

In [157]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(3)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_f1 = [ada_f1_00,
                svm_f1_00,
                dnn_f1_00,
                mlp_f1_00,
                knn_f1_00,
                cat_f1_00,
                xgb_f1_00,
                lgbm_f1_00,
                rf_f1_00,
                lr_f1_00,
                voting_f1_00]  
level_01_f1 = [ada_f1_01,
                svm_f1_01,
                dnn_f1_01,
                mlp_f1_01,
                knn_f1_01,
                cat_f1_01,
                xgb_f1_01,
                lgbm_f1_01,
                rf_f1_01,
                lr_f1_01,
                voting_f1_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]
    data[i][1] = level_00_f1[i]
    data[i][2] = level_01_f1[i]


 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["F1", "Level 00", "Level 01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)


+--------+--------------------+---------------------+
| F1     | Level 00           | Level 01            |
| ADA    | 0.5048072353676654 | 0.40402296705975915 |
+--------+--------------------+---------------------+
| SVM    | 0.8204655892263848 | 0.5940391791421344  |
+--------+--------------------+---------------------+
| DNN    | 0.369539913853707  | 0.44749970805741135 |
+--------+--------------------+---------------------+
| MLP    | 0.9094645814131586 | 0.9458166048227209  |
+--------+--------------------+---------------------+
| KNN    | 0.8947376203615685 | 0.9125366326903638  |
+--------+--------------------+---------------------+
| CAT    | 0.8627391808106865 | 0.6752518450356755  |
+--------+--------------------+---------------------+
| XGB    | 0.9045829083245825 | 0.9764234163571771  |
+--------+--------------------+---------------------+
| LGBM   | 0.8028434238196789 | 0.7901736960971538  |
+--------+--------------------+---------------------+
| RF     | 0.620981596041255

In [None]:
from tabulate import tabulate

# Assuming data is a 110x4 list, where each row is a sublist
# data =  [["Row {} Col {}".format(i + 1, j + 1) for j in range(4)] for i in range(110)]
data = [["" for _ in range(9)] for _ in range(12)]

# Manually insert data at specific row and column
# data[0][0] = "ADA"
# data[1][0] = "DNN"
# data[2][0] = "SVM"
# data[3][0] = "ADA"
# data[4][0] = "DNN"
# data[2][0] = "SVM"


names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING'
                ]
level_00_f1 = [ada_f1_00,
                svm_f1_00,
                dnn_f1_00,
                mlp_f1_00,
                knn_f1_00,
                cat_f1_00,
                xgb_f1_00,
                lgbm_f1_00,
                rf_f1_00,
                lr_f1_00,
                voting_f1_00]  
level_01_f1 = [ada_f1_01,
                svm_f1_01,
                dnn_f1_01,
                mlp_f1_01,
                knn_f1_01,
                cat_f1_01,
                xgb_f1_01,
                lgbm_f1_01,
                rf_f1_01,
                lr_f1_01,
                voting_f1_01]  
                 

for i in range(0,len(names_models)):
    data[i][0] =  names_models[i]

    data[i][1] = level_00_acc[i]
    data[i][2] = level_01_acc[i]

    data[i][3] = level_00_pre[i] 
    data[i][4] = level_01_pre[i]

    data[i][5] = level_00_rec[i] 
    data[i][6] = level_01_rec[i]

    data[i][7] = level_00_f1[i]
    data[i][8] = level_01_f1[i]




 
# data[0][1] = ada_acc_00
# data

# Define column headers
headers = ["Models", "ACC-00", " ACC-01","PRE-00", " PRE-01","REC-00", " REC-01","F1-00", " F1-01",]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)
with open(output_file_name, "a") as f: print(table, file = f)
