#### Import Libraries

In [317]:
#!/usr/bin/env python       
# coding: utf-8

# importing required libraries
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import time
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from os import path
import sklearn
from sklearn import metrics, preprocessing
from sklearn.metrics import (precision_score, recall_score, f1_score, roc_auc_score, 
                             roc_curve, auc, confusion_matrix, accuracy_score, 
                             balanced_accuracy_score, matthews_corrcoef, classification_report)
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder, LabelEncoder, 
                                   MinMaxScaler, OneHotEncoder, Normalizer, 
                                   MaxAbsScaler, RobustScaler, PowerTransformer, LabelBinarizer)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, make_classification
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import mutual_info_classif
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
import catboost
import xgboost as xgb
from scipy.stats import mode
from tabulate import tabulate
import shap
import joblib

start_program = time.time()

#### Parameters to use or load models, but yet to be implemented, it is not really functional.

In [318]:
# First ensemble with NSL-KDD
# Parameters
# Few parameters are not fully implemented yet

#----------------------------------------------
# 0 for not using it as base learner
# 1 for using it as base learner
# not implemented but in the code in someparts
use_model_ada = 1 
use_model_dnn = 1 
use_model_mlp = 1 
use_model_lgbm = 1 
use_model_rf = 1 
use_model_svm = 1
use_model_knn = 1 
#----------------------------------------------
# 0 for training the model
# 1 for using the saved version of the model

load_model_ada = 1
load_model_dnn = 1 
load_model_mlp = 1 
load_model_lgbm = 1 
load_model_rf = 1                               
load_model_svm = 1
load_model_knn = 1 
#----------------------------------------------

# Implemented
#----------------------------------------------
# feature_selection_bit = 0 # OFF
feature_selection_bit = 1# On
pick_prob = 1 # set equal one to choose the dataset with probabilities, set to 0 to choose one with the classes.
# pick_prob = 0
generate_feature_importance = 0 # Generate Shap graphs


# choose the features wanted to load.
column_features = [
                    # 'dnn',
                #    'rf',
                   'lgbm',
                #    'ada',
                   'knn',
                   'mlp',
                   'svm',
                #    'cat',
                #    'xgb',
                   'lr',
                   'dt',
                   'label']


#### Pick name of the file according to parameter

In [319]:
# Specify the name of the output text file
if feature_selection_bit == 0:

    if pick_prob == 0:
        output_file_name = "ensemble_level_01_all_features_classes.txt"
        with open(output_file_name, "w") as f: print('---------------------------------------------------------------------------------', file = f)
        with open(output_file_name, "a") as f: print('----ensemble_level_01_all_features_classes--', file = f)

    elif pick_prob == 1:
        output_file_name = "ensemble_level_01_all_features_probabilites.txt"
        with open(output_file_name, "w") as f: print('---------------------------------------------------------------------------------', file = f)
        with open(output_file_name, "a") as f: print('----ensemble_level_01_all_features_probabilites--', file = f)

elif feature_selection_bit == 1:
    if pick_prob == 0:
        output_file_name = "ensemble_level_01_feature_selection_classes.txt"
        with open(output_file_name, "w") as f: print('---------------------------------------------------------------------------------', file = f)
        with open(output_file_name, "a") as f: print('----ensemble_level_01_feature_selection_classes--', file = f)
    elif pick_prob == 1:
        output_file_name = "ensemble_level_01_feature_selection_probabilites.txt"
        with open(output_file_name, "w") as f: print('---------------------------------------------------------------------------------', file = f)
        with open(output_file_name, "a") as f: print('----ensemble_level_01_feature_selection_probabilites--', file = f)

In [320]:
start_program = time.time()

#### Function Definition

In [321]:


def confusion_metrics (name_model,predictions,true_labels):

    name = name_model
    pred_label = predictions
    y_test_01 = true_labels 

    with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print(name, file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)

    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)

    return Acc, Precision, Recall, F1, BACC, MCC


#### Load created datasets

In [322]:

df_level_00_1=pd.read_csv('base_models_prob_feature_selection.csv')
df_level_00_0=pd.read_csv('base_models_class_feature_selection.csv')


In [323]:
df_level_00_1

Unnamed: 0,dnn,rf,lgbm,ada,knn,mlp,svm,cat,xgb,lr,dt,label
0,0.921925,0.962559,1.000000,0.315597,1.0,0.999989,0.009174,0.995496,0.994228,0.997583,1.0,0.0
1,0.916570,0.962559,1.000000,0.323321,1.0,1.000000,0.506630,0.995372,0.993423,0.999931,1.0,0.0
2,0.931765,0.960068,0.999999,0.312277,1.0,0.999954,0.891043,0.994298,0.993254,0.993783,1.0,0.0
3,0.571414,0.987811,1.000000,0.283245,1.0,1.000000,0.507230,0.996083,0.997760,0.999845,1.0,1.0
4,0.570172,0.987811,1.000000,0.283245,1.0,1.000000,0.992353,0.995679,0.997760,0.999840,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4495,0.563370,0.829673,0.999999,0.281672,1.0,1.000000,0.140634,0.982723,0.991977,0.998422,1.0,1.0
4496,0.537440,0.987811,1.000000,0.292088,1.0,0.999999,0.052616,0.995899,0.997760,0.999914,1.0,1.0
4497,0.476535,0.880875,0.999988,0.307149,1.0,0.996455,0.783083,0.914719,0.948133,0.823161,1.0,0.0
4498,0.812448,0.923602,0.999999,0.342729,1.0,1.000000,0.836484,0.990981,0.983675,0.993822,1.0,0.0


In [324]:
y1 = df_level_00_1.pop('label')
X1 = df_level_00_1
df_level_00_1 = X1.assign(label = y1)

In [325]:
y0 = df_level_00_0.pop('label')
X0 = df_level_00_0
df_level_00_0 = X0.assign(label = y0)

#### Toggle the bit to 1 if feature select is wanted

In [326]:
if feature_selection_bit == 1:

    from sklearn.feature_selection import mutual_info_classif
    %matplotlib inline

    # Compute information gain using mutual information
    importances0 = mutual_info_classif(X0, y0)
    importances1 = mutual_info_classif(X1, y1)


    feat_importances0 = pd.Series(importances0, df_level_00_0.columns[0:len(df_level_00_0.columns)-1])
    feat_importances1= pd.Series(importances1, df_level_00_1.columns[0:len(df_level_00_1.columns)-1])

    # feat_importances.plot(kind='barh', color = 'teal')
    feat_importances_sorted0 = feat_importances0.sort_values( ascending=False)
    feat_importances_sorted1 = feat_importances1.sort_values( ascending=False)


    # Print or use the sorted DataFrame
    print(feat_importances_sorted0)
    print(feat_importances_sorted1)

    # feat_importances_sorted.plot(kind='barh', color = 'teal')
    # feat_importances_sorted
    top_features0 = feat_importances_sorted0.nlargest(5)
    top_features1 = feat_importances_sorted1.nlargest(5)

    top_feature_names0 = top_features0.index.tolist()
    top_feature_names1 = top_features1.index.tolist()


    print("Top 5 feature names:")
    print(top_feature_names0)
    print(top_feature_names1)

    column_features0 = top_feature_names0
    column_features1 = top_feature_names1


lgbm    0.993279
xgb     0.972628
dt      0.956487
cat     0.949534
mlp     0.947628
knn     0.895586
lr      0.876125
svm     0.848745
rf      0.778075
dnn     0.551505
ada     0.492850
dtype: float64
rf      0.866737
ada     0.845186
xgb     0.743265
dnn     0.699221
cat     0.476201
lgbm    0.446241
svm     0.342896
mlp     0.186961
lr      0.170701
knn     0.012751
dt      0.004745
dtype: float64
Top 5 feature names:
['lgbm', 'xgb', 'dt', 'cat', 'mlp']
['rf', 'ada', 'xgb', 'dnn', 'cat']


In [327]:

if feature_selection_bit == 1:
    df_level_00_0 = df_level_00_0[column_features0]
    df_level_00_1 = df_level_00_1[column_features1]


In [328]:
df_level_00_1


Unnamed: 0,rf,ada,xgb,dnn,cat
0,0.962559,0.315597,0.994228,0.921925,0.995496
1,0.962559,0.323321,0.993423,0.916570,0.995372
2,0.960068,0.312277,0.993254,0.931765,0.994298
3,0.987811,0.283245,0.997760,0.571414,0.996083
4,0.987811,0.283245,0.997760,0.570172,0.995679
...,...,...,...,...,...
4495,0.829673,0.281672,0.991977,0.563370,0.982723
4496,0.987811,0.292088,0.997760,0.537440,0.995899
4497,0.880875,0.307149,0.948133,0.476535,0.914719
4498,0.923602,0.342729,0.983675,0.812448,0.990981


In [329]:
df_level_00_0

Unnamed: 0,lgbm,xgb,dt,cat,mlp
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...
4495,1.0,1.0,1.0,1.0,1.0
4496,1.0,1.0,1.0,1.0,1.0
4497,0.0,0.0,0.0,0.0,0.0
4498,0.0,0.0,0.0,0.0,0.0


#### If the probabilities option is wanted select 1, if it is classses select 0

In [330]:
if pick_prob == 1:
    df_level_01 = df_level_00_1
else: 
    df_level_01 = df_level_00_0

df_level_01 = df_level_01.assign(label = y1)

y_01 = df_level_01.pop('label') 
    
X_01 = df_level_01
df_level_01 = df_level_01.assign(label = y_01)

split = 0.7
X_train_01,X_test_01, y_train_01, y_test_01 = sklearn.model_selection.train_test_split(X_01, y_01, train_size=split)

## Training the stronger model - STACK level 01

In [331]:
#----------------------------------------------------------------
with open(output_file_name, "a") as f: print('Stack model - Strong learner - level 01', file = f)
with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)

In [332]:
X_test_01

Unnamed: 0,rf,ada,xgb,dnn,cat
947,0.960068,0.315597,0.994228,0.937316,0.994870
308,0.923602,0.329028,0.982133,0.848824,0.992103
2607,0.980553,0.285100,0.997526,0.538876,0.993727
779,0.303978,0.355229,0.983888,0.183852,0.932000
1063,0.655765,0.314990,0.890187,0.745144,0.968346
...,...,...,...,...,...
1855,0.871801,0.302229,0.987991,0.431270,0.978155
3627,0.369449,0.280640,0.976961,0.022077,0.828767
1287,0.987811,0.283245,0.997760,0.573287,0.995679
1446,0.962559,0.307769,0.993254,0.928510,0.994358


### Decision tree

In [333]:

start = time.time()

# Create a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
# Train the classifier on the training data
dt_classifier.fit(X_train_01, y_train_01)
# Make predictions on the test data
preds_dt = dt_classifier.predict(X_test_01)
# Evaluate the accuracy of the model
preds_dt_prob = dt_classifier.predict_proba(X_test_01)

pred_label = preds_dt
name = 'dt'
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    

globals()[f"{name}_acc_00"] = Acc
globals()[f"{name}_pre_00"] = Precision
globals()[f"{name}_rec_00"] = Recall
globals()[f"{name}_f1_00"] = F1
globals()[f"{name}_bacc_00"] = BACC
globals()[f"{name}_mcc_00"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_00"] = time_taken
print(time_taken)

---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0    2.0   3.0  4.0
0.0  691.0    7.0    8.0   1.0  1.0
1.0    6.0  470.0    2.0   0.0  0.0
2.0    5.0    6.0  102.0   4.0  1.0
3.0    0.0    0.0    2.0  43.0  1.0
4.0    0.0    0.0    0.0   0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9674074074074074
Precision total:  0.7495971091794891
Recall total:  0.7516883374496002
F1 total:  0.7504987137135265
BACC total:  0.9396104218120002
MCC total:  0.944987840594813
0.05147409439086914


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
y_pred contains classes not in y_true


### Voting

In [334]:
start = time.time()
    
if pick_prob == 0:
    # Voting start

    import pandas as pd
    from scipy.stats import mode

    df = X_test_01
    # Extract predictions columns
    
   
    predictions = df.loc[:, ~df.columns.isin(['label'])] 

    # Use the mode function along axis 1 to get the most common prediction for each row
    ensemble_predictions, _ = mode(predictions.values, axis=1)

    # Add the ensemble predictions to the DataFrame
    df['ensemble'] = ensemble_predictions.astype(int)

    # Display the DataFrame with ensemble predictions
    print(df)

    pred_label = df ['ensemble'].values
    df.pop('ensemble')

    #testing metrics def
    name = 'voting'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    end = time.time()
    time_taken = end - start

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    globals()[f"{name}_time_01"] = time_taken
   
else:
    name = 'voting'
    globals()[f"{name}_acc_01"] = 0
    globals()[f"{name}_pre_01"] = 0
    globals()[f"{name}_rec_01"] = 0
    globals()[f"{name}_f1_01"] = 0
    globals()[f"{name}_bacc_01"] = 0
    globals()[f"{name}_mcc_01"] = 0
    globals()[f"{name}_time_01"] = 9999
   

### Average

In [335]:
start = time.time()

if 0 == 0:
    # Average start

    import pandas as pd
    from scipy.stats import mode

    df = X_test_01
    predictions = df.loc[:, ~df.columns.isin(['label'])] 
   

    column_sums = df.sum(axis=1)
    row_average = df.mean(axis=1)

    # Approximate the result to the closest integer
    rounded_average = row_average.round().astype(int)

    df['results'] = rounded_average
    print(df)
 
    pred_label = df ['results'].values
    df.pop('results')

    with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

    name = 'avg'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    

    end = time.time()
    time_taken = end - start
    
    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    globals()[f"{name}_time_01"] = time_taken

    

            rf       ada       xgb       dnn       cat  results
947   0.960068  0.315597  0.994228  0.937316  0.994870        1
308   0.923602  0.329028  0.982133  0.848824  0.992103        1
2607  0.980553  0.285100  0.997526  0.538876  0.993727        1
779   0.303978  0.355229  0.983888  0.183852  0.932000        1
1063  0.655765  0.314990  0.890187  0.745144  0.968346        1
...        ...       ...       ...       ...       ...      ...
1855  0.871801  0.302229  0.987991  0.431270  0.978155        1
3627  0.369449  0.280640  0.976961  0.022077  0.828767        0
1287  0.987811  0.283245  0.997760  0.573287  0.995679        1
1446  0.962559  0.307769  0.993254  0.928510  0.994358        1
1120  0.619872  0.290535  0.886541  0.953510  0.606012        1

[1350 rows x 6 columns]
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
      0.0    1.0  2.0  3.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


## Weighed Average

In [336]:

try:
    if pick_prob == 1:
        column_features = column_features1
    else: column_features = column_features0
except:
    None
feature_selection_columns_in_order_of_importance = column_features[:-1]





In [337]:
start = time.time()

if 0 == 0:
    # Average start

    import pandas as pd
    from scipy.stats import mode


    df = X_test_01[feature_selection_columns_in_order_of_importance]
    # Extract predictions columns
    predictions = df.loc[:, ~df.columns.isin(['label'])] #df[column_features]

    # weight
    weights_values = []

    # linear weight distribution
    for i in range(0,len(~df.columns.isin(['label']))):
        weights_values.append(i/(len(~df.columns.isin(['label']))-1))
    print(weights_values)
    # weights_values = [10,3,2,2.3]
    print(weights_values)
    print(df)
    weighted_average = df.multiply(weights_values).sum(axis=1) / sum(weights_values)
    print(weighted_average)
    # Approximate the result to the closest integer
    rounded_weighted_average = weighted_average.round().astype(int)

    print(rounded_weighted_average)

    df['results'] = rounded_weighted_average
    print(df)
 
    pred_label = df ['results'].values

    df.pop('results')

    #testing metrics def
    name = 'weighed_avg'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    

    end = time.time()
    time_taken = end - start
    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    globals()[f"{name}_time_01"] = time_taken
    
    


[0.0, 0.3333333333333333, 0.6666666666666666, 1.0]
[0.0, 0.3333333333333333, 0.6666666666666666, 1.0]
            rf       ada       xgb       dnn
947   0.960068  0.315597  0.994228  0.937316
308   0.923602  0.329028  0.982133  0.848824
2607  0.980553  0.285100  0.997526  0.538876
779   0.303978  0.355229  0.983888  0.183852
1063  0.655765  0.314990  0.890187  0.745144
...        ...       ...       ...       ...
1855  0.871801  0.302229  0.987991  0.431270
3627  0.369449  0.280640  0.976961  0.022077
1287  0.987811  0.283245  0.997760  0.573287
1446  0.962559  0.307769  0.993254  0.928510
1120  0.619872  0.290535  0.886541  0.953510

[1350 rows x 4 columns]
947     0.852667
308     0.806627
2607    0.649463
779     0.479094
1063    0.721799
          ...   
1855    0.595337
3627    0.383465
1287    0.666438
1446    0.846635
1120    0.820691
Length: 1350, dtype: float64
947     1
308     1
2607    1
779     0
1063    1
       ..
1855    1
3627    0
1287    1
1446    1
1120    1
Length:

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


      0.0    1.0  2.0  3.0
0.0  18.0  690.0  0.0  0.0
1.0  19.0  459.0  0.0  0.0
2.0  29.0   89.0  0.0  0.0
3.0  46.0    0.0  0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.35333333333333333
Precision total:  0.13286839372259404
Recall total:  0.246418693709666
F1 total:  0.14471686849735632
BACC total:  0.246418693709666
MCC total:  -0.049612990936082714


## bagging  with DT

In [338]:


start = time.time()
base_classifier = DecisionTreeClassifier(random_state=42)

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

# Evaluate accuracy
# accuracy = accuracy_score(y_test_01, y_pred)
# print(f'Accuracy: {accuracy}')

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_dt'
pred_label = y_pred
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    

end = time.time()
time_taken = end - start
globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0    2.0   3.0  4.0
0.0  694.0    2.0   10.0   1.0  1.0
1.0    4.0  471.0    3.0   0.0  0.0
2.0    2.0    5.0  109.0   1.0  1.0
3.0    0.0    1.0    2.0  43.0  0.0
4.0    0.0    0.0    0.0   0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9755555555555555
Precision total:  0.7618629847341546
Recall total:  0.7648186118982208
F1 total:  0.7632005729380337
BACC total:  0.956023264872776
MCC total:  0.9588523534695824


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
y_pred contains classes not in y_true


## bagging  with SVM


In [339]:
start = time.time()

# Instantiate the SGDClassifier with additional hyperparameters
svm_01 = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)

# # Define the base classifier (Decision Tree in this case)
base_classifier = svm_01

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)


with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_svm'
pred_label = y_pred
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC

end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  618.0   82.0   6.0   2.0
1.0   18.0  449.0   7.0   4.0
2.0   26.0   51.0  31.0  10.0
3.0    0.0   15.0   0.0  31.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8362962962962963
Precision total:  0.7624371170445091
Recall total:  0.6872092019375746
F1 total:  0.6967303335701458
BACC total:  0.6872092019375746
MCC total:  0.7265015640445596


## bagging with MLP

In [340]:
start = time.time()

# create MLPClassifier instance
mlp_01 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)

base_classifier = mlp_01

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

# Evaluate accuracy
# accuracy = accuracy_score(y_test_01, y_pred)
# print(f'Accuracy: {accuracy}')

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_mlp'
pred_label = y_pred
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0  4.0
0.0  634.0   62.0  11.0   0.0  1.0
1.0    6.0  445.0  17.0  10.0  0.0
2.0   20.0   43.0  50.0   5.0  0.0
3.0    0.0    0.0   0.0  46.0  0.0
4.0    0.0    0.0   0.0   0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8703703703703703
Precision total:  0.6329641942756696
Recall total:  0.6500342765288514
F1 total:  0.6325353010807875
BACC total:  0.8125428456610642
MCC total:  0.7846958164611808


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
y_pred contains classes not in y_true


## bagging knn

In [341]:
knn_01=KNeighborsClassifier(n_neighbors = 5)
start = time.time()

base_classifier = knn_01

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

# Evaluate accuracy
# accuracy = accuracy_score(y_test_01, y_pred)
# print(f'Accuracy: {accuracy}')

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_knn'

pred_label = y_pred


metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  688.0    3.0  15.0   2.0
1.0   10.0  451.0  16.0   1.0
2.0   14.0    4.0  92.0   8.0
3.0    0.0    0.0   4.0  42.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.942962962962963
Precision total:  0.8669676427610695
Recall total:  0.9019926379977163
F1 total:  0.8830486387075925
BACC total:  0.9019926379977163
MCC total:  0.9041709349963909


## bagging LogRegression

In [342]:
start = time.time()

#Logistic Regression
print('---------------------------------------------------------------------------------')
print('Defining baggin Logistic Regression Model')
print('---------------------------------------------------------------------------------')
logreg_01 = LogisticRegression()


base_classifier = logreg_01

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

# Evaluate accuracy
# accuracy = accuracy_score(y_test_01, y_pred)
# print(f'Accuracy: {accuracy}')

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_lr'

pred_label = y_pred


metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
Defining baggin Logistic Regression Model
---------------------------------------------------------------------------------


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the doc

---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  625.0   74.0   7.0   2.0
1.0   13.0  445.0  15.0   5.0
2.0   24.0   50.0  38.0   6.0
3.0    0.0    2.0   0.0  44.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8533333333333334
Precision total:  0.7821766050249246
Recall total:  0.7730715855284187
F1 total:  0.7605427641752488
BACC total:  0.7730715855284187
MCC total:  0.7555371767540533


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Bagging ADA

In [343]:
start = time.time()

ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)

base_classifier = ada

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_ada'

pred_label = y_pred


metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0   1.0   2.0   3.0
0.0  642.0  58.0   6.0   2.0
1.0  400.0  66.0  12.0   0.0
2.0   35.0  58.0  22.0   3.0
3.0    0.0   0.0   5.0  41.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.5711111111111111
Precision total:  0.5847327194759676
Recall total:  0.5306500001541673
F1 total:  0.520142682306318
BACC total:  0.5306500001541673
MCC total:  0.22413722600921565


### Bagging CAT

In [344]:
start = time.time()

bag_cat = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')

base_classifier = bag_cat

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_cat'

pred_label = y_pred


metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken



0:	learn: 1.3321561	total: 7.35ms	remaining: 727ms
1:	learn: 1.1460055	total: 13.2ms	remaining: 645ms
2:	learn: 1.0076801	total: 18.5ms	remaining: 599ms
3:	learn: 0.8942871	total: 24.2ms	remaining: 580ms
4:	learn: 0.8062557	total: 29.8ms	remaining: 566ms
5:	learn: 0.7359374	total: 35ms	remaining: 548ms
6:	learn: 0.6730170	total: 40.8ms	remaining: 542ms
7:	learn: 0.6202162	total: 46.2ms	remaining: 531ms
8:	learn: 0.5713458	total: 52.5ms	remaining: 531ms
9:	learn: 0.5283769	total: 58.3ms	remaining: 525ms
10:	learn: 0.4895663	total: 63.9ms	remaining: 517ms
11:	learn: 0.4569699	total: 68.9ms	remaining: 505ms
12:	learn: 0.4245054	total: 73.9ms	remaining: 494ms
13:	learn: 0.3996663	total: 79.3ms	remaining: 487ms
14:	learn: 0.3775880	total: 85.4ms	remaining: 484ms
15:	learn: 0.3568403	total: 91.4ms	remaining: 480ms
16:	learn: 0.3364731	total: 97.2ms	remaining: 475ms
17:	learn: 0.3189125	total: 103ms	remaining: 469ms
18:	learn: 0.3020466	total: 109ms	remaining: 465ms
19:	learn: 0.2863941	total

### Baggin LGBM


In [345]:
start = time.time()

lgbm = LGBMClassifier()

base_classifier = lgbm

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_lgbm'

pred_label = y_pred


metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0    2.0   3.0  4.0
0.0  698.0    1.0    7.0   1.0  1.0
1.0    7.0  470.0    1.0   0.0  0.0
2.0    4.0    1.0  111.0   2.0  0.0
3.0    0.0    0.0    3.0  43.0  0.0
4.0    0.0    0.0    0.0   0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9792592592592593
Precision total:  0.7649733153085713
Recall total:  0.7689199758676792
F1 total:  0.7668872501126401
BACC total:  0.9611499698345991
MCC total:  0.964973376002392


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
y_pred contains classes not in y_true


### Bagging RF

In [346]:
start = time.time()
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)

base_classifier = rf

# Define the BaggingClassifier
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test set
y_pred = bagging_classifier.predict(X_test_01)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

name = 'bag_rf'

pred_label = y_pred

metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken



---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  695.0    7.0   5.0   1.0
1.0   17.0  454.0   7.0   0.0
2.0   14.0    3.0  99.0   2.0
3.0    0.0    0.0   8.0  38.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9525925925925925
Precision total:  0.9236276481865223
Recall total:  0.8991248051068431
F1 total:  0.910557358759531
BACC total:  0.8991248051068431
MCC total:  0.9195604848401054


### Bagging with many models

##### do bootstrapping 

##### 1. Multiple subsets are created from the original dataset, selecting observations with replacement.


In [347]:
start = time.time()

num_bootstraps = 10  # Adjust the number of bootstraps as needed

original_data_df = X_train_01.assign(label = y_train_01)

In [348]:
boot_df = []
for i in range(0,num_bootstraps): 
    boot_df.append(original_data_df.sample(frac = 1, replace=True).reset_index(drop=True))


In [349]:
boot_df[5]

Unnamed: 0,rf,ada,xgb,dnn,cat,label
0,0.959790,0.315597,0.994228,0.935731,0.995068,0.0
1,0.927451,0.307769,0.996493,0.933363,0.988214,0.0
2,0.543332,0.411966,0.989324,0.471306,0.970823,2.0
3,0.987811,0.303750,0.997760,0.567234,0.995899,1.0
4,0.634661,0.293188,0.884207,0.901184,0.873931,2.0
...,...,...,...,...,...,...
3145,0.960068,0.319886,0.993254,0.933188,0.994152,0.0
3146,0.960068,0.307373,0.994228,0.937913,0.995631,0.0
3147,0.839586,0.298181,0.943468,0.371196,0.921495,0.0
3148,0.962837,0.319886,0.993577,0.894050,0.994019,0.0


#### 2.A base model (weak model) is created on each of these subsets.

In [350]:
bag_comb_pred = []


In [351]:
# SVM
clf = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)
y_train_boot = boot_df[0].pop('label')
X_train_boot = boot_df[0]
clf.fit(X_train_boot, y_train_boot)
preds_svm_01 = clf.predict(X_test_01)
bag_comb_pred.append(preds_svm_01)





In [352]:
#ADA
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
ada = abc.fit(X_train_01, y_train_01)
y_train_boot = boot_df[1].pop('label')
X_train_boot = boot_df[1]
preds_ada_01 = ada.predict(X_test_01)
bag_comb_pred.append(preds_ada_01)


In [353]:
#Catboost

cat_01 = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')
y_train_boot = boot_df[2].pop('label')
X_train_boot = boot_df[2]
cat_01.fit(X_train_boot, y_train_boot, eval_set=(X_test_01, y_test_01), verbose=10)
preds_cat = cat_01.predict(X_test_01)
preds_cat = np.squeeze(preds_cat)
pred_label = preds_cat
bag_comb_pred.append(preds_cat)


0:	learn: 1.3353943	test: 1.3405362	best: 1.3405362 (0)	total: 6.55ms	remaining: 648ms
10:	learn: 0.4784180	test: 0.4940028	best: 0.4940028 (10)	total: 63.1ms	remaining: 510ms
20:	learn: 0.2615392	test: 0.2800742	best: 0.2800742 (20)	total: 122ms	remaining: 460ms
30:	learn: 0.1744494	test: 0.1978095	best: 0.1978095 (30)	total: 180ms	remaining: 401ms
40:	learn: 0.1355911	test: 0.1604291	best: 0.1604291 (40)	total: 236ms	remaining: 339ms
50:	learn: 0.1118923	test: 0.1415299	best: 0.1415299 (50)	total: 285ms	remaining: 274ms
60:	learn: 0.0967557	test: 0.1291668	best: 0.1291668 (60)	total: 339ms	remaining: 217ms
70:	learn: 0.0847308	test: 0.1196154	best: 0.1196154 (70)	total: 392ms	remaining: 160ms
80:	learn: 0.0757855	test: 0.1120742	best: 0.1120742 (80)	total: 441ms	remaining: 103ms
90:	learn: 0.0696515	test: 0.1070302	best: 0.1070302 (90)	total: 490ms	remaining: 48.4ms
99:	learn: 0.0640921	test: 0.1031307	best: 0.1031307 (99)	total: 535ms	remaining: 0us

bestTest = 0.1031306785
bestIter

In [354]:
#MLP
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)
y_train_boot = boot_df[3].pop('label')
X_train_boot = boot_df[3]
if 1 == 1 and 0 == 0:
    MLP = mlp.fit(X_train_boot, y_train_boot)
    y_pred = MLP.predict_proba(X_test_01)
    preds_mlp_01 = np.argmax(y_pred,axis = 1)

bag_comb_pred.append(preds_mlp_01)


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


In [355]:
#LGBM
lgbm = LGBMClassifier()
y_train_boot = boot_df[4].pop('label')
X_train_boot = boot_df[4]

if 1 == 1 and 0 == 0:
    lgbm.fit(X_train_boot, y_train_boot)
    preds_lgbm_01 = lgbm.predict(X_test_01)
    bag_comb_pred.append(preds_lgbm_01)

In [356]:
#KNN
knn_clf_01=KNeighborsClassifier(n_neighbors = 5)
y_train_boot = boot_df[5].pop('label')
X_train_boot = boot_df[5]

if 1 == 1 and 0 == 0:
    knn_clf_01.fit(X_train_boot,y_train_boot)
if use_model_knn == 1:
    preds_knn =knn_clf_01.predict(X_test_01)
    bag_comb_pred.append(preds_knn)

In [357]:
#Random Forest
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)
y_train_boot = boot_df[6].pop('label')
X_train_boot = boot_df[6]

if True == True:
    model_rf_01 = rf.fit(X_train_boot,y_train_boot)
    preds_rf_01 = model_rf_01.predict(X_test_01)
    bag_comb_pred.append(preds_rf_01)

In [358]:
#DNN
#Model Parameters
y_train_boot = boot_df[7].pop('label')
X_train_boot = boot_df[7]


dropout_rate = 0.02
nodes = 3
out_layer = 5
optimizer='adam'
loss='sparse_categorical_crossentropy'
epochs=100
batch_size=128
num_columns = X_train_boot.shape[1]
dnn_01 = tf.keras.Sequential()
# Input layer
dnn_01.add(tf.keras.Input(shape=(num_columns,)))
# Dense layers with dropout
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))
# Output layer
dnn_01.add(tf.keras.layers.Dense(out_layer, activation='softmax'))
dnn_01.compile(optimizer=optimizer, loss=loss,metrics=['accuracy'])
from keras.callbacks import EarlyStopping
# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
dnn_01.fit(X_train_boot, y_train_boot, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[early_stopping])
pred_dnn = dnn_01.predict(X_test_01)
preds_dnn_01 = np.argmax(pred_dnn,axis = 1)
bag_comb_pred.append(preds_dnn_01)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


In [359]:
#LogReg
logreg_01 = LogisticRegression()
y_train_boot = boot_df[8].pop('label')
X_train_boot = boot_df[8]

logreg_01.fit(X_train_boot,y_train_boot)
preds_logreg =logreg_01.predict(X_test_01)
bag_comb_pred.append(preds_logreg)

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [360]:
y_train_boot = boot_df[9].pop('label')
X_train_boot = boot_df[9]

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_boot, label=y_train_boot)
dtest = xgb.DMatrix(X_test_01, label=y_test_01)
# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': 5,  # specify the number of classes
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # metric for multi-class classification
}
# Train the XGBoost model
num_round = 100
xgb_01 = xgb.train(params, dtrain, num_round)
preds_xgb_01 = xgb_01.predict(dtest)
bag_comb_pred.append(preds_xgb_01)

### 3. The models run in parallel and are independent of each other.

In [361]:
bag_vot_df = pd.DataFrame()
for i in range(0,len(bag_comb_pred)):
    bag_vot_df[f'model_{i}'] =  bag_comb_pred[i]
print(bag_vot_df)

      model_0  model_1  model_2  model_3  model_4  model_5  model_6  model_7  \
0         0.0      0.0      0.0        0      0.0      0.0      0.0        0   
1         0.0      0.0      0.0        0      0.0      0.0      0.0        0   
2         1.0      1.0      1.0        1      1.0      1.0      1.0        0   
3         2.0      2.0      2.0        2      2.0      2.0      2.0        1   
4         0.0      0.0      0.0        0      0.0      0.0      0.0        0   
...       ...      ...      ...      ...      ...      ...      ...      ...   
1345      1.0      1.0      0.0        1      0.0      0.0      0.0        0   
1346      1.0      3.0      3.0        3      3.0      3.0      3.0        1   
1347      1.0      1.0      1.0        1      1.0      1.0      1.0        0   
1348      0.0      0.0      0.0        0      0.0      0.0      0.0        0   
1349      0.0      0.0      0.0        0      0.0      0.0      0.0        0   

      model_8  model_9  
0         0.0 

In [362]:
# Voting start

predictions = bag_vot_df 

# Use the mode function along axis 1 to get the most common prediction for each row
ensemble_predictions, _ = mode(predictions.values, axis=1)

# Add the ensemble predictions to the DataFrame
bag_vot_df['ensemble'] = ensemble_predictions.astype(int)

# Display the DataFrame with ensemble predictions
print(bag_vot_df)

pred_label = bag_vot_df ['ensemble'].values
bag_vot_df.pop('ensemble')



      model_0  model_1  model_2  model_3  model_4  model_5  model_6  model_7  \
0         0.0      0.0      0.0        0      0.0      0.0      0.0        0   
1         0.0      0.0      0.0        0      0.0      0.0      0.0        0   
2         1.0      1.0      1.0        1      1.0      1.0      1.0        0   
3         2.0      2.0      2.0        2      2.0      2.0      2.0        1   
4         0.0      0.0      0.0        0      0.0      0.0      0.0        0   
...       ...      ...      ...      ...      ...      ...      ...      ...   
1345      1.0      1.0      0.0        1      0.0      0.0      0.0        0   
1346      1.0      3.0      3.0        3      3.0      3.0      3.0        1   
1347      1.0      1.0      1.0        1      1.0      1.0      1.0        0   
1348      0.0      0.0      0.0        0      0.0      0.0      0.0        0   
1349      0.0      0.0      0.0        0      0.0      0.0      0.0        0   

      model_8  model_9  ensemble  
0   

0       0
1       0
2       1
3       2
4       0
       ..
1345    0
1346    3
1347    1
1348    0
1349    0
Name: ensemble, Length: 1350, dtype: int64

In [363]:
name='bag_comb'
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0      1     2     3    4
0  698.0    3.0   5.0   1.0  1.0
1    7.0  468.0   3.0   0.0  0.0
2   18.0   24.0  74.0   2.0  0.0
3    0.0    2.0   3.0  41.0  0.0
4    0.0    0.0   0.0   0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9488888888888889
Precision total:  0.7418956339794672
Recall total:  0.6966756392033046
F1 total:  0.7151433459670284
BACC total:  0.8708445490041307
MCC total:  0.913039250261179


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
y_pred contains classes not in y_true


### DNN

In [364]:
print('---------------------------------------------------------------------------------')
print('Defining DNN Model')
print('---------------------------------------------------------------------------------')
start_dnn = time.time()

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#Model Parameters
dropout_rate = 0.2
nodes = 3
out_layer = 5
optimizer='adam'
loss='sparse_categorical_crossentropy'
epochs=100
batch_size=128


num_columns = X_train_01.shape[1]

dnn_01 = tf.keras.Sequential()

# Input layer
dnn_01.add(tf.keras.Input(shape=(num_columns,)))

# Dense layers with dropout
dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

dnn_01.add(tf.keras.layers.Dense(nodes))
dnn_01.add(tf.keras.layers.Dropout(dropout_rate))

# Output layer
dnn_01.add(tf.keras.layers.Dense(out_layer, activation='softmax'))

dnn_01.compile(optimizer=optimizer, loss=loss,metrics=['accuracy'])

dnn_01.summary()



---------------------------------------------------------------------------------
Defining DNN Model
---------------------------------------------------------------------------------
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             (None, 3)                 18        
_________________________________________________________________
dropout_55 (Dropout)         (None, 3)                 0         
_________________________________________________________________
dense_67 (Dense)             (None, 3)                 12        
_________________________________________________________________
dropout_56 (Dropout)         (None, 3)                 0         
_________________________________________________________________
dense_68 (Dense)             (None, 3)                 12        
_________________________________________________________________
dr

In [365]:
#DNN
try:
    from keras.callbacks import EarlyStopping

    # Define EarlyStopping callback
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

    print('---------------------------------------------------------------------------------')
    print('Training DNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training DNN', file = f)
    print('---------------------------------------------------------------------------------')
    # Convert Y_test back to its original format
    # y_test = np.argmax(Y_test, axis=1)

    # Start the timer
    start = time.time()
    dnn_01.fit(X_train_01, y_train_01, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=[early_stopping])

    # End the timer
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)

except: 
    None

---------------------------------------------------------------------------------
Training DNN
---------------------------------------------------------------------------------
Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100


In [366]:
#DNN
try:
    start = time.time()
    pred_dnn = dnn_01.predict(X_test_01)
    preds_dnn_01 = np.argmax(pred_dnn,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
except:
        with open(output_file_name, "a") as f: print('error', file = f)
        preds_dnn_01 = 0


In [367]:
try:
    name = 'dnn'
    pred_label = preds_dnn_01
        
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    end = time.time()
    time_taken = end - start_dnn
    globals()[f"{name}_time_01"] = time_taken

except: None    

---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0   1.0   2.0  3.0
0.0  705.0   3.0   0.0  0.0
1.0  454.0  15.0   9.0  0.0
2.0   94.0   2.0  22.0  0.0
3.0    3.0   5.0  38.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.5614814814814815
Precision total:  0.428007592541309
Recall total:  0.4633076053810553
F1 total:  0.35960857981483274
BACC total:  0.4633076053810553
MCC total:  0.23415900890952782


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


### SVM

In [368]:
#SVM
print('---------------------------------------------------------------------------------')
print('Defining SVM Model')
print('---------------------------------------------------------------------------------')
start_svm = time.time()

from sklearn.linear_model import SGDClassifier

# Instantiate the SGDClassifier with additional hyperparameters
clf = SGDClassifier(
    loss='hinge',           # hinge loss for linear SVM
    penalty='l2',           # L2 regularization to prevent overfitting
    alpha=1e-4,             # Learning rate (small value for fine-grained updates)
    max_iter=1000,          # Number of passes over the training data
    random_state=42,        # Seed for reproducible results
    learning_rate='optimal' # Automatically adjusts the learning rate based on the training data
)

#SVM
start = time.time()
clf.fit(X_train_01, y_train_01)
end = time.time()
clf.score(X_train_01, y_train_01)
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
joblib.dump(clf, 'svm_level_01.joblib')


clf = loaded_model = joblib.load('svm_level_01.joblib')


#SVM
start = time.time()
preds_svm_01 = clf.predict(X_test_01)
end = time.time()
time_taken = end - start
with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
print('---------------------------------------------------------------------------------')



---------------------------------------------------------------------------------
Defining SVM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------


In [369]:

pred_label = preds_svm_01
name = 'svm'
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start_svm
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  608.0   90.0  10.0   0.0
1.0    7.0  454.0  16.0   1.0
2.0   19.0   52.0  44.0   3.0
3.0    2.0   14.0   7.0  23.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8362962962962963
Precision total:  0.7808793902824216
Recall total:  0.6703573032645439
F1 total:  0.7051849414686844
BACC total:  0.6703573032645439
MCC total:  0.7297640488769191


### Random Forest

In [370]:

print('---------------------------------------------------------------------------------')
print('Defining RF Model')
print('---------------------------------------------------------------------------------')
start_rf = time.time()

#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
rf = RandomForestClassifier(max_depth = 5,  n_estimators = 10, min_samples_split = 2, n_jobs = -1)
#------------------------------------------------------------------------------

if True == True:

    print('---------------------------------------------------------------------------------')
    print('Training RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Training RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    model_rf_01 = rf.fit(X_train_01,y_train_01)
    end = time.time()

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(model_rf_01, 'rf_base_model_01.joblib')

if 1 == 1:
    model_rf_01  = joblib.load('rf_base_model_01.joblib')

if 1 == 1:

    print('---------------------------------------------------------------------------------')
    print('Prediction RF')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction RF', file = f)
    print('---------------------------------------------------------------------------------')
    #RF
    start = time.time()
    preds_rf_01 = model_rf_01.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

    with open(output_file_name, "a") as f: print('-------------------------------------------------------', file = f)
pred_label = preds_rf_01
name='rf'
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start_rf
globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
Defining RF Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training RF
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Prediction RF
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  696.0    8.0   3.0   1.0
1.0   22.0  456.0   0.0   0.0
2.0   20.0   19.0  78.0   1.0
3.0    0.0    1.0   8.0  37.0
---------------------------------------------------

### LGBM

In [371]:
print('---------------------------------------------------------------------------------')
print('Defining LGBM Model')
print('---------------------------------------------------------------------------------')
#LGBM
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()

start_lgbm = time.time()


if 1 == 1 and 0 == 0:


    print('---------------------------------------------------------------------------------')
    print('Training LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    lgbm.fit(X_train_01, y_train_01)
    end = time.time()

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(lgbm, 'lgbm_01.joblib')

if 1 == 1:
    lgbm = joblib.load('lgbm_01.joblib')


if 1 == 1:

    print('Prediction LGBM')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Prediction LGBM', file = f)
    print('---------------------------------------------------------------------------------')
    #LGBM
    start = time.time()
    preds_lgbm_01 = lgbm.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

    pred_label = preds_lgbm_01
    name='lgbm'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    end = time.time()
    time_taken = end - start_lgbm
    globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
Defining LGBM Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training LGBM
---------------------------------------------------------------------------------


Prediction LGBM
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0  4.0
0.0  670.0   16.0  20.0   2.0  0.0
1.0   15.0  458.0   4.0   1.0  0.0
2.0    7.0   38.0  67.0   3.0  3.0
3.0    0.0    2.0   7.0  36.0  1.0
4.0    0.0    0.0   0.0   0.0  0.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9118518518518518
Precision total:  0.6800150005347498
Recall total:  0.6509783970506768
F1 total:  0.6638164284938479
BACC total:  0.8137229963133461
MCC total:  0.8511255608397971


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
y_pred contains classes not in y_true


### MLP

In [372]:

#MLP
print('---------------------------------------------------------------------------------')
print('Defining MLP Model')
print('---------------------------------------------------------------------------------')
start_mlp = time.time()


from sklearn.neural_network import MLPClassifier
import time

# create MLPClassifier instance
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=1)

if 1 == 1 and 0 == 0:


    print('---------------------------------------------------------------------------------')
    print('Training MLP')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Training MLP', file = f)
    print('---------------------------------------------------------------------------------')

    start = time.time()
    MLP = mlp.fit(X_train_01, y_train_01)
    end = time.time()

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(MLP, 'mlp_01.joblib')

if 1 == 1:
    MLP = joblib.load('mlp_01.joblib')


if 1 == 1:

    #MLP
    start = time.time()
    y_pred = MLP.predict_proba(X_test_01)
    preds_mlp_01 = np.argmax(y_pred,axis = 1)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#MLP
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('MLP 01 model', file = f)
    pred_label = preds_mlp_01
    name='mlp'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    end = time.time()
    time_taken = end - start_mlp
    globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
Defining MLP Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training MLP
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0      1     2     3
0  638.0   46.0  24.0   0.0
1    7.0  448.0  13.0  10.0
2   18.0   47.0  48.0   5.0
3    0.0    0.0   1.0  45.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8733333333333333
Precision total:  0.774632065

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


### ADA

In [373]:
print('---------------------------------------------------------------------------------')
print('Defining ADA Model')
print('---------------------------------------------------------------------------------')
#ADA
start_ada = time.time()


from sklearn.ensemble import AdaBoostClassifier
import time
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)

if 1 == 1 and 0 == 0:

    print('---------------------------------------------------------------------------------')
    print('Training ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA

    start = time.time()
    ada = abc.fit(X_train_01, y_train_01)
    end = time.time()

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)

    # Assuming 'model' is your trained model
    joblib.dump(ada, 'ada_01.joblib')

if 1 == 1:
    ada = joblib.load('ada_01.joblib')

if 1 == 1:

    print('Prediction ADA')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
    with open(output_file_name, "a") as f: print('Prediction ADA', file = f)
    print('---------------------------------------------------------------------------------')
    #ADA
    start = time.time()
    preds_ada_01 = ada.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    print('---------------------------------------------------------------------------------')

if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('ADA 01 model', file = f)


    pred_label = preds_ada_01
    name='ada'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    end = time.time()
    time_taken = end - start_ada
    globals()[f"{name}_time_01"] = time_taken








---------------------------------------------------------------------------------
Defining ADA Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training ADA
---------------------------------------------------------------------------------
Prediction ADA
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  614.0   87.0   5.0   2.0
1.0    1.0  465.0  12.0   0.0
2.0   23.0   70.0  22.0   3.0
3.0    0.0    0.0   4.0  42.0
---------------------------------------------------------------------------------
METRICS
----------------------------------------

### KNN

In [374]:
#KNN
print('---------------------------------------------------------------------------------')
print('Defining KNN Model')
print('---------------------------------------------------------------------------------')
start_knn = time.time()

knn_clf_01=KNeighborsClassifier(n_neighbors = 5)

if 1 == 1 and 0 == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training KNN')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training KNN', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    knn_clf_01.fit(X_train_01,y_train_01)
    end = time.time()

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(knn_clf_01, 'knn_01.joblib')


if load_model_knn == 1:
    knn_clf_01 = joblib.load('knn_01.joblib')

if use_model_knn == 1:

    #KNN
    start = time.time()
    preds_knn =knn_clf_01.predict(X_test_01)
    preds_knn
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('KNN 01 model', file = f)

    pred_label = preds_knn
    name='knn'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC

    end = time.time()
    time_taken = end - start_knn
    globals()[f"{name}_time_01"] = time_taken



---------------------------------------------------------------------------------
Defining KNN Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training KNN
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  688.0    5.0  15.0   0.0
1.0   12.0  454.0  11.0   1.0
2.0   13.0    6.0  91.0   8.0
3.0    0.0    0.0   5.0  41.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9437037037037037
Precision total:  0.8767956529403199
Recall total:  0.8960082489781278
F1 total:  0.8859291429757993


### Log Regression

In [375]:
from sklearn.linear_model import LogisticRegression

#Logistic Regression
print('---------------------------------------------------------------------------------')
print('Defining Logistic Regression Model')
print('---------------------------------------------------------------------------------')
logreg_01 = LogisticRegression()
start_lr = time.time()

if 1 == 1 and 0 == 0:

    #KNN
    print('---------------------------------------------------------------------------------')
    print('Training LR ')
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('Training LR', file = f)
    print('---------------------------------------------------------------------------------')
    start = time.time()
    logreg_01.fit(X_train_01,y_train_01)
    end = time.time()

    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed training time ', time_taken, file = f)
    joblib.dump(logreg_01, 'logreg_01.joblib')


if 1 == 1:
    logreg_01 = joblib.load('logreg_01.joblib')

if 1 == 1:

    #lR
    start = time.time()
    preds_logreg =logreg_01.predict(X_test_01)
    end = time.time()
    time_taken = end - start
    with open(output_file_name, "a") as f: print('Elapsed prediction time ', time_taken, file = f)
    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

#LR
if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('LR 01 model', file = f)

    pred_label = preds_logreg
    # pred_label = label[ypred]
    name='lr'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    end = time.time()
    time_taken = end - start_lr
    globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
Defining Logistic Regression Model
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Training LR 
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0   2.0   3.0
0.0  625.0   75.0   6.0   2.0
1.0   12.0  447.0  15.0   4.0
2.0   24.0   49.0  40.0   5.0
3.0    0.0    5.0   0.0  41.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8540740740740741
Precision total:  0.791444493774797
Recall total:  0.7620505509425275
F1 total:  0.76

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Catboost

In [376]:
start = time.time()

cat_01 = catboost.CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass', custom_metric='Accuracy')

# Fit the model
cat_01.fit(X_train_01, y_train_01, eval_set=(X_test_01, y_test_01), verbose=10)

# Make predictions on the test set
preds_cat = cat_01.predict(X_test_01)
preds_cat = np.squeeze(preds_cat)

with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

with open(output_file_name, "a") as f: print('catboost', file = f)


pred_label = preds_cat
name='cat'
metrics = confusion_metrics(name, pred_label, y_test_01)

Acc = metrics[0]
Precision = metrics[1]
Recall = metrics[2]
F1 = metrics[3]
BACC = metrics[4]
MCC = metrics[5]    


globals()[f"{name}_acc_01"] = Acc
globals()[f"{name}_pre_01"] = Precision
globals()[f"{name}_rec_01"] = Recall
globals()[f"{name}_f1_01"] = F1
globals()[f"{name}_bacc_01"] = BACC
globals()[f"{name}_mcc_01"] = MCC
end = time.time()
time_taken = end - start
globals()[f"{name}_time_01"] = time_taken


0:	learn: 1.3377869	test: 1.3409543	best: 1.3409543 (0)	total: 6.83ms	remaining: 676ms


10:	learn: 0.4872332	test: 0.4953866	best: 0.4953866 (10)	total: 63.1ms	remaining: 511ms
20:	learn: 0.2735539	test: 0.2851215	best: 0.2851215 (20)	total: 126ms	remaining: 475ms
30:	learn: 0.1858267	test: 0.2003851	best: 0.2003851 (30)	total: 193ms	remaining: 429ms
40:	learn: 0.1451702	test: 0.1600026	best: 0.1600026 (40)	total: 259ms	remaining: 373ms
50:	learn: 0.1219310	test: 0.1366431	best: 0.1366431 (50)	total: 319ms	remaining: 307ms
60:	learn: 0.1072353	test: 0.1231615	best: 0.1231615 (60)	total: 373ms	remaining: 239ms
70:	learn: 0.0976357	test: 0.1141141	best: 0.1141141 (70)	total: 426ms	remaining: 174ms
80:	learn: 0.0893530	test: 0.1061239	best: 0.1061239 (80)	total: 490ms	remaining: 115ms
90:	learn: 0.0826111	test: 0.1007402	best: 0.1007402 (90)	total: 554ms	remaining: 54.8ms
99:	learn: 0.0779486	test: 0.0967726	best: 0.0967726 (99)	total: 611ms	remaining: 0us

bestTest = 0.09677261405
bestIteration = 99

--------------------------------------------------------------------------

### XGB

In [377]:

start = time.time()

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_01, label=y_train_01)
dtest = xgb.DMatrix(X_test_01, label=y_test_01)

# Set XGBoost parameters
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': 5,  # specify the number of classes
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # metric for multi-class classification
}

# Train the XGBoost model
num_round = 100
xgb_01 = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
preds_xgb_01 = xgb_01.predict(dtest)


if 1 == 1:

    with open(output_file_name, "a") as f: print('---------------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print('xgboost base model', file = f)

    pred_label = preds_xgb_01
    name='xgb'
    metrics = confusion_metrics(name, pred_label, y_test_01)

    Acc = metrics[0]
    Precision = metrics[1]
    Recall = metrics[2]
    F1 = metrics[3]
    BACC = metrics[4]
    MCC = metrics[5]    


    globals()[f"{name}_acc_01"] = Acc
    globals()[f"{name}_pre_01"] = Precision
    globals()[f"{name}_rec_01"] = Recall
    globals()[f"{name}_f1_01"] = F1
    globals()[f"{name}_bacc_01"] = BACC
    globals()[f"{name}_mcc_01"] = MCC
    end = time.time()
    time_taken = end - start
    globals()[f"{name}_time_01"] = time_taken


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
       0.0    1.0    2.0   3.0
0.0  698.0    2.0    7.0   1.0
1.0    6.0  468.0    4.0   0.0
2.0    5.0    6.0  106.0   1.0
3.0    0.0    0.0    5.0  41.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.9725925925925926
Precision total:  0.9475048247073419
Recall total:  0.9386411591736221
F1 total:  0.9427484202671987
BACC total:  0.9386411591736221
MCC total:  0.953618349169585


 ### Generating Summary Metric Table

In [378]:

names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING',
                'Bag_svm',
                'Bag_knn',
                'Bag_DT',
                'Bag_LR',
                'Bag_mlp',

                'Bag_rf',
                'Bag_ada',
                'Bag_lgbm',
                # 'Bag_xgb',
                'Bag_cat',
                'Bag_comb',
                'avg',
                'weighed_avg'
                ]

data = [["" for _ in range(5)] for _ in range(len(names_models))]

level_01_acc = [
                ada_acc_01,
                svm_acc_01,
                dnn_acc_01,
                mlp_acc_01,
                knn_acc_01,
                cat_acc_01,
                xgb_acc_01,
                lgbm_acc_01,
                rf_acc_01,
                lr_acc_01,
                voting_acc_01,
                bag_svm_acc_01,
                bag_knn_acc_01,
                bag_dt_acc_01,
                bag_lr_acc_01,
                bag_mlp_acc_01,

                bag_rf_acc_01,
                bag_ada_acc_01,
                bag_lgbm_acc_01,
                # bag_xgb_acc_01,
                bag_cat_acc_01,
                bag_comb_acc_01,

                avg_acc_01,
                weighed_avg_acc_01
                ]  


level_01_pre = [
                ada_pre_01,
                svm_pre_01,
                dnn_pre_01,
                mlp_pre_01,
                knn_pre_01,
                cat_pre_01,
                xgb_pre_01,
                lgbm_pre_01,
                rf_pre_01,
                lr_pre_01,
                voting_pre_01,
                bag_svm_pre_01,
                bag_knn_pre_01,
                bag_dt_pre_01,
                bag_lr_pre_01,
                bag_mlp_pre_01,

                bag_rf_pre_01,
                bag_ada_pre_01,
                bag_lgbm_pre_01,
                # bag_xgb_pre_01,
                bag_cat_pre_01,
                bag_comb_pre_01,

                avg_pre_01,
                weighed_avg_pre_01
                ]  

level_01_rec = [
                ada_rec_01,
                svm_rec_01,
                dnn_rec_01,
                mlp_rec_01,
                knn_rec_01,
                cat_rec_01,
                xgb_rec_01,
                lgbm_rec_01,
                rf_rec_01,
                lr_rec_01,
                voting_rec_01,
                bag_svm_rec_01,
                bag_knn_rec_01,
                bag_dt_rec_01,
                bag_lr_rec_01,
                bag_mlp_rec_01,

                bag_rf_rec_01,
                bag_ada_rec_01,
                bag_lgbm_rec_01,
                # bag_xgb_rec_01,
                bag_cat_rec_01,
                bag_comb_rec_01,

                avg_rec_01,
                weighed_avg_rec_01
                ]  

level_01_f1 = [
                ada_f1_01,
                svm_f1_01,
                dnn_f1_01,
                mlp_f1_01,
                knn_f1_01,
                cat_f1_01,
                xgb_f1_01,
                lgbm_f1_01,
                rf_f1_01,
                lr_f1_01,
                voting_f1_01,
                bag_svm_f1_01,
                bag_knn_f1_01,
                bag_dt_f1_01,
                bag_lr_f1_01,
                bag_mlp_f1_01,

                bag_rf_f1_01,
                bag_ada_f1_01,
                bag_lgbm_f1_01,
                # bag_xgb_f1_01,
                bag_cat_f1_01,
                bag_comb_f1_01,

                avg_f1_01,
                weighed_avg_f1_01
                ]  


# Combine data into a list of tuples for sorting
model_data = list(zip(names_models, level_01_acc, level_01_pre, level_01_rec, level_01_f1))

# Sort by F1-01 score in descending order
model_data_sorted = sorted(model_data, key=lambda x: x[4], reverse=True)

# Separate the sorted data back into individual lists
sorted_names_models, sorted_level_01_acc, sorted_level_01_pre, sorted_level_01_rec, sorted_level_01_f1 = zip(*model_data_sorted)

# Assign the sorted data to the table
for i in range(len(sorted_names_models)):
    data[i][0] = sorted_names_models[i]
    data[i][1] = sorted_level_01_acc[i]
    data[i][2] = sorted_level_01_pre[i] 
    data[i][3] = sorted_level_01_rec[i] 
    data[i][4] = sorted_level_01_f1[i]

# Define column headers
headers = ["Models", "ACC-01", "PRE-01", "REC-01", "F1-01"]

# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
with open(output_file_name, "a") as f: print('Summary table', file = f)
if pick_prob == 1: 
    with open(output_file_name, "a") as f: print('Level 01 - Probabilities', file = f)
else:
    with open(output_file_name, "a") as f: print('Level 01 - CLASSES', file = f)
if feature_selection_bit == 1: 
    with open(output_file_name, "a") as f: print('Feature Selection was applied', file = f)
else:
    with open(output_file_name, "a") as f: print('All features were used', file = f)


    
print(table)
with open(output_file_name, "a") as f: print(table, file = f)

+-------------+----------+----------+----------+----------+
| Models      |   ACC-01 |   PRE-01 |   REC-01 |    F1-01 |
| CAT         | 0.974815 | 0.951338 | 0.940165 | 0.945653 |
+-------------+----------+----------+----------+----------+
| XGB         | 0.972593 | 0.947505 | 0.938641 | 0.942748 |
+-------------+----------+----------+----------+----------+
| Bag_cat     | 0.968889 | 0.937391 | 0.929474 | 0.933279 |
+-------------+----------+----------+----------+----------+
| Bag_rf      | 0.952593 | 0.923628 | 0.899125 | 0.910557 |
+-------------+----------+----------+----------+----------+
| KNN         | 0.943704 | 0.876796 | 0.896008 | 0.885929 |
+-------------+----------+----------+----------+----------+
| RF          | 0.938519 | 0.92759  | 0.850598 | 0.883723 |
+-------------+----------+----------+----------+----------+
| Bag_knn     | 0.942963 | 0.866968 | 0.901993 | 0.883049 |
+-------------+----------+----------+----------+----------+
| MLP         | 0.873333 | 0.774632 | 0.

In [379]:
# implement time table
names_models = ['ADA',
                'SVM',
                'DNN',
                'MLP',
                'KNN',
                'CAT',
                'XGB',
                'LGBM',
                'RF',
                'LR',
                'VOTING',
                'Bag_svm',
                'Bag_knn',
                'Bag_DT',
                'Bag_LR',
                'Bag_mlp',

                'Bag_rf',
                'Bag_ada',
                'Bag_lgbm',
                # 'Bag_xgb',
                'Bag_cat',
                'Bag_comb',
                'avg',
                'weighed_avg'
                ]

data = [["" for _ in range(2)] for _ in range(len(names_models))]

level_01_time = [
                ada_time_01,
                svm_time_01,
                dnn_time_01,
                mlp_time_01,
                knn_time_01,
                cat_time_01,
                xgb_time_01,
                lgbm_time_01,
                rf_time_01,
                lr_time_01,
                voting_time_01,
                bag_svm_time_01,
                bag_knn_time_01,
                bag_dt_time_01,
                bag_lr_time_01,
                bag_mlp_time_01,

                bag_rf_time_01,
                bag_ada_time_01,
                bag_lgbm_time_01,
                # bag_xgb_time_01,
                bag_cat_time_01,
                bag_comb_time_01,

                avg_time_01,
                weighed_avg_time_01
                ]  


# Combine data into a list of tuples for sorting
model_data = list(zip(names_models, level_01_time))

# Sort by F1-01 score in descending order
model_data_sorted = sorted(model_data, key=lambda x: x[1], reverse=False)

# Separate the sorted data back into individual lists
sorted_names_models, sorted_level_01_time = zip(*model_data_sorted)

# Assign the sorted data to the table
for i in range(len(sorted_names_models)):
    data[i][0] = sorted_names_models[i]
    data[i][1] = sorted_level_01_time[i]

# Define column headers
headers = ["Models", "time-01(sec)"]


# Print the table
table = tabulate(data, headers=headers, tablefmt="grid")
with open(output_file_name, "a") as f: print('Time is counted is seconds', file = f)
print(table)
with open(output_file_name, "a") as f: print(table, file = f)
end_program = time.time()
time_program = end_program - start_program
with open(output_file_name, "a") as f: print('Running time of entire program is:', time_program ,' seconds',file = f)

+-------------+----------------+
| Models      |   time-01(sec) |
| avg         |      0.0400491 |
+-------------+----------------+
| KNN         |      0.0571139 |
+-------------+----------------+
| weighed_avg |      0.0625758 |
+-------------+----------------+
| Bag_knn     |      0.0923586 |
+-------------+----------------+
| SVM         |      0.095645  |
+-------------+----------------+
| LR          |      0.121755  |
+-------------+----------------+
| Bag_DT      |      0.128664  |
+-------------+----------------+
| Bag_svm     |      0.240201  |
+-------------+----------------+
| ADA         |      0.244629  |
+-------------+----------------+
| RF          |      0.289706  |
+-------------+----------------+
| XGB         |      0.319149  |
+-------------+----------------+
| LGBM        |      0.339041  |
+-------------+----------------+
| CAT         |      0.759877  |
+-------------+----------------+
| Bag_LR      |      0.788077  |
+-------------+----------------+
| Bag_ada 

# ------------------------------------------------------------------