# Description of Experiment

This jupyter notebook uses the Edge-IIoTset2023 public dataset (available at https://doi.org/10.1109/ACCESS.2022.3165809) to explore the use of Ensemble Learning methods as a means to improve predictive accuracy for anomaly detection.

- Hardware environment: Intel i7-1017U processor (6 cores 12 threads), 64GB RAM, 1TB SSD storage
- Software environment: Windows 11 Professional, Anaconda Navigator, scikit-learn version 1.3.0
- This notebook takes ~3 hours to run


# Future work for subsequent projects
- perform additional feature selection, add a correlation heatmap, remove features with low correlation
- add some persistence with pickle, save the reduced dataset as a local CSV file to speed up iteration
- code cleanup: y_test_label should be y_test to make the variable naming more consistent with sample code
- consider using "Beautiful Soup" package to analyze HTTP text data
- use NLP for extracting text data from features like "http_uri_query" to analyze the contents of the text and extract meaningful information
- This project only used 2 classes (normal vs attack); consider using multi-class classification to determine the specific type of attack (brute force, DoS, etc.)

# Import libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Miscellaneous packages
import time                                           #for calculating elapsed time for training tasks
import os                                             #for checking if file exists
import socket                                         #for getting FQDN of local machine
import math                                           #square root function
import sys

# Packages from scikit-learn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV       #for hyperparameter optimization
from sklearn.model_selection import cross_val_score    #for cross fold validation
from sklearn.metrics         import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing   import StandardScaler
from sklearn.linear_model    import LogisticRegression
from sklearn.svm             import SVC    
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.ensemble        import RandomForestClassifier
from sklearn.neural_network  import MLPClassifier
from sklearn.naive_bayes     import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble        import BaggingClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier, GradientBoostingClassifier   #Packages for Ensemble Learning
from sklearn.linear_model    import LogisticRegression          #used by stacking models
from sklearn.tree            import DecisionTreeClassifier      #used by stacking models


from imblearn.under_sampling import RandomUnderSampler  #may need to install with: conda install -c conda-forge imbalanced-learn
from imblearn.over_sampling  import SMOTE               #may need to install with: conda install -c conda-forge imbalanced-learn
import xgboost as xgb                                   #eXtreme Gradient Booster, not part of sklearn, need to install with: pip install xgboost

# Define functions

In [None]:
# function to show missing values in dataset

def get_type_missing(df):
    """Summarise each column's dtype and number of missing values.

    Returns a DataFrame indexed by the column names of ``df`` with two columns,
    ``data_type`` and ``missing_values``, sorted so the columns with the most
    missing values come first.
    """
    summary = pd.DataFrame({
        'data_type': df.dtypes,
        'missing_values': df.isnull().sum(),
    })
    return summary.sort_values(by='missing_values', ascending=False)

In [None]:
# function to create a confusion matrix

def visualize_confusion_matrix(y_test_label, y_pred):
    """Plot an annotated binary confusion matrix, print its metrics, and return them.

    Parameters
    ----------
    y_test_label : array-like
        True class labels. The cell annotations assume a binary problem whose
        sorted label order is (negative, positive), e.g. 0 = normal, 1 = attack.
    y_pred : array-like
        Predicted class labels, same length as ``y_test_label``.

    Returns
    -------
    tuple
        ``(cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1)``.
        ``cm`` is the 2x2 confusion matrix laid out as ``[[TN, FP], [FN, TP]]``.
        Sensitivity / Specificity are NaN when the corresponding class has no
        true samples. Precision, Recall and F1 use ``average='weighted'``.
    """
    cm = confusion_matrix(y_test_label, y_pred)
    if cm.shape == (1, 1):
        # Only one class occurs in both inputs, so sklearn returns a 1x1 matrix and the
        # 2x2 reshape / 4-way unpack below would fail. Rebuild the matrix against the
        # notebook's 0 = normal / 1 = attack encoding so the layout stays [[TN, FP], [FN, TP]].
        cm = confusion_matrix(y_test_label, y_pred, labels=[0, 1])

    # visualize confusion matrix with more detailed labels
    # https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
    group_names = ['True Negative','False Positive','False Negative','True Positive']
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    plt.figure(figsize=(3.5, 2.0))  #default figsize is 6.4" wide x 4.8" tall, shrink to 3.5" wide 2.0" tall
    sns.heatmap(cm, annot=labels, fmt='', cmap='Blues', cbar=False)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.show()

    # use the .ravel function to pull out TN, FP, FN, TP (row-major order of the 2x2 matrix)
    # https://analytics4all.org/2020/05/07/python-confusion-matrix/
    TN, FP, FN, TP = cm.ravel()

    # calculate different metrics
    Accuracy = (TP + TN) / (TP + TN + FP + FN)
    # A class with no true samples has an undefined rate; report NaN explicitly
    # instead of relying on a 0/0 division.
    Sensitivity = TP / (TP + FN) if (TP + FN) else float('nan')
    Specificity = TN / (TN + FP) if (TN + FP) else float('nan')
    GeometricMean = math.sqrt(Sensitivity * Specificity)

    # Precision, recall and F1 are computed with average='weighted' so they match the
    # "weighted avg" row of classification_report. Use average='binary' instead for
    # positive-class-only values.
    Precision = precision_score(y_test_label, y_pred, average='weighted')
    Recall = recall_score(y_test_label, y_pred, average='weighted')
    F1 = f1_score(y_test_label, y_pred, average='weighted')

    # add details below graph to help interpret results
    print('\n\n')
    print('Confusion matrix\n\n', cm)
    print('\nTrue Negatives  (TN) = ', TN)
    print('False Positives (FP) = ', FP)
    print('False Negatives (FN) = ', FN)
    print('True Positives  (TP) = ', TP)
    print ('\n')
    print ("Accuracy:       ", Accuracy)
    print ("Sensitivity:    ", Sensitivity)
    print ("Specificity:    ", Specificity)
    print ("Geometric Mean: ", GeometricMean)
    print ('\n')
    print ("Precision:       ", Precision)
    print ("Recall:          ", Recall)
    print ("f1-score:        ", F1)

    print('\n------------------------------------------------\n')

    return cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1
    
    



In [None]:
# function to report on model accuracy (TP, FP, FN, TN), precision, recall, f1-score
# this function does not provide anything additional to the results from the previous function

def model_classification_report(cm, y_test_label, y_pred):
    """Print sklearn's classification report (4 decimal places) for the predictions.

    ``cm`` is accepted for call-site symmetry with the confusion-matrix helper
    but is not used here.
    """
    report_text = classification_report(y_test_label, y_pred, digits=4)
    # Each tuple is one print() call: blank spacer, titled report, trailing spacer.
    for print_args in (('\n',), ("Classification Report: \n", report_text), ('\n\n\n',)):
        print(*print_args)



In [None]:
# function to show elapsed time for running notebook

# start a timer so we can calculate the total runtime of this notebook
# Reference point (seconds since epoch) for the notebook's total runtime.
notebook_start_time = time.time()

def show_elapsed_time():
    """Print the current local timestamp and the minutes elapsed since the timer cell ran."""
    # Timestamp is rendered as yyyy-mm-dd HH:MM:SS in local time.
    timestamp_text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("Current Time:", timestamp_text)

    # Running total for the whole notebook, rounded to whole minutes.
    elapsed_minutes = (time.time() - notebook_start_time) / 60
    print(f"The entire notebook runtime so far is {elapsed_minutes:.0f} minutes")

show_elapsed_time()

# Initialize variables

In [None]:
# initialize variables to avoid undef errors

# Accuracy placeholders, one (unoptimized, optimized) pair per classifier.
# They start at 0 so later summary cells do not hit undefined names when a
# model section is skipped.
accuracy_lr_undersampled_unoptimized  = accuracy_lr_undersampled_optimized  = 0
accuracy_dt_undersampled_unoptimized  = accuracy_dt_undersampled_optimized  = 0
accuracy_ds_undersampled_unoptimized  = accuracy_ds_undersampled_optimized  = 0
accuracy_rf_undersampled_unoptimized  = accuracy_rf_undersampled_optimized  = 0
accuracy_nb_undersampled_unoptimized  = accuracy_nb_undersampled_optimized  = 0
accuracy_svm_undersampled_unoptimized = accuracy_svm_undersampled_optimized = 0
accuracy_knn_undersampled_unoptimized = accuracy_knn_undersampled_optimized = 0
accuracy_mlp_undersampled_unoptimized = accuracy_mlp_undersampled_optimized = 0
accuracy_gb_undersampled_unoptimized  = accuracy_gb_undersampled_optimized  = 0
accuracy_xgb_undersampled_unoptimized = accuracy_xgb_undersampled_optimized = 0

# Best hyperparameters per classifier, empty until a tuning section fills them in.
best_params_lr  = best_params_dt  = best_params_ds  = best_params_rf  = best_params_nb = ""
best_params_svm = best_params_knn = best_params_mlp = best_params_gb  = best_params_xgb = ""

# Accuracy placeholders for the ensemble methods.
accuracy_ensemble_voting   = accuracy_ensemble_stacking = 0
accuracy_ensemble_boosting = accuracy_ensemble_bagging  = 0

cv_count = 10  #number of cross-validation folds

# Load dataset

In [None]:
# define CSV source file

filename = 'DNN-EdgeIIoT-dataset.csv'
LAN_location = 'http://datasets.nyx.local:80/datasets/Edge-IIoTset2023/Selected_dataset_for_ML_and_DL'  #high speed local copy on LAN
WAN_location = 'http://datasets.nyx.ca:8081/datasets/Edge-IIoTset2023/Selected_dataset_for_ML_and_DL'   #accessible to entire internet

# Get the FQDN of the local machine; it decides which mirror to download from.
fqdn = socket.getfqdn()

# The IPv4 address is only printed for information. Resolving the local hostname can
# fail (socket.gaierror, an OSError subclass) on machines whose hostname is not in
# DNS or the hosts file, and that must not stop the dataset from loading.
try:
    ipv4_address = socket.gethostbyname(socket.gethostname())
except OSError:
    ipv4_address = "unknown"
print(f"Fully Qualified Domain Name (FQDN):{fqdn}, IPv4 address:{ipv4_address}")

# Inside the LAN use the local copy, otherwise use the internet-accessible URL.
base_location = LAN_location if "nyx.local" in fqdn else WAN_location
dataset = f"{base_location}/{filename}"
print(f"Detected Fully Qualified Domain Name of {fqdn}, dataset source is:\n{dataset}")

print(f"Loading dataset from {dataset}")
df = pd.read_csv(dataset)

In [None]:
# print(f"Dropping rows from the dataset during debugging to speed up this notebook - turn this off when finished debugging!")

# # cut dataset in half if > 2 million rows
# if ( len(df) > 2000000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")

# # cut dataset in half if > 1 million rows
# if ( len(df) > 1000000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")

# # cut dataset in half if > 0.5 million rows
# if ( len(df) > 500000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")

# # cut dataset in half if > 0.5 million rows
# if ( len(df) > 500000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")

# # cut dataset in half if > 250,000 rows
# if ( len(df) > 250000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")


# # cut dataset in half if > 100,000 rows
# if ( len(df) > 100000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")


# # cut dataset in half if > 50,000 rows
# if ( len(df) > 50000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")


# # cut dataset in half if > 25,000 rows
# if ( len(df) > 25000):
#     print(f"Original size of dataset is", len(df), " rows")
#     df.drop(df.index[::2], inplace=True)
#     print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")


In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

In [None]:
# show a running total of elapsed time for the entire notebook
show_elapsed_time() 

# Exploratory Data Analysis (EDA)

In [None]:
# take a quick look at the data
df.head()

In [None]:
# Display all the data rather than just a portion
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [None]:
# check for any missing values in dataset
df.isna().sum()

In [None]:
# check for any missing datatypes
get_type_missing(df)

In [None]:
df.describe()

In [None]:
# look at all the datatypes of that are objects, in case any can be converted to integers
df.describe(include='object')

In [None]:
# look at the values in all of the features

# Print the value distribution of every column, each under a ruled heading.
feature_names = list(df.columns)
heading_rule = "------------------"

for feature_name in feature_names:
    if feature_name not in df.columns:
        continue
    print('\n')
    print(heading_rule)
    print(f"{feature_name}")
    print(heading_rule)
    print(df[feature_name].value_counts())


In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

In [None]:
df.info()

# Data preprocessing

## Fix up feature names

In [None]:
# look at the column names
df.columns

In [None]:
print(df['frame.time'].value_counts().head())

print("\nNull Values:")
print(df['frame.time'].isna().sum())

In [None]:
# converting to datetime
def convert_to_datetime(value):
    """Parse ``value`` with ``pd.to_datetime``; return ``np.nan`` if it cannot be parsed.

    Only ordinary exceptions are treated as "unparseable". ``KeyboardInterrupt``
    and ``SystemExit`` propagate, so interrupting a long ``.apply`` over the
    column actually stops it.
    """
    try:
        return pd.to_datetime(value)
    except Exception:
        return np.nan
       
# skip the time-consuming conversion because we drop this feature later
#df['frame.time'] = df['frame.time'].apply(convert_to_datetime)

In [None]:
# Validating IP address

print(df['ip.src_host'].value_counts().head())
print('_________________________________________________________')
print(df['ip.dst_host'].value_counts().head())
print('_________________________________________________________')
print(df['arp.src.proto_ipv4'].value_counts().head())
print('_________________________________________________________')
print(df['arp.dst.proto_ipv4'].value_counts().head())

In [None]:
# just for fun explore these values in the http.file_data column
#df[df['Attack_label'] == 1]['http.file_data'].value_counts()


In [None]:
df['mqtt.topic'].value_counts()

In [None]:
df['mqtt.protoname'].value_counts()

In [None]:
df['dns.qry.name.len'].value_counts()

In [None]:
df['http.request.method'].value_counts()

- exploring the target dataset

In [None]:
# how many 0 (normal) and 1 (attack) values do we have?
df['Attack_label'].value_counts()

## Visualization

In [None]:
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Attack_label', hue='Attack_type', edgecolor='black', linewidth=1)
plt.title('Attack Label vs Attack Type', fontsize=20)
plt.show()

In [None]:
import plotly.express as px

fig = px.pie(df, names='Attack_label', title='Distribution of Attack Labels')
fig.show()


In [None]:
fig = px.pie(df, names='Attack_type', title='Distribution of Attack Type')
fig.show()


- class imbalance issue - this can cause the machine learning model to result in biased results

## Drop features 
Now using our domain knowledge we will only select useful features from our dataset and drop the rest

In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

In [None]:
# Identify columns that carry no information: entirely NaN, or holding the same
# constant (0, 1 or 2) in every row.
# 0 == 0.0, 1 == 1.0 and 2 == 2.0 compare equal element-wise, so a single comparison
# per constant is enough. Each comparison materialises a full boolean frame, so
# skipping the float duplicates halves the work on a large dataset.
empty_or_zero_columns = df.columns[
    df.isnull().all()
    | (df == 0).all()
    | (df == 1).all()
    | (df == 2).all()
]

# Displaying the identified columns
empty_features = empty_or_zero_columns.tolist()

print("These columns are all empty features:")
print(empty_features)

# Drop all of them in one pass instead of once per column.
df.drop(columns=empty_features, inplace=True)
for feature in empty_features:
    print("Dropping empty feature:", feature)

In [None]:
# show the columns to confirm the features have been dropped
df.head()

In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

In [None]:
# drop these features

# Features removed on domain-knowledge grounds; only those still present are dropped.
feature_names = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4",
    "http.file_data",
    "http.request.full_uri",
    "icmp.transmit_timestamp",
    "http.request.uri.query",
    "tcp.options",
    "tcp.payload",
    "tcp.srcport",
    "tcp.dstport",
    "udp.port",
    "mqtt.msg",
    "icmp.unused",
    "http.tls_port",
    'dns.qry.type',
    'dns.retransmit_request_in',
    "mqtt.msg_decoded_as",
    "mbtcp.trans_id",
    "mbtcp.unit_id",
    "http.request.method",
    "http.referer",
    "http.request.version",
    "dns.qry.name.len",
    "mqtt.conack.flags",
    "mqtt.protoname",
    "mqtt.topic",
]

# potential_drop_list = ['arp.opcode']

for feature_name in feature_names:
    if feature_name not in df.columns:
        continue  # already removed (e.g. by the empty-feature pass) or never present
    df.drop(columns=feature_name, inplace=True)
    print("Dropping feature:", feature_name)


In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset after dropping features:", df.shape)

In [None]:
# print(df[df['tcp.flags.ack'] == 1]['Attack_label'].value_counts(normalize=True))
# print(df[df['tcp.flags.ack'] == 0]['Attack_label'].value_counts(normalize=True))

df['Attack_label'].groupby(df['tcp.flags.ack']).value_counts(normalize=True)
# hence groupby is preferred over filtering on each value separately

In [None]:
df.info()

In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

## Label encoding

- Problem: if we use a machine learning model to predict the Attack label, it could predict it as 0.1, 0.2 or 0.99 which is not a valid Attack label
- Solution: Label Encoder

![one hot encoding](https://www.blog.trainindata.com/wp-content/uploads/2023/10/cover-2.gif "one hot encoding")

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Attack_label'] = le.fit_transform(df['Attack_label'])

df['Attack_label'].value_counts()

In [None]:
# The final column in the dataset is Attack_type, and will contain one of these values:

# Display unique values in the "Attack_type" column
unique_attack_types = df['Attack_type'].unique()
print("Unique Attack Types:")
print(unique_attack_types)

In [None]:
# separate X and y variables (independent and dependent variables)

X = df.drop(['Attack_label', 'Attack_type'], axis=1)
y_label = df['Attack_label']
y_type = df['Attack_type']

In [None]:
X

In [None]:
y_label

In [None]:
# show a running total of elapsed time for the entire notebook
show_elapsed_time() 

# train / test / split

In [None]:
y_type

In [None]:
X_train, X_test, y_train_label, y_test_label = train_test_split(X, y_label, test_size=0.2, random_state=42)

# Class balancing with random undersampling

In [None]:
# Initialize RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)

# Apply Random Under Sampling
X_train_resampled, y_train_label_resampled = rus.fit_resample(X_train, y_train_label)

In [None]:
# If you wanted to balance the classes with SMOTE instead, sample code shown below:

## Create an instance of the SMOTE class
#smote = SMOTE(sampling_strategy='auto')

## Apply SMOTE to the training data
#X_train_resampled, y_train_label_resampled = smote.fit_resample(X_train, y_train_label)


In [None]:
print("Class balance before resampling")
print(y_train_label.value_counts())
print('\n')
print("Class balance after resampling")
print(y_train_label_resampled.value_counts())

In [None]:
# Confirm the classes are balanced after random undersampling.
#
# Attack_label is 0 if the data is normal, or 1 if the data indicates an attack.
# The counts must come from y_train_label_resampled (the undersampled training labels).
# Counting rows of the original dataframe `df` reports the imbalance that existed
# before resampling and makes the classes look unbalanced.

# Figure out how many rows of each class exist in the resampled training labels
normal_class = y_train_label_resampled[y_train_label_resampled == 0]
print("Number of rows in   normal class:", (len(normal_class)) )

abnormal_class = y_train_label_resampled[y_train_label_resampled == 1]
print("Number of rows in abnormal class:", (len(abnormal_class)) )

total_rows = len(abnormal_class) + len(normal_class)
print(f"Total Number of rows (normal+abnormal): {total_rows}" )

# Guard against an empty label set so the check itself cannot raise ZeroDivisionError.
balance = round(len(abnormal_class) / total_rows * 100, 2) if total_rows else 0.0

print(f"Percentage of abnormal class in dataset (abnormal/total*100): {balance}%")
if (balance  < 10): print("This dataset is very imbalanced, please beware of overfitting.")
if (balance == 50): print("This dataset is perfectly balanced.")




# Feature scaling

In [None]:
is_data_scaled = "yes"   #yes|no flag to turn feature scaling on|off to see if it changes prediction accuracy


if (is_data_scaled == "yes"):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)  # Only transform the test set, don't fit
    # Save the values under original names so we can use consistent names in subsequent sections
    X_train_resampled = X_train_scaled
    X_test = X_test_scaled
else:
    print(f"WARNING: dataset is not being scaled, so the results may be skewed due to data distribution!")
    # The scaled path yields numpy arrays, and later cells index the training data
    # positionally (X_train_resampled[random_indices]). Convert here too so the
    # unscaled path hands the same array type to those cells.
    X_train_resampled = np.asarray(X_train_resampled)
    X_test = np.asarray(X_test)


# show a running total of elapsed time for the entire notebook
show_elapsed_time()

# Reduce dataset size to speed up analysis

In [None]:
print(f"The current size of the dataset is:")
print(f"   X_train_resampled       contains", len(X_train_resampled), "rows")
print(f"   y_train_label_resampled contains", len(y_train_label_resampled), "rows")
print(f"\nThe objective of this section is to see if we can speed up the training process by reducing the size of the dataset, but not losing too much accuracy.")

In [None]:
# Define a list of fractions to keep
fractions_to_keep = [0.01, 0.02, 0.05, 0.10, 0.25, 0.50, 0.75, 1.0]

#initialize variables
best_accuracy = 0
best_fraction_to_keep = 0
accuracy_by_fraction = {}   # fraction_to_keep -> test-set accuracy

# Seeded generator so the subsamples (and the accuracies derived from them) are
# reproducible across runs, consistent with random_state=42 used elsewhere.
subsample_rng = np.random.default_rng(42)

# Iterate through different fractions
for fraction_to_keep in fractions_to_keep:
    # Randomly subsample the training set
    num_samples_to_keep = int(len(X_train_resampled) * fraction_to_keep)
    random_indices = subsample_rng.choice(len(X_train_resampled), num_samples_to_keep, replace=False)

    X_train_subsampled = X_train_resampled[random_indices]
    y_train_subsampled = y_train_label_resampled.iloc[random_indices]   #.iloc selects the labels by position, matching the positional row indices

    # Train a quick baseline model on the subsampled data
    clf = LogisticRegression(max_iter=800, random_state=42)
    clf.fit(X_train_subsampled, y_train_subsampled)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate accuracy on the test set
    accuracy = accuracy_score(y_test_label, y_pred)
    print(f"Accuracy on the test set (fraction_to_keep={fraction_to_keep:.4f}): {accuracy:.4f}")

    # Save the accuracy levels for later comparison
    accuracy_by_fraction[fraction_to_keep] = accuracy

    # keep track of the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_fraction_to_keep = fraction_to_keep

# Individual names used by the plotting and selection cells that follow.
accuracy_001 = accuracy_by_fraction[0.01]
accuracy_002 = accuracy_by_fraction[0.02]
accuracy_005 = accuracy_by_fraction[0.05]
accuracy_010 = accuracy_by_fraction[0.10]
accuracy_025 = accuracy_by_fraction[0.25]
accuracy_050 = accuracy_by_fraction[0.50]
accuracy_075 = accuracy_by_fraction[0.75]
accuracy_100 = accuracy_by_fraction[1.0]

print(f"The highest accuracy is {best_accuracy:.4f} using the {best_fraction_to_keep} fraction of the dataset")

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

In [None]:
# Visualize the results from the previous cell

# Data extracted from the image
# Accuracies measured in the previous cell, one per fraction of the training data kept
data = {
    'fraction_to_keep': [0.010, 0.020, 0.050, 0.100, 0.250, 0.500, 0.750, 1.000],
    'accuracy': [accuracy_001, accuracy_002, accuracy_005, accuracy_010, accuracy_025, accuracy_050, accuracy_075, accuracy_100]
}

# Use a dedicated name for this small frame. Assigning it to `df` would overwrite
# the loaded dataset and break any cell that reads `df` afterwards.
accuracy_by_fraction_df = pd.DataFrame(data)

plt.figure(figsize=(10, 6))
plt.plot(accuracy_by_fraction_df['fraction_to_keep'], accuracy_by_fraction_df['accuracy'], marker='o')

# Adding titles and labels
plt.title('Accuracy on the Test Set by Fraction of Data Kept')
plt.xlabel('Fraction of Data Kept')
plt.ylabel('Accuracy')

# Adding text for each data point
for point_fraction, point_accuracy in zip(accuracy_by_fraction_df['fraction_to_keep'], accuracy_by_fraction_df['accuracy']):
    plt.text(point_fraction, point_accuracy, f"{point_fraction*100}%", ha='right')

# Adding grid for better readability
plt.grid(True)

# Save the figure with texts
fig_path_with_text = 'accuracy_vs_data_fraction_with_text.png'
plt.savefig(fig_path_with_text)

# Show the figure
plt.show()


In [None]:
# Programmatically pick best_fraction_to_keep by trading a small amount of accuracy for speed.
# The tolerance below is the largest drop from the best observed accuracy we accept (1%).

acceptable_loss_of_accuracy = 0.0100  # 0.01*100= 1%  Tweak this value depending on how much accuracy you are willing to sacrifice

# (percent shown in the message, measured accuracy, fraction value), largest first.
# Every acceptable candidate overwrites the choice, so the smallest acceptable
# fraction is the one that remains after the loop.
candidate_fractions = [
    (100, accuracy_100, 1.0),
    (75,  accuracy_075, 0.75),
    (50,  accuracy_050, 0.50),
    (25,  accuracy_025, 0.25),
    (10,  accuracy_010, 0.10),
    (5,   accuracy_005, 0.05),
    (2,   accuracy_002, 0.02),
    (1,   accuracy_001, 0.01),
]

for candidate_percent, candidate_accuracy, candidate_fraction in candidate_fractions:
    if (best_accuracy - acceptable_loss_of_accuracy) <= candidate_accuracy:
        print(f"Using {candidate_percent:>3}% of the dataset gives {candidate_accuracy*100:.2f}% accuracy, which is an acceptable trade-off between accuracy and speed.")
        best_fraction_to_keep = candidate_fraction

print(f"\nBased on the above calculations, we will keep {best_fraction_to_keep*100:.0f}% of the dataset, which will still provide acceptable accuracy.")


In [None]:
# Based on the accuracy calculations in the previous cell, decide how much of the dataset to keep
fraction_to_keep = best_fraction_to_keep

# Randomly subsample the training set with a seeded generator, so every run trains
# the downstream models on the same rows (consistent with random_state=42 elsewhere).
final_subsample_rng = np.random.default_rng(42)
num_samples_to_keep = int(len(X_train_resampled) * fraction_to_keep)
random_indices = final_subsample_rng.choice(len(X_train_resampled), num_samples_to_keep, replace=False)

#save the sub-sampled data to temporary variable names
X_train_subsampled = X_train_resampled[random_indices]
y_train_subsampled = y_train_label_resampled.iloc[random_indices]   #.iloc selects the labels by position, matching the positional row indices

#save the sub-sampled data back to the original variable names that are used in subsequent sections
# NOTE: this overwrites the inputs, so re-running this cell subsamples the already
# reduced data again. Re-run from the resampling cell for a clean state.
X_train_resampled = X_train_subsampled
y_train_label_resampled = y_train_subsampled


print(f"After downsampling the data without losing too much accuracy, the new size of the dataset is:")
print(f"   X_train_resampled       contains", len(X_train_resampled), "rows")
print(f"   y_train_label_resampled contains", len(y_train_label_resampled), "rows")

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

# Model training with traditional classifiers

## Logistic Regression

In [None]:
# Baseline Logistic Regression: train with the library's default hyperparameters
clf = LogisticRegression()

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_lr_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# We want to see approximately equal results from TN and TP
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## LR hyperparameter optimization

In [None]:
# Tune Logistic Regression with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the model (used as the template estimator for the grid search)
clf = LogisticRegression()

# Define the hyperparameters to tune
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300],
    'random_state': [42]                 #for reproducible results
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
lr_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
lr_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_lr = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_lr_undersampled_optimized      = Accuracy
sensitivity_lr_undersampled_optimized   = Sensitivity
specificity_lr_undersampled_optimized   = Specificity
geometricmean_lr_undersampled_optimized = GeometricMean
precision_lr_undersampled_optimized     = Precision
recall_lr_undersampled_optimized        = Recall
f1_lr_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## Decision Tree

In [None]:
# Baseline Decision Tree: default hyperparameters, apart from a fixed seed.
# random_state is pinned (as in the Random Forest and MLP baseline cells) because the tree
# permutes features at each split, so an unseeded tree can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_dt_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## DT hyperparameter optimization

In [None]:
# Tune the Decision Tree with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the DecisionTreeClassifier model (template estimator for the grid search)
clf = DecisionTreeClassifier()

# Define the hyperparameters to tune
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [42]                 #for reproducible results
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
dt_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
dt_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_dt = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_dt_undersampled_optimized      = Accuracy
sensitivity_dt_undersampled_optimized   = Sensitivity
specificity_dt_undersampled_optimized   = Specificity
geometricmean_dt_undersampled_optimized = GeometricMean
precision_dt_undersampled_optimized     = Precision
recall_dt_undersampled_optimized        = Recall
f1_dt_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## Decision Stump 
Decision Stump is a special case of the Decision Tree classifier with max_depth=1

The term "Decision Stump" typically refers to a decision tree with only one level, meaning it makes decisions based on a single feature. 

The main hyperparameter for a decision stump is the splitting criterion; the feature to split on (and its threshold) is not a hyperparameter but is learned from the training data.

However, since decision stumps are simple, there might not be a lot of hyperparameters to optimize compared to more complex models.


In [None]:
# Baseline Decision Stump (a Decision Tree limited to a single split), followed by a comparison
# against the baseline Decision Tree.
# The comparison used to live in its own cell *above* this training code, where
# accuracy_ds_undersampled_unoptimized was not yet defined (NameError on a fresh "Run All");
# it now runs after the stump's accuracy has been computed.

# Create an instance of the DecisionTreeClassifier model
# random_state is pinned (as in the Random Forest and MLP baseline cells) for reproducible results
clf = DecisionTreeClassifier(max_depth=1, random_state=42)

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)


# save accuracy for later comparison
accuracy_ds_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# check to see if there is any benefit to using Decision Stump instead of Decision Tree
if (accuracy_ds_undersampled_unoptimized < accuracy_dt_undersampled_unoptimized):
    print("NOTE: Decision Stump is a special case of Decision Tree with max_depth=1, but does not seem to be beneficial for this dataset.")
    print(f"Decision Tree accuracy is {accuracy_dt_undersampled_unoptimized*100:.2f}%, while Decision Stump accuracy is only {accuracy_ds_undersampled_unoptimized*100:.2f}%")

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## DS hyperparameter optimization


Remember that decision stumps are very simple models, and hyperparameter tuning might not have as much impact as it would on more complex models. It's always a good practice to experiment and validate the performance on a validation set or through cross-validation.

In [None]:
# Tune the Decision Stump with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the DecisionTreeClassifier model with max_depth=1 (template for the grid search)
clf = DecisionTreeClassifier(max_depth=1)

# Define the hyperparameters to tune
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1],                    #single candidate: keeps every model in the search a stump
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [42]                 #for reproducible results
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
ds_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
ds_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_ds = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_ds_undersampled_optimized      = Accuracy
sensitivity_ds_undersampled_optimized   = Sensitivity
specificity_ds_undersampled_optimized   = Specificity
geometricmean_ds_undersampled_optimized = GeometricMean
precision_ds_undersampled_optimized     = Precision
recall_ds_undersampled_optimized        = Recall
f1_ds_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

In [None]:
# Compare the tuned Decision Stump with the tuned Decision Tree.
# A note is printed only when the stump scores strictly lower; otherwise the cell is silent.
if accuracy_dt_undersampled_optimized > accuracy_ds_undersampled_optimized:
    print(
        "NOTE: Decision Stump is a special case of Decision Tree with max_depth=1, but does not seem to be beneficial for this dataset.",
        "Decision Tree accuracy is {:.2f}%, while Decision Stump accuracy is only {:.2f}%".format(
            accuracy_dt_undersampled_optimized * 100,
            accuracy_ds_undersampled_optimized * 100,
        ),
        sep="\n",
    )

## Random Forest Classifier

In [None]:
# Baseline Random Forest: default hyperparameters, all CPU cores, fixed seed for reproducibility
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_rf_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## RF hyperparameter optimization

In [None]:
# Tune the Random Forest with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the RandomForestClassifier model (template estimator for the grid search)
# NOTE(review): n_jobs=-1 is set both here and on GridSearchCV, i.e. parallelism is nested;
#               confirm this does not oversubscribe the CPU on the target machine.
clf = RandomForestClassifier(n_jobs=-1)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10],
    'random_state': [42]                 #for reproducible results
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
# Unlike RandomForestClassifier(**best_params), the refitted estimator also keeps n_jobs=-1 from the
# template above, so prediction and the cross validation below stay multi-core.
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
rf_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
rf_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_rf = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_rf_undersampled_optimized      = Accuracy
sensitivity_rf_undersampled_optimized   = Sensitivity
specificity_rf_undersampled_optimized   = Specificity
geometricmean_rf_undersampled_optimized = GeometricMean
precision_rf_undersampled_optimized     = Precision
recall_rf_undersampled_optimized        = Recall
f1_rf_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## Naive Bayes 

In [None]:
# Baseline Naive Bayes: train with the library's default hyperparameters
# Create an instance of the model
#clf = GaussianNB()    # suitable for continuous features
#clf = MultinomialNB() # used for discrete data like word counts
clf = BernoulliNB()    # suitable for binary data, gives best accuracy for this dataset

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_nb_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## NB hyperparameter optimization

In [None]:
# Tune Bernoulli Naive Bayes with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the model (template estimator for the grid search)
clf = BernoulliNB()

# Define the hyperparameters to tune
# Only the additive smoothing strength (alpha) is searched.
# (The previous comment here about "sigmoid and poly kernels" was copied from the SVM cell and did not apply.)
# NOTE(review): BernoulliNB's default alpha (1.0) is not among the candidates, so the tuned model is
#               not guaranteed to match or beat the baseline cell -- consider adding it to the grid.
param_grid = {'alpha': [0.1, 0.01, 0.001]}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
print("Performing GridSearchCV")
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
print("Fitting the model")
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
nb_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
nb_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_nb = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_nb_undersampled_optimized      = Accuracy
sensitivity_nb_undersampled_optimized   = Sensitivity
specificity_nb_undersampled_optimized   = Specificity
geometricmean_nb_undersampled_optimized = GeometricMean
precision_nb_undersampled_optimized     = Precision
recall_nb_undersampled_optimized        = Recall
f1_nb_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## SVM classifier

In [None]:
# Baseline SVM: train a support vector classifier with the library's default hyperparameters
# Create an instance of the model
clf = SVC()

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_svm_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## SVM hyperparameter optimization

In [None]:
# Heads-up for anyone watching the run: printed before the SVM grid search in the following cell starts
print("WARNING: SVM hyperparameter optimization is very CPU-intensive, this will take some time...")

In [None]:
# Tune the SVM with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the model (template estimator for the grid search)
clf = SVC()

# Define the hyperparameters to tune
# skip the sigmoid and poly kernels, rarely used
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'probability': [True],               #probability=True is required for VotingClassifier
    'random_state': [42]                 #for reproducible results
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
print("Performing GridSearchCV")
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
# This matters most here: an extra SVC fit with probability=True is one of the slowest steps in the notebook.
print("Fitting the model")
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
svm_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
svm_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_svm = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_svm_undersampled_optimized      = Accuracy
sensitivity_svm_undersampled_optimized   = Sensitivity
specificity_svm_undersampled_optimized   = Specificity
geometricmean_svm_undersampled_optimized = GeometricMean
precision_svm_undersampled_optimized     = Precision
recall_svm_undersampled_optimized        = Recall
f1_svm_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## KNN classifier 

In [None]:
# Baseline K-Nearest Neighbours
# Create an instance of the model with the desired number of neighbors (you can adjust n_neighbors)
clf = KNeighborsClassifier(n_neighbors=5)  # You can change the value of n_neighbors as needed

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_knn_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## KNN hyperparameter optimization

In [None]:
# Tune K-Nearest Neighbours with an exhaustive grid search, then evaluate the winner on the test set.
# Create an instance of the model (template estimator for the grid search)
clf = KNeighborsClassifier()

# Define the hyperparameters to tune
param_grid = {
    'n_neighbors': [5,10,15,20,30],
    'weights': ['uniform', 'distance']
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV (refit=True by default) has already retrained the best configuration on the full
# training set, so reuse that fitted estimator instead of constructing and fitting the same model again.
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score works on clones, the fitted clf is left untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
knn_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
knn_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_knn = best_params

# call previously defined function to create confusion matrix
# (Accuracy is re-bound here to the value returned by the helper)
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_knn_undersampled_optimized      = Accuracy
sensitivity_knn_undersampled_optimized   = Sensitivity
specificity_knn_undersampled_optimized   = Specificity
geometricmean_knn_undersampled_optimized = GeometricMean
precision_knn_undersampled_optimized     = Precision
recall_knn_undersampled_optimized        = Recall
f1_knn_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## MLP Multi-Layer Perceptron deep neural network classifier

MLPClassifier is a class in scikit-learn that represents a Multi-layer Perceptron (MLP) classifier, which is a type of artificial neural network. 

An MLP is a feedforward neural network that consists of multiple layers of nodes (neurons) and can learn complex patterns and relationships in data. 

The MLPClassifier is specifically designed for classification tasks.

Example showing commonly used hyperparameters (MLPClassifier accepts more than are listed here):

    mlp_classifier = MLPClassifier(
        hidden_layer_sizes=(100, 50),  # Architecture of hidden layers
        activation='relu',             # Activation function ('relu' is common)
        solver='adam',                 # Optimization solver
        alpha=0.0001,                  # L2 penalty (regularization)
        batch_size='auto',             # Size of mini-batches ('auto' uses min(200, n_samples))
        learning_rate='constant',      # Learning rate schedule
        learning_rate_init=0.001,      # Initial learning rate
        max_iter=500,                  # Maximum number of iterations
        shuffle=True,                  # Shuffle data in each iteration
        random_state=42,               # Random seed for reproducibility
        verbose=True                   # Print progress during training
    )




In [None]:
# Baseline Multi-Layer Perceptron: default hyperparameters with a fixed seed for reproducibility
# Create an instance of the model
clf = MLPClassifier(random_state=42)

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model using the predictions computed above.
# clf.score() would run predict() over X_test a second time just to produce the same accuracy value.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_mlp_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
# NOTE(review): the tuned-model cells unpack eight values from this helper, so `cm` presumably
#               receives the whole result here rather than just the matrix -- confirm.
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## MLP hyperparameter optimization

In [None]:
#mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)


# Create an instance of the model
clf = MLPClassifier()

# Define the hyperparameters to tune
param_grid = {
    'hidden_layer_sizes': [(100, 50), (50, 25), (150, 100)],  #tuples for hidden layers
    'max_iter': [300, 500, 800],
    'alpha': [0.0001, 0.001, 0.01],
    'random_state': [42]                 #for reproducible results
}


# other examples to use in param_grid for testing
#param_grid = {
#    'hidden_layer_sizes': [(50, 25), (100, 50), (100, 100)],
#    'activation': ['relu', 'tanh'],
#    'alpha': [0.0001, 0.001, 0.01],
#    'learning_rate': ['constant', 'adaptive'],
#    'max_iter': [200, 300, 500],
#}


# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# Create a new instance of the model with the best hyperparameters
clf = MLPClassifier(**best_params)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
mlp_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
mlp_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_mlp = best_params

# call previously defined function to create confusion matrix
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_mlp_undersampled_optimized      = Accuracy
sensitivity_mlp_undersampled_optimized   = Sensitivity
specificity_mlp_undersampled_optimized   = Specificity
geometricmean_mlp_undersampled_optimized = GeometricMean
precision_mlp_undersampled_optimized     = Precision
recall_mlp_undersampled_optimized        = Recall
f1_mlp_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time() 

## GB Gradient Boosting classifier

XGBoost and gradient boosting are both ensemble learning models.
Gradient boosting is built into sklearn, but XGBoost requires installing a separate package.
Let's start with gradient boosting.




model = GradientBoostingClassifier(
    n_estimators=100,           # Number of boosting stages (trees)
    learning_rate=0.1,          # Step size shrinkage to prevent overfitting
    max_depth=3,                # Maximum tree depth
    random_state=42             # Seed for reproducibility
)




In [None]:
# Baseline Gradient Boosting: train with the library's default hyperparameters (seed
# fixed) to get a reference accuracy before any tuning.
clf = GradientBoostingClassifier(random_state=42)

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Fit the model to the resampled training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# Evaluate the model.  Score the predictions we already hold rather than calling
# clf.score(), which would run a second full prediction pass over X_test.
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)

# save accuracy for later comparison
accuracy_gb_undersampled_unoptimized = accuracy

# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## GB hyperparameter optimization

In [None]:
# Tune Gradient Boosting with an exhaustive grid search, then evaluate the winning
# configuration on the held-out test set and record its metrics.

# Create an instance of the model
clf = GradientBoostingClassifier()

default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [10, 100, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'random_state': [42]                 #for reproducible results
}

# Create an instance of GridSearchCV (n_jobs=-1 spreads the candidate fits over all cores)
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)

# GridSearchCV refits the best configuration on the whole training set by default
# (refit=True), so reuse that fitted model instead of constructing and training an
# identical one a second time.
clf = grid_search.best_estimator_

# Predict the labels for the test data
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score clones clf, so the fitted model is untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
gb_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
gb_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save best parameters for later comparison
best_params_gb = best_params

# call previously defined function to create confusion matrix
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_gb_undersampled_optimized      = Accuracy
sensitivity_gb_undersampled_optimized   = Sensitivity
specificity_gb_undersampled_optimized   = Specificity
geometricmean_gb_undersampled_optimized = GeometricMean
precision_gb_undersampled_optimized     = Precision
recall_gb_undersampled_optimized        = Recall
f1_gb_undersampled_optimized            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

## XGB XGBoost eXtreme Gradient Boosting classifier

XGBoost (eXtreme Gradient Boosting) is a popular and powerful open-source machine learning library designed for speed and performance. 

It is an implementation of gradient boosting, a machine learning technique that builds a series of weak learners (typically decision trees) and combines their predictions to create a stronger, more accurate model.

XGBoost is known for its efficiency, scalability, and ability to handle diverse types of data.

XGBoost is not built into sklearn, you will need to install the package with: pip install xgboost


In this example, the xgb.DMatrix is a data structure that XGBoost uses for efficient training. 
The params dictionary contains various hyperparameters for the XGBoost model, and xgb.train is used to train the model. 
Finally, predictions are made on the test set, and the accuracy is evaluated.

In [None]:
# Baseline XGBoost model trained through the native xgb.train API.
# The xgboost library is not part of the default install of sklearn.  sys.modules only
# lists modules that have already been imported in this session, so this guard passes
# only when xgboost was imported successfully earlier (presumably as `xgb` in the
# imports cell - confirm).
if 'xgboost' in sys.modules:
    print(f"Confirmed xgboost library is installed")

    # Convert data to DMatrix format (optimized data structure for XGBoost)
    dtrain = xgb.DMatrix(X_train_resampled, label=y_train_label_resampled)
    dtest  = xgb.DMatrix(X_test, label=y_test_label)

    # Set parameters for XGBoost
    # NOTE(review): the notebook describes a two-class problem (normal vs attack) but
    # num_class is 3 - confirm whether a binary objective was intended.
    params = {
        'objective': 'multi:softmax',  # Multi-class classification
        'num_class': 3,  # Number of classes
        'max_depth': 3,
        'eta': 0.1,
        'eval_metric': 'merror'  # Mean classification error
    }

    # Train the XGBoost model
    num_rounds = 100
    xgb_model = xgb.train(params, dtrain, num_rounds)

    # Make predictions on the test set
    y_pred = xgb_model.predict(dtest)

    # The booster returns floating point values; convert them to integer class labels
    y_pred = [int(round(pred)) for pred in y_pred]

    # Evaluate the accuracy of the XGBoost predictions.
    # The original cell followed this with clf.score(X_test, y_test_label), but `clf`
    # at this point is still the Gradient Boosting model from the previous cell, so the
    # XGB baseline was being recorded with GB's accuracy.  That overwrite is removed.
    accuracy = accuracy_score(y_test_label, y_pred)
    print(f"Accuracy: {accuracy}")

    # save accuracy for later comparison
    accuracy_xgb_undersampled_unoptimized = accuracy

    # call previously defined function to create confusion matrix
    cm = visualize_confusion_matrix(y_test_label, y_pred)

    # show a running total of elapsed time for the entire notebook
    show_elapsed_time()
else:
    print(f"ERROR: xgboost library is NOT installed, please install with: pip install xgboost")

## XGB hyperparameter optimization

In [None]:
# Hyperparameter search for the scikit-learn style XGBoost wrapper.
# Everything below is skipped (with an error message) unless xgboost has already been
# imported in this session, because the library is not bundled with sklearn.
if 'xgboost' in sys.modules:
    print("Confirmed xgboost library is installed")

    # Start from an estimator with library defaults and show what those defaults are
    clf = xgb.XGBClassifier()
    default_params = clf.get_params()
    print(f"Default hyperparameters are: {default_params}")
    print('\n')

    # Candidate values explored by the grid search
    param_grid = dict(
        objective=['multi:softmax'],
        num_class=[3],                       # Number of classes
        max_depth=[3, 5, 7],
        learning_rate=[0.1, 0.01, 0.001],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        n_estimators=[50, 100, 200],
        random_state=[42],                   # for reproducible results
    )
    print(f"Adjusting hyperparameters to: {param_grid}")
    print('\n')

    # Exhaustive search scored on accuracy
    print("Performing GridSearchCV")
    grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
    print("Fitting model")
    grid_search.fit(X_train_resampled, y_train_label_resampled)

    # Report the winning configuration
    best_params, best_scores = grid_search.best_params_, grid_search.best_score_
    print("Best Parameters:", best_params)
    print("Best Scores:", best_scores)

    # The search object already holds the refitted best model; use it on the test set
    clf = grid_search.best_estimator_
    y_pred = clf.predict(X_test)

    # Final cross validation; keep mean / stddev for the later model comparison
    cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
    xgb_crossval_score_mean = cross_val_score_result.mean()
    xgb_crossval_score_std  = cross_val_score_result.std()
    print(f"Cross validation scores: {cross_val_score_result}")
    print(f"Mean cross validation score: {xgb_crossval_score_mean}")
    print(f"Standard Deviation cross validation score: {xgb_crossval_score_std}")

    # Test-set accuracy
    Accuracy = accuracy_score(y_test_label, y_pred)
    print("Accuracy:", Accuracy)

    # Remember the tuned hyperparameters for the ensemble section
    best_params_xgb = best_params

    # Confusion matrix plus derived metrics from the shared helper
    (cm, Accuracy, Sensitivity, Specificity,
     GeometricMean, Precision, Recall, F1) = visualize_confusion_matrix(y_test_label, y_pred)

    # Store this model's metrics under model-specific names for later comparison
    accuracy_xgb_undersampled_optimized, sensitivity_xgb_undersampled_optimized = Accuracy, Sensitivity
    specificity_xgb_undersampled_optimized, geometricmean_xgb_undersampled_optimized = Specificity, GeometricMean
    precision_xgb_undersampled_optimized, recall_xgb_undersampled_optimized = Precision, Recall
    f1_xgb_undersampled_optimized = F1

    # Running total of elapsed time for the entire notebook
    show_elapsed_time()
else:
    print("ERROR: xgboost library is NOT installed, please install with: pip install xgboost")

# Compare accuracy of LR, DT, DS, RF, NB, SVM, KNN, MLP, GB, XGB

In [None]:
# Side-by-side accuracy of every base model, before and after hyperparameter tuning.

if is_data_scaled == "yes":
    print("NOTE: This dataset has been scaled to avoid skewing the results due to large data distribution")
elif is_data_scaled == "no":
    print("NOTE: This dataset has NOT been scaled, so the results may be inaccurate!")
print('\n')

# One table-driven loop replaces twenty copy-pasted print statements.  The saved results
# are looked up by name so that a model whose cells were skipped (the XGB cells only run
# when xgboost is importable) is reported as missing instead of raising NameError.
for _model_key in ("lr", "dt", "ds", "rf", "nb", "svm", "knn", "mlp", "gb", "xgb"):
    # "after " carries a trailing space so both lines of a pair stay column-aligned
    for _stage, _suffix in (("before", "unoptimized"), ("after ", "optimized")):
        _saved_accuracy = globals().get(f"accuracy_{_model_key}_undersampled_{_suffix}")
        if _saved_accuracy is None:
            _shown = "not available"
        else:
            _shown = f"{_saved_accuracy*100:.2f}%"
        print(f"{_model_key.upper():<3} accuracy on undersampled balanced data, {_stage} hyperparameter optimization: {_shown}")
    print('\n')




# Model training with ensemble learning
This section takes the individual ML algorithms tested earlier, then runs them through an ensemble model
The goal is to see if ensemble learning can give us higher accuracy

## Descriptions of ensemble classifiers

__Voting Classifier:__  2 methods: hard voting (majority vote), and soft voting (takes the average of predictive probabilities, takes the class with the highest average probability)

__Stacking Classifier:__ Generates a final model based on multiple base models.  Predictions in intermediate steps are used to generate meta-models.

__Boosting Classifier:__ Trains a weak model, generates a new model focused on the poorly performing instances, and tweaks the weights to get better accuracy.  The AdaBoostClassifier is an ensemble learning algorithm that belongs to the family of boosting methods. It is specifically designed for binary classification problems but can be extended to multi-class classification. AdaBoost stands for Adaptive Boosting, and its primary goal is to combine the predictions from multiple weak classifiers to create a strong classifier.

__Bagging Classifier:__ Bagging (Bootstrap Aggregating) is an ensemble learning technique that aims to improve the stability and accuracy of machine learning models. It involves training multiple instances of the same base model on different subsets of the training data. The predictions from individual models are then combined, often by averaging or voting, to produce the final prediction.  BaggingClassifier is a powerful ensemble technique that is particularly effective when applied to base models with high variance. It offers improved generalization, stability, and robustness, but it may not be the optimal choice for all scenarios, and its effectiveness depends on the characteristics of the base model and the dataset.

__Comparison Table__

| Method   | Combines Models | Strengths                                                   | Weaknesses                              |
|----------|-----------------|-------------------------------------------------------------|-----------------------------------------|
| Voting   | Yes             | Simple, effective for balancing out model weaknesses.       | Not as sophisticated as other methods.  |
| Stacking | Yes             | Can leverage the strengths of a combination of models.      | Risk of overfitting.                    |
| Boosting | No              | Can turn a weak model into a strong one.                    | Sensitive to noisy data and outliers.   |
| Bagging  | No              | Minimizes overfitting with data with high variance          | Depends on base model performance       |




## Setup common parameters for all the EL models

In [None]:
# test to be removed
#best_params_svm = {'C': 10, 'kernel': 'rbf', 'probability': True, 'random_state': 42}
#best_params_knn = {'n_neighbors': 10, 'weights': 'distance'}
#best_params_mlp = {'alpha': 0.01, 'hidden_layer_sizes': (50, 25), 'max_iter': 300, 'random_state': 42}
#best_params_gb = {}
#best_params_xgb = {}


In [None]:
# Recap of the tuned hyperparameters saved for each base model.
for _tag, _tuned in (
    ("LR",  best_params_lr),
    ("DT",  best_params_dt),
    ("DS",  best_params_ds),
    ("RF",  best_params_rf),
    ("NB",  best_params_nb),
    ("SVM", best_params_svm),
    ("KNN", best_params_knn),
    ("MLP", best_params_mlp),
    ("GB",  best_params_gb),
    ("XGB", best_params_xgb),
):
    _heading = _tag + ":"
    # pad the heading to four characters so the values line up in one column
    print(f"Best parameters for {_heading:<4} {_tuned}")

In [None]:
# Build one estimator per base model from the tuned hyperparameter dictionaries saved
# earlier.  Each dictionary is expanded with ** so its key/value pairs become keyword
# arguments of the matching constructor; the estimators are created here but not fitted.
lr_clf = LogisticRegression(**best_params_lr)
dt_clf = DecisionTreeClassifier(**best_params_dt)
ds_clf = DecisionTreeClassifier(**best_params_ds)
rf_clf = RandomForestClassifier(**best_params_rf)
nb_clf = BernoulliNB(**best_params_nb)
# need probability=True for voting classifier, already set in hyperparameter optimization section
svm_clf = SVC(**best_params_svm)
knn_clf = KNeighborsClassifier(**best_params_knn)
mlp_clf = MLPClassifier(**best_params_mlp)
gb_clf = GradientBoostingClassifier(**best_params_gb)
xgb_clf = xgb.XGBClassifier(**best_params_xgb)

# Echo every configured estimator so the settings actually applied are visible
for _tag, _estimator in (
    ("LR",  lr_clf),
    ("DT",  dt_clf),
    ("DS",  ds_clf),
    ("RF",  rf_clf),
    ("NB",  nb_clf),
    ("SVM", svm_clf),
    ("KNN", knn_clf),
    ("MLP", mlp_clf),
    ("GB",  gb_clf),
    ("XGB", xgb_clf),
):
    _heading = _tag + ":"
    print(f"Best parameters for {_heading:<4} {_estimator}")

## Voting classifier

In this example:

SVC, KNeighborsClassifier, and RandomForestClassifier are individual classifiers.

A VotingClassifier is created with these classifiers and a soft voting strategy. Soft voting predicts the class label based on the argmax of the sums of the predicted probabilities.

The ensemble model is trained on the training set.

Predictions are made on the test set, and the performance of the ensemble model is evaluated.

You can adjust the parameters of the individual classifiers and the VotingClassifier based on your specific needs. Note that not all classifiers support probability estimates (probability=True), so make sure to check the documentation for each classifier.

Ensemble methods like VotingClassifier are beneficial when combining diverse models that capture different aspects of the data, leading to a more robust and accurate overall model.






In [None]:
# Try the voting classifier with all the base models

# Create a VotingClassifier with 'soft' voting (uses predicted probabilities)
clf = VotingClassifier(
    estimators=[('lr', lr_clf), ('dt', dt_clf), ('rf', rf_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf), ('gb', gb_clf)],
    voting='soft'  # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model.  Score the predictions we already hold: clf.score() would push the
# whole test set through all eight member models a second time.
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save accuracy for later comparison
accuracy_ensemble_voting = Accuracy

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

In [None]:
# Try the voting classifier with stronger learners to see if you get better accuracy

# Create a VotingClassifier with 'soft' voting (uses predicted probabilities)
clf = VotingClassifier(
    estimators=[('svm', svm_clf), ('rf', rf_clf), ('dt', dt_clf)],
    voting='soft'  # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model.  Score the predictions we already hold: clf.score() would push the
# whole test set through every member model a second time.
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save accuracy for later comparison (replaces the value stored by the previous voting cell)
accuracy_ensemble_voting = Accuracy

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

In [None]:
# Soft-voting ensemble restricted to the weakest base models (LR, NB, SVM, KNN, MLP).
# 'soft' voting combines predicted probabilities; 'hard' would be a majority vote.
clf = VotingClassifier(
    estimators=[
        ('lr', lr_clf),
        ('nb', nb_clf),
        ('svm', svm_clf),
        ('knn', knn_clf),
        ('mlp', mlp_clf),
    ],
    voting='soft',
)

# Train on the resampled training data, then predict the held-out test set
clf.fit(X_train_resampled, y_train_label_resampled)
y_pred = clf.predict(X_test)

# Final cross validation; keep mean / stddev for the later model comparison
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
voting_crossval_score_mean = cross_val_score_result.mean()
voting_crossval_score_std  = cross_val_score_result.std()
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {voting_crossval_score_mean}")
print(f"Standard Deviation cross validation score: {voting_crossval_score_std}")

# Test-set accuracy
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# Keep the fitted ensemble itself as the "best parameters" record for voting
best_params_ensemble_voting = clf

# Confusion matrix plus derived metrics from the shared helper
(cm, Accuracy, Sensitivity, Specificity,
 GeometricMean, Precision, Recall, F1) = visualize_confusion_matrix(y_test_label, y_pred)

# Store this ensemble's metrics for later comparison to other models
accuracy_ensemble_voting, sensitivity_ensemble_voting = Accuracy, Sensitivity
specificity_ensemble_voting, geometricmean_ensemble_voting = Specificity, GeometricMean
precision_ensemble_voting, recall_ensemble_voting = Precision, Recall
f1_ensemble_voting = F1

# Running total of elapsed time for the entire notebook
show_elapsed_time()

### Voting hyperparameter optimization

In [None]:
# The following cell takes ~20 minutes to run, but tuning the hyperparameters, does not improve the accuracy over the previous cell,
# so the following cell has been commented out to save processing time

In [None]:
# # Create the VotingClassifier
# clf = VotingClassifier(estimators=[('lr', lr_clf), ('svm', svm_clf), ('nb', nb_clf), ('knn', knn_clf), ('mlp', mlp_clf)])


# # Define the hyperparameters to tune
# param_grid = {
#     'voting': ['hard', 'soft'],                    # Include options for hard and soft voting
#     'weights': [None, [1, 2, 1, 2, 3]]             # If using soft voting, you can also optimize the weights, higher values mean more weight for that estimator
#     #'lr__C':  [0.1, 1, 10],                       # LR  hyperparameter tuning not required because parameters have already been optimized, so tuning here would be redundant
#     #'svm__C': [0.1, 1, 10],                       # SVC hyperparameter tuning not required because parameters have already been optimized, so tuning here would be redundant
#     #'knn__n_neighbors': [5, 10, 30],              # KNN hyperparameter tuning not required because parameters have already been optimized, so tuning here would be redundant
#     #'nb__alpha':  [0.1, 0.01, 0.001],             # NB  hyperparameter tuning not required because parameters have already been optimized, so tuning here would be redundant
#     #'mlp__alpha': [0.1, 0.01, 0.001]              # MLP hyperparameter tuning not required because parameters have already been optimized, so tuning here would be redundant
# }


# # Use GridSearchCV for hyperparameter tuning
# print(f"Performing GridSearchCV")
# grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
# print(f"Fitting model")
# grid_search.fit(X_train_resampled, y_train_label_resampled)

# # Validate on Test Set
# clf = grid_search.best_estimator_
# print(f"Found best_estimator_  {clf}")
# y_pred = clf.predict(X_test)

# # final cross validation
# cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
# print(f"Cross validation scores: {cross_val_score_result}")
# print(f"Mean cross validation score: {cross_val_score_result.mean()}")
# print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# voting_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
# voting_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# # Evaluate the model
# Accuracy = accuracy_score(y_test_label, y_pred)
# print("Accuracy:", Accuracy)

# # save best parameters for later comparison
# best_params_ensemble_voting = clf

# # call previously defined function to create confusion matrix
# cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# # save results calculated for this model for later comparison to other models
# accuracy_ensemble_voting      = Accuracy
# sensitivity_ensemble_voting   = Sensitivity
# specificity_ensemble_voting   = Specificity
# geometricmean_ensemble_voting = GeometricMean
# precision_ensemble_voting     = Precision
# recall_ensemble_voting        = Recall
# f1_ensemble_voting            = F1

# # show a running total of elapsed time for the entire notebook
# show_elapsed_time() 

In [None]:
# Grid search over the VotingClassifier's own settings (voting mode and member weights).
# The member models keep the hyperparameters tuned earlier; re-tuning entries such as
# 'lr__C', 'svm__C', 'knn__n_neighbors', 'nb__alpha' or 'mlp__alpha' here would be redundant.
clf = VotingClassifier(
    estimators=[
        ('lr', lr_clf),
        ('svm', svm_clf),
        ('nb', nb_clf),
        ('knn', knn_clf),
        ('mlp', mlp_clf),
    ]
)

param_grid = dict(
    voting=['hard', 'soft'],              # try both majority and probability-based voting
    weights=[None, [1, 2, 1, 2, 3]],      # None = equal weights; higher values mean more weight for that estimator
)

# Use GridSearchCV for hyperparameter tuning, scored on accuracy
print("Performing GridSearchCV")
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
print("Fitting model")
grid_search.fit(X_train_resampled, y_train_label_resampled)

# Validate the refitted best ensemble on the test set
clf = grid_search.best_estimator_
print(f"Found best_estimator_  {clf}")
y_pred = clf.predict(X_test)

# Final cross validation; keep mean / stddev for the later model comparison
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
voting_crossval_score_mean = cross_val_score_result.mean()
voting_crossval_score_std  = cross_val_score_result.std()
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {voting_crossval_score_mean}")
print(f"Standard Deviation cross validation score: {voting_crossval_score_std}")

# Test-set accuracy
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# Keep the fitted ensemble itself as the "best parameters" record for voting
best_params_ensemble_voting = clf

# Confusion matrix plus derived metrics from the shared helper
(cm, Accuracy, Sensitivity, Specificity,
 GeometricMean, Precision, Recall, F1) = visualize_confusion_matrix(y_test_label, y_pred)

# The per-metric saves below are deliberately left disabled in this cell.
# NOTE(review): presumably so the metrics stored by the earlier voting cell remain the
# ones used in later comparisons - confirm before re-enabling.
#accuracy_ensemble_voting      = Accuracy
#sensitivity_ensemble_voting   = Sensitivity
#specificity_ensemble_voting   = Specificity
#geometricmean_ensemble_voting = GeometricMean
#precision_ensemble_voting     = Precision
#recall_ensemble_voting        = Recall
#f1_ensemble_voting            = F1

# Running total of elapsed time for the entire notebook
show_elapsed_time()

## Stacking classifier

This model (StackingClassifier) uses multiple base estimators such as LR, NB, SVC, KNN, etc.

A StackingClassifier is created with these multiple base classifiers and a meta-classifier (LogisticRegression) as the final estimator.

The stacking ensemble model is trained on the training set.

Predictions are made on the test set, and the performance of the stacking ensemble model is evaluated.

You can customize the base estimators, the final estimator, and other parameters of the StackingClassifier based on your specific needs.

In [None]:
# Try all the base estimators with the default final_estimator

# Create a stacking ensemble model with a logistic regression meta-classifier.
# Every previously tuned base classifier is included; their predictions are the inputs to the meta-classifier.
clf = StackingClassifier(
    estimators=[('lr', lr_clf), ('dt', dt_clf), ('rf', rf_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf), ('gb', gb_clf)],
    final_estimator=LogisticRegression()
)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model.
# Score the predictions we already have: clf.score(X_test, y_test_label) would run the whole stack
# over X_test a second time just to compute the same accuracy value.
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save accuracy for later comparison
accuracy_ensemble_stacking = Accuracy

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

In [None]:
# Try only the strongest base classifiers in the stacking classifier,  with the default final_estimator

# Stack the tree-based learners (decision tree, random forest, gradient boosting) under a logistic regression meta-classifier
clf = StackingClassifier(
    estimators=[('dt', dt_clf), ('rf', rf_clf),  ('gb', gb_clf)],
    final_estimator=LogisticRegression()
)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model.
# Score the predictions we already have: clf.score(X_test, y_test_label) would run the whole stack
# over X_test a second time just to compute the same accuracy value.
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save accuracy for later comparison
accuracy_ensemble_stacking = Accuracy

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

In [None]:
# Try only the weakest base models with the default final_estimator

# Create a stacking ensemble model with a logistic regression meta-classifier
clf = StackingClassifier(
    estimators=[('lr', lr_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf)],
    final_estimator=LogisticRegression()
)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# final cross validation (cross_val_score refits clones of clf on each fold; the fitted clf above is untouched)
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
stacking_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
stacking_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model.
# Score the predictions we already have: clf.score(X_test, y_test_label) would run the whole stack
# over X_test a second time just to compute the same accuracy value.
Accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", Accuracy)

# save the fitted model for later comparison
best_params_ensemble_stacking = clf

# call previously defined function to create confusion matrix
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_ensemble_stacking      = Accuracy
sensitivity_ensemble_stacking   = Sensitivity
specificity_ensemble_stacking   = Specificity
geometricmean_ensemble_stacking = GeometricMean
precision_ensemble_stacking     = Precision
recall_ensemble_stacking        = Recall
f1_ensemble_stacking            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()


In [None]:
# call previously defined function to re-display the confusion matrix for the most recent predictions.
# visualize_confusion_matrix() returns a tuple whose first item is the matrix (the other call sites unpack
# eight values from it), so keep only that first item in cm rather than binding cm to the whole tuple.
cm = visualize_confusion_matrix(y_test_label, y_pred)[0]

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

### Stacking hyperparameter optimization

In [None]:
# The following cell takes ~8 hours to run all the final_estimators, but does increase the accuracy by ~4%

# After running the following cell once, we know that a final_estimator of SVM gives the highest accuracy

# Each final_estimator takes approximately this much time to run:
#   NB   40 minutes, accuracy 0.5140
#   SVM  55 minutes, accuracy 0.9134
#   KNN  45 minutes, accuracy 0.9008
#   MLP 180 minutes, accuracy 0.9119
#   LR  140 minutes, accuracy 0.8717
# So let's speed up subsequent executions by skipping everything except SVM as the final_estimator

In [None]:
# Start with multiple base_estimators, then loop over each candidate final_estimator:
#   1. grid-search the final_estimator's hyperparameters on the training set
#   2. score the best model found on the test set
#   3. remember which final_estimator gave the highest test accuracy
# The *_ensemble_stacking and stacking_crossval_* variables are only overwritten by the best candidate,
# so later comparison cells see the winner rather than whichever candidate happened to run last.


estimator_type = "weak"   #strong|weak flag to determine which base estimators to use

strong_base_estimators = [('rf', rf_clf), ('gb', gb_clf), ('dt', dt_clf)]
weak_base_estimators   = [('lr', lr_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf)]
#final_estimators       = ['RandomForestClassifier', 'DecisionTreeClassifier', 'GradientBoostingClassifier','LogisticRegression','BernoulliNB', 'SVC', 'KNN', 'MLPClassifier']
#final_estimators       = ['BernoulliNB', 'SVC', 'KNN', 'MLPClassifier', 'LogisticRegression']
#final_estimators       = ['BernoulliNB', 'SVC', 'KNN', 'MLPClassifier', 'LogisticRegression']
final_estimators       = ['SVC']  #we know SVC gives the highest accuracy, save time by skipping the other final_estimators


# Pick the base estimators; fail fast on a typo instead of silently reusing a stale base_estimators variable
if estimator_type == "strong":
    base_estimators = strong_base_estimators
elif estimator_type == "weak":
    base_estimators = weak_base_estimators
else:
    raise ValueError(f"estimator_type must be 'strong' or 'weak', got {estimator_type!r}")

best_final_estimator_name     = "none"
best_final_estimator_accuracy = 0   #initialize value to keep track of the accuracy level of each final classifier

for my_final_estimator in final_estimators:
    print('\n')
    print(f"Testing hyperparameter optimization with {estimator_type} base estimators {base_estimators} and final_estimator={my_final_estimator}")

    # Choose the meta-classifier and the grid of its tunable hyperparameters.
    # The "final_estimator__" prefix routes each parameter to the StackingClassifier's final_estimator.
    if my_final_estimator == 'RandomForestClassifier':
        final_estimator_model = RandomForestClassifier()
        ensemble_params = {'final_estimator__n_estimators': [50, 100, 200], 'final_estimator__max_depth': [None, 5, 10, 15]}
    elif my_final_estimator == 'DecisionTreeClassifier':
        final_estimator_model = DecisionTreeClassifier()
        ensemble_params = {'final_estimator__max_depth': [None, 5, 10, 15]}
    elif my_final_estimator == 'GradientBoostingClassifier':
        final_estimator_model = GradientBoostingClassifier()
        ensemble_params = {'final_estimator__n_estimators': [10, 100, 300], 'final_estimator__learning_rate': [0.1, 0.01, 0.2], 'final_estimator__max_depth': [3,5,10]}
    elif my_final_estimator == 'LogisticRegression':
        final_estimator_model = LogisticRegression()
        ensemble_params = {'final_estimator__C': [1, 10, 100], 'final_estimator__max_iter': [100, 200, 300]}
    elif my_final_estimator == 'BernoulliNB':
        final_estimator_model = BernoulliNB()
        ensemble_params = {'final_estimator__alpha': [0.1, 0.001]}
    elif my_final_estimator == 'SVC':
        final_estimator_model = SVC()
        ensemble_params = {'final_estimator__C': [1, 10]}
    elif my_final_estimator == 'KNN':
        final_estimator_model = KNeighborsClassifier()
        ensemble_params = {'final_estimator__n_neighbors': [10,30]}
    elif my_final_estimator == 'MLPClassifier':
        final_estimator_model = MLPClassifier()
        ensemble_params = {'final_estimator__hidden_layer_sizes': [(100, 50), (50, 25), (150, 100)], 'final_estimator__max_iter': [500, 800], 'final_estimator__alpha': [0.001, 0.01]}
    else:
        # Without this guard an unknown name would silently re-test the previous iteration's ensemble
        raise ValueError(f"Unknown final_estimator name: {my_final_estimator!r}")

    ensemble = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator_model)

    print(f"Performing GridSearchCV for final_estimator={my_final_estimator}")
    ensemble_grid = GridSearchCV(ensemble, ensemble_params, cv=cv_count, scoring='accuracy')
    print(f"Fitting model")
    ensemble_grid.fit(X_train_resampled, y_train_label_resampled)

    # Validate on Test Set
    clf = ensemble_grid.best_estimator_
    print(f"Found best_estimator_  {clf}")
    y_pred = clf.predict(X_test)

    # final cross validation
    cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
    print(f"Cross validation scores: {cross_val_score_result}")
    print(f"Mean cross validation score: {cross_val_score_result.mean()}")
    print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")

    # Evaluate the model
    Accuracy = accuracy_score(y_test_label, y_pred)
    print("Accuracy:", Accuracy)

    # of all the final_estimators, check to see if this final_estimator provides the best accuracy
    is_best_so_far = Accuracy > best_final_estimator_accuracy
    if is_best_so_far:
        best_final_estimator_name     = my_final_estimator  #save the name of the final_estimator that is currently the best
        best_final_estimator_accuracy = Accuracy            #save the accuracy of the final estimator that is currently the best
        print(f"The best final_estimator so far is {best_final_estimator_name}, with accuracy of {best_final_estimator_accuracy}")
    else:
        print(f"This is not the best final_estimator")

    # call previously defined function to create confusion matrix
    cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

    # save results for later comparison to other models, but only when this candidate is the best so far
    if is_best_so_far:
        best_params_ensemble_stacking   = clf
        stacking_crossval_score_mean    = cross_val_score_result.mean()
        stacking_crossval_score_std     = cross_val_score_result.std()
        accuracy_ensemble_stacking      = Accuracy
        sensitivity_ensemble_stacking   = Sensitivity
        specificity_ensemble_stacking   = Specificity
        geometricmean_ensemble_stacking = GeometricMean
        precision_ensemble_stacking     = Precision
        recall_ensemble_stacking        = Recall
        f1_ensemble_stacking            = F1

    # show a running total of elapsed time for the entire notebook
    show_elapsed_time()


# after testing all the final_estimators, display the best one
print(f"After checking each final_estimator, the best final_estimator is {best_final_estimator_name}, with accuracy of {best_final_estimator_accuracy}")

## Bagging classifier

In [None]:
# the following cell takes ~200 minutes to run 

# Running the following cell with every base classifier takes ~8 hours, but does increase the accuracy by ~4%

# After running the following cell once, we know that a base classifier of MLP gives the highest accuracy

# Each base classifier takes approximately this much time to run:
#   NB    2 minutes, accuracy 0.7700
#   SVM 180 minutes
#   KNN  10 minutes
#   MLP  10 minutes, accuracy 0.8946
#   LR    2 minutes, accuracy 0.8756
# So let's speed up subsequent executions by  skipping the time-consuming SVM and KNN since we know they are not the best base_classifier

In [None]:
# Bagging can only use a single base classifier
# Use a for loop to test all the base classifiers with bagging, one base classifier at a time,
# keeping track of the base classifier that gives the best accuracy on the test set.
# The *_ensemble_bagging and bagging_crossval_* variables are only overwritten by the best candidate,
# so later comparison cells see the winner rather than whichever candidate happened to run last.


best_base_classifier_name     = "none"
best_base_classifier_accuracy = 0   #initialize value to keep track of the accuracy level of each base classifier

# Candidate lists tried during development, kept for reference; only the final assignment is active
#base_classifiers = [lr_clf, dt_clf, rf_clf, nb_clf, svm_clf, knn_clf, mlp_clf, gb_clf, xgb_clf]  #xgb_clf causing error?
#base_classifiers = [lr_clf, dt_clf, rf_clf, nb_clf, svm_clf, knn_clf, mlp_clf, gb_clf]           # all classifiers
#base_classifiers = [dt_clf, rf_clf, gb_clf]                                                      # strong learners
#base_classifiers = [lr_clf, nb_clf, svm_clf, knn_clf, mlp_clf]                                   # weak learners
base_classifiers = [lr_clf, nb_clf, mlp_clf]                                                     # remove SVM and KNN because we know from testing they are not the optimal base_classifier
for base_classifier in base_classifiers:
    print("\n")
    print(f"------------------------------------")
    print(f"Base classifier is {base_classifier}")
    print(f"------------------------------------")

    # Define the BaggingClassifier
    clf = BaggingClassifier(base_classifier, n_estimators=50, random_state=42)

    # Fit the model to the training data
    clf.fit(X_train_resampled, y_train_label_resampled)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Final cross validation
    cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
    print(f"Cross validation scores: {cross_val_score_result}")
    print(f"Mean cross validation score: {cross_val_score_result.mean()}")
    print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")

    # Evaluate the model
    Accuracy = accuracy_score(y_test_label, y_pred)
    print(f"Accuracy on Test Set: {Accuracy}")

    # of all the base_classifiers, check to see if this base_classifier provides the best accuracy
    # (compare the Accuracy just computed; the lowercase name "accuracy" is never assigned in this cell)
    is_best_so_far = Accuracy > best_base_classifier_accuracy
    if is_best_so_far:
        best_base_classifier_name     = base_classifier     #save the base_classifier that is currently the best
        best_base_classifier_accuracy = Accuracy            #save the accuracy of the base_classifier that is currently the best
        print(f"The best base_classifier so far is {best_base_classifier_name}, with accuracy of {best_base_classifier_accuracy}")
    else:
        print(f"This is not the best base classifier")

    # call previously defined function to create confusion matrix
    cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

    # save results for later comparison to other models, but only when this candidate is the best so far
    if is_best_so_far:
        bagging_crossval_score_mean    = cross_val_score_result.mean()
        bagging_crossval_score_std     = cross_val_score_result.std()
        accuracy_ensemble_bagging      = Accuracy
        sensitivity_ensemble_bagging   = Sensitivity
        specificity_ensemble_bagging   = Specificity
        geometricmean_ensemble_bagging = GeometricMean
        precision_ensemble_bagging     = Precision
        recall_ensemble_bagging        = Recall
        f1_ensemble_bagging            = F1

    # show a running total of elapsed time for the entire notebook
    show_elapsed_time()


# after testing all the base_classifiers, display the best one
# (no grid search runs in this cell, so there are no best_params to report here)
print(f"After checking each base_classifier, the best base_classifier is {best_base_classifier_name}, with accuracy of {best_base_classifier_accuracy}")

### Bagging hyperparameter optimization

In [None]:
# Compatibility check for the BaggingClassifier tuning cell below.
# scikit-learn 1.2.0 renamed BaggingClassifier's "base_estimator" parameter to "estimator";
# the old name is deprecated from 1.2.0 and scheduled for removal in 1.4.0.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# This cell inspects the installed version and warns when only the old parameter name is available.


# Report which scikit-learn version is installed
print("Currently installed scikit-learn version is:", sklearn.__version__)

# A default-constructed model is enough to list the parameter names this version supports
clf = BaggingClassifier()
default_params = clf.get_params()
print(f"Default parameters are {default_params}")

# The two parameter names to look for
desired_parameter1 = 'base_estimator'  # legacy name
desired_parameter2 = 'estimator'       # current name

# Only an installation older than 1.2 exposes the legacy name without the current one
supported_params = clf.get_params()
only_legacy_name_supported = (desired_parameter1 in supported_params) and (desired_parameter2 not in supported_params)

if only_legacy_name_supported:
    print('\n')
    for advice in (
        f"WARNING: the '{desired_parameter1}' parameter exists, but the '{desired_parameter2}' parameter does not exist the BaggingClassifier.",
        f"The parameter 'base_estimator' was deprecated in favor of 'estimator' in sklearn 1.2.0, will be removed entirely in sklearn 1.4.0.",
    ):
        print(advice)
    print(f"Your currently installed version of scikit-learn is", sklearn.__version__)
    for advice in (
        f"You may wish to update your installed version of scikit-learn to a minimum of 1.2.0 so you can use the 'estimator__' parameter in the next cell.",
        f"If you are unable to update your installed version of scikit-learn, you will need to change 'estimator__' to 'base_estimator__' in the following cell for compatibility with your version of scikit-learn.",
        f"If you are     using Anaconda Navigator, you can upgrade with:  conda update conda, conda update scikit-learn",
        f"If you are not using Anaconda Navigator, you can upgrade with:  pip install --upgrade scikit-learn",
    ):
        print(advice)

In [None]:
# The following cell takes ~10 hours to run, but tuning the hyperparameters does not improve the accuracy over the previous cell,
# so the following cell has been commented out to save processing time

In [None]:
# # Try different weak learners with different BaggingClassifier parameters, keeping track of which base_estimator provides the best accuracy

# best_base_estimator_name     = "none"
# best_base_estimator_accuracy = 0   #initialize value to keep track of the accuracy level of each base classifier


# base_estimators = ['lr', 'nb', 'svm', 'mlp', 'knn']                                  # weak learners
# for base_estimator in base_estimators:
#     print("\n")
#     print(f"------------------------------------")
#     print(f"Base estimator is {base_estimator}")
#     print(f"------------------------------------")

#     if (base_estimator == 'lr'):
#         clf = BaggingClassifier(LogisticRegression(), random_state=42)            # Define the BaggingClassifier 
#         param_grid = {'estimator__penalty': [best_params_lr['penalty']],          # optimized hyperparameter from base_estimator
#                       'estimator__C': [best_params_lr['C']],                      # optimized hyperparameter from base_estimator
#                       'estimator__solver': [best_params_lr['solver']],            # optimized hyperparameter from base_estimator
#                       'estimator__max_iter': [best_params_lr['max_iter']],        # optimized hyperparameter from base_estimator
#                       'n_estimators': [100],                                      # Number of base estimators
#                       'max_samples': [1.0],                                       # The proportion of samples  to draw from X to train each base estimator
#                       'max_features': [1.0]                                       # The proportion of features to draw from X to train each base estimator
#                      }

#     if (base_estimator == 'nb'):
#         clf = BaggingClassifier(BernoulliNB(), random_state=42)                   # Define the BaggingClassifier
#         param_grid = {'estimator__alpha': [best_params_nb['alpha']],              # optimized hyperparameter from base_estimator
#                       'n_estimators': [50, 100, 200],                             # Number of base estimators
#                       'max_samples': [0.5, 0.7, 1.0],                             # The proportion of samples  to draw from X to train each base estimator
#                       'max_features': [0.5, 0.7, 1.0]                             # The proportion of features to draw from X to train each base estimator
#                      }


#     if (base_estimator == 'svm'):
#         clf = BaggingClassifier(SVC(), random_state=42)                           # Define the BaggingClassifier
#         param_grid = {'estimator__C': [best_params_svm['C']],                     # optimized hyperparameter from base_estimator
#                       'estimator__kernel': [best_params_svm['kernel']],           # optimized hyperparameter from base_estimator
#                       'n_estimators': [200],                                      # Number of base estimators
#                       'max_samples': [1.0],                                       # The proportion of samples  to draw from X to train each base estimator
#                       'max_features': [1.0]                                       # The proportion of features to draw from X to train each base estimator
#                      }

#     if (base_estimator == 'knn'):
#         clf = BaggingClassifier(KNeighborsClassifier(), random_state=42)           # Define the BaggingClassifier
#         param_grid = {'estimator__n_neighbors': [best_params_knn['n_neighbors']],  # optimized hyperparameter from base_estimator
#                       'estimator__weights': [best_params_knn['weights']],          # optimized hyperparameter from base_estimator
#                       'n_estimators': [100],                                       # Number of base estimators
#                       'max_samples': [1.0],                                        # The proportion of samples  to draw from X to train each base estimator
#                       'max_features': [0.5]                                        # The proportion of features to draw from X to train each base estimator
#                      }

#     if (base_estimator == 'mlp'):
#         clf = BaggingClassifier(MLPClassifier(), random_state=42)                   # Define the BaggingClassifier
#         param_grid = {'estimator__hidden_layer_sizes': [best_params_mlp['hidden_layer_sizes']],   # optimized hyperparameter from base_estimator
#                       'estimator__max_iter': [best_params_mlp['max_iter']],         # optimized hyperparameter from base_estimator
#                       'estimator__alpha': [best_params_mlp['alpha']],               # optimized hyperparameter from base_estimator
#                       'n_estimators': [100],                                        # Number of base estimators
#                       'max_samples': [1.0],                                         # The proportion of samples  to draw from X to train each base estimator
#                       'max_features': [0.5]                                         # The proportion of features to draw from X to train each base estimator
#                      }

        
#     # Use GridSearchCV for hyperparameter tuning
#     print(f"Performing GridSearchCV")
#     grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
#     print(f"Fitting model")
#     grid_search.fit(X_train_resampled, y_train_label_resampled)

    
#     # Print the best hyperparameters
#     best_params = grid_search.best_params_
#     best_scores = grid_search.best_score_
#     print("Best Parameters:", best_params)
#     print("Best Scores:", best_scores)

#     # Evaluate the model with the best hyperparameters on the test set
#     clf = grid_search.best_estimator_
#     y_pred = clf.predict(X_test)

#     # Final cross validation
#     cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
#     print(f"Cross validation scores: {cross_val_score_result}")
#     print(f"Mean cross validation score: {cross_val_score_result.mean()}")
#     print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
#     bagging_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
#     bagging_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison


#     # Evaluate the model
#     Accuracy = accuracy_score(y_test_label, y_pred)
#     print(f"Accuracy on Test Set: {Accuracy}")

#     # of all the base_classifiers, check to see if this base_classifier provides the best accuracy
#     if (Accuracy > best_base_estimator_accuracy):
#         best_params_ensemble_bagging = best_params         #save best parameters for later comparison
#         best_base_estimator_name     = base_estimator      #save the name of the base_classifier that is currently the best
#         best_base_estimator_accuracy = Accuracy            #save the accuracy of the base_estimator that is currently the best
#         print(f"The best base_estimator so far is {best_base_estimator_name}, with accuracy of {best_base_estimator_accuracy}")
#     else:
#         print(f"This is not the best base estimator")
        
#     # call previously defined function to create confusion matrix
#     cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

#     # save results calculated for this model for later comparison to other models
#     accuracy_ensemble_bagging      = Accuracy
#     sensitivity_ensemble_bagging   = Sensitivity
#     specificity_ensemble_bagging   = Specificity
#     geometricmean_ensemble_bagging = GeometricMean
#     precision_ensemble_bagging     = Precision
#     recall_ensemble_bagging        = Recall
#     f1_ensemble_bagging            = F1

#     # show a running total of elapsed time for the entire notebook
#     show_elapsed_time() 


# # after testing all the final_estimators, display the best one
# print(f"After checking each base_estimator, the best base_estimator is {best_base_estimator_name}, with accuracy of {best_base_estimator_accuracy}, and best_params of {best_params}")

## Boosting Classifier

In this example:

LR, SVC, KNN, NB, MLP are individual base classifiers.

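An AdaBoostClassifier is created. Note that AdaBoostClassifier accepts only a single base estimator (the `estimator` parameter), not a list of classifiers; when that parameter is left as `None`, as in the code below, the default decision stump is boosted and the classifiers listed above are not used.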

The AdaBoost classifier is trained on the training set.

Predictions are made on the test set, and the performance of the AdaBoost classifier is evaluated.

You can adjust the parameters such as n_estimators and learning_rate based on your specific needs. Note that AdaBoost works best with weak learners, so base classifiers like decision trees with limited depth are commonly used.

The AdaBoostClassifier can use different base classifiers (weak learners) as its base estimator. The `estimator` parameter of the AdaBoostClassifier (named `base_estimator` before scikit-learn 1.2) allows you to specify the type of weak learner to use. If not specified, the default is a decision stump (DecisionTreeClassifier(max_depth=1)).

Using RandomForestClassifier as a base estimator for AdaBoostClassifier is generally not a common practice because AdaBoost is typically used with weak learners, and Random Forests are already ensemble methods that use multiple decision trees.

However, if you still want to experiment with this combination, you can specify RandomForestClassifier as a base_estimator in AdaBoostClassifier.

Keep in mind that using RandomForestClassifier as a base estimator for AdaBoost might not provide significant advantages, as Random Forests are already powerful ensemble models. AdaBoost is often more beneficial when combined with weak learners like shallow decision trees (stumps). It's recommended to experiment with different combinations and evaluate their performance on your specific dataset.

In [None]:
# Compatibility check for the AdaBoostClassifier cells below.
# scikit-learn 1.2.0 renamed AdaBoostClassifier's "base_estimator" parameter to "estimator";
# the old name is deprecated from 1.2.0 and scheduled for removal in 1.4.0.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# This cell inspects the installed version and warns when only the old parameter name is available.


# Report which scikit-learn version is installed
print("Currently installed scikit-learn version is:", sklearn.__version__)

# A default-constructed AdaBoostClassifier is enough to list the parameter names this version supports
clf = AdaBoostClassifier()
default_params = clf.get_params()
print(f"Default parameters are {default_params}")

# The two parameter names to look for
desired_parameter1 = 'base_estimator'  # legacy name
desired_parameter2 = 'estimator'       # current name

# Only an installation older than 1.2 exposes the legacy name without the current one
supported_params = clf.get_params()
only_legacy_name_supported = (desired_parameter1 in supported_params) and (desired_parameter2 not in supported_params)

if only_legacy_name_supported:
    print('\n')
    for advice in (
        f"WARNING: the '{desired_parameter1}' parameter exists, but the '{desired_parameter2}' parameter does not exist the AdaBoostClassifier.",
        f"The parameter 'base_estimator' was deprecated in favor of 'estimator' in sklearn 1.2.0, will be removed entirely in sklearn 1.4.0.",
    ):
        print(advice)
    print(f"Your currently installed version of scikit-learn is", sklearn.__version__)
    for advice in (
        f"You may wish to update your installed version of scikit-learn to a minimum of 1.2.0 so you can use the 'estimator__' parameter in the next cell.",
        f"If you are unable to update your installed version of scikit-learn, you will need to change 'estimator__' to 'base_estimator__' in the following cell for compatibility with your version of scikit-learn.",
        f"If you are     using Anaconda Navigator, you can upgrade with:  conda update conda, conda update scikit-learn",
        f"If you are not using Anaconda Navigator, you can upgrade with:  pip install --upgrade scikit-learn",
    ):
        print(advice)

In [None]:
# AdaBoostClassifier (boosting) evaluation
#
# NOTE: AdaBoostClassifier boosts a single estimator type; it cannot combine a list of different classifiers.
# With estimator=None it boosts its default weak learner, so the model trained in this cell does NOT use
# the tuned lr/nb/svm/knn/mlp classifiers printed below.  The earlier version of this cell assigned the list
# to clf.estimator_, but that attribute is reset by fit(), so the assignment had no effect and was removed;
# the model that gets trained is unchanged.

print(f"The following base classifiers have already been optimized:")
print('\n')
print(lr_clf)
print(nb_clf)
print(svm_clf)
print(knn_clf)
print(mlp_clf)
print('\n')

# Define multiple base classifiers
#base_classifiers = [
#    LogisticRegression(C=100, max_iter=100, penalty='l2', solver='liblinear'),
#    BernoulliNB(alpha=0.1),
#    SVC(kernel='linear', C=0.1),  # Support Vector Machine with linear kernel
#    KNeighborsClassifier(n_neighbors=10, weights='uniform'),
#    MLPClassifier(hidden_layer_sizes=[50,25], max_iter=800)
#]
# Kept so the notebook-level variable holds the same value as before; AdaBoost below does not consume it
base_classifiers = [lr_clf, nb_clf, svm_clf, knn_clf, mlp_clf]

# Create the AdaBoostClassifier; estimator=None selects the library's default weak learner
clf = AdaBoostClassifier(estimator=None, n_estimators=50, random_state=42)

# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)

# Predict on the test set
y_pred = clf.predict(X_test)

# Final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
boosting_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
boosting_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison

# Evaluate the model accuracy
Accuracy = accuracy_score(y_test_label, y_pred)
print(f"Final Accuracy on Test Set: {Accuracy}")
# This method of calculating accuracy generates an error with AdaBoostClassifier
#Accuracy = clf.score(X_test, y_test_label)
#print("Accuracy:", Accuracy)
#print('\n')

# call previously defined function to create confusion matrix
cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# save results calculated for this model for later comparison to other models
accuracy_ensemble_boosting      = Accuracy
sensitivity_ensemble_boosting   = Sensitivity
specificity_ensemble_boosting   = Specificity
geometricmean_ensemble_boosting = GeometricMean
precision_ensemble_boosting     = Precision
recall_ensemble_boosting        = Recall
f1_ensemble_boosting            = F1

# show a running total of elapsed time for the entire notebook
show_elapsed_time()

### Boosting hyperparameter optimization

In [None]:
# the following cell does not increase the accuracy (actually decreases accuracy by a small amount), 
# so the following cell has been commented out

In [None]:
# print(f"Performing hyperparameter optimization for AdaBoostClassifier")
# print(f"The following base classifiers have already been optimized:")
# print('\n')
# print(lr_clf)
# print(nb_clf)
# print(svm_clf)
# print(knn_clf)
# print(mlp_clf)
# print('\n')

# # Define multiple base classifiers
# #base_classifiers = [
# #    LogisticRegression(C=100, max_iter=100, penalty='l2', solver='liblinear'),
# #    BernoulliNB(alpha=0.1),
# #    SVC(kernel='linear', C=0.1),  # Support Vector Machine with linear kernel
# #    KNeighborsClassifier(n_neighbors=10, weights='uniform'),
# #    MLPClassifier(hidden_layer_sizes=[50,25], max_iter=800)
# #]
# base_classifiers = [lr_clf, nb_clf, svm_clf, knn_clf, mlp_clf]

# # Define the hyperparameters to tune for AdaBoostClassifier
# param_grid = {
#     'n_estimators': [50, 100, 200],               # Number of boosting rounds
#     'learning_rate': [0.01, 0.1, 1.0]             # Weight applied to each classifier
# }

# # Create the AdaBoostClassifier, setting estimator=None because we will add multiple base_classifiers in the next step
# clf = AdaBoostClassifier(estimator=None, random_state=42)

# # Set the base classifiers as the base_estimator
# clf.estimator_ = base_classifiers

# # Use GridSearchCV for hyperparameter tuning
# print(f"Performing GridSearchCV")
# grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
# print(f"Fitting model")
# grid_search.fit(X_train_resampled, y_train_label_resampled)

# # Validate on Test Set
# clf = grid_search.best_estimator_
# print(f"Found best_estimator_  {clf}")
# y_pred = clf.predict(X_test)

# # Final cross validation
# cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
# print(f"Cross validation scores: {cross_val_score_result}")
# print(f"Mean cross validation score: {cross_val_score_result.mean()}")
# print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# boosting_crossval_score_mean = cross_val_score_result.mean()  #save mean   crossval score in a variable for later comparison
# boosting_crossval_score_std  = cross_val_score_result.std()   #save stddev crossval score in a variable for later comparison


# # Evaluate the model accuracy
# Accuracy = accuracy_score(y_test_label, y_pred)
# print(f"Final Accuracy on Test Set: {Accuracy}")
# # This method of calculating accuracy generates an error with AdaBoostClassifier
# #Accuracy = clf.score(X_test, y_test_label)
# #print("Accuracy:", Accuracy)
# #print('\n')
    
# # call previously defined function to create confusion matrix
# cm, Accuracy, Sensitivity, Specificity, GeometricMean, Precision, Recall, F1 = visualize_confusion_matrix(y_test_label, y_pred)

# # save results calculated for this model for later comparison to other models
# accuracy_ensemble_boosting      = Accuracy
# sensitivity_ensemble_boosting   = Sensitivity
# specificity_ensemble_boosting   = Specificity
# geometricmean_ensemble_boosting = GeometricMean
# precision_ensemble_boosting     = Precision
# recall_ensemble_boosting        = Recall
# f1_ensemble_boosting            = F1

# # show a running total of elapsed time for the entire notebook
# show_elapsed_time() 

# Comparison of all models

In [None]:
# Text summary of test-set accuracy for every model trained in this notebook.
# Fixes the misspelling "optimimization" in the printed messages and replaces the copy-pasted
# print statements with a table-driven loop; the layout of the output is otherwise unchanged.

# One row per individual model:
#   (label, accuracy without tuning, accuracy with tuning, wording for 1st line, wording for 2nd line)
# The wording strings are pre-padded so the two lines of each pair stay column-aligned.
individual_model_accuracies = [
    ("LR",  accuracy_lr_undersampled_unoptimized,  accuracy_lr_undersampled_optimized,  "before",  "after "),
    ("DT",  accuracy_dt_undersampled_unoptimized,  accuracy_dt_undersampled_optimized,  "before",  "after "),
    ("DS",  accuracy_ds_undersampled_unoptimized,  accuracy_ds_undersampled_optimized,  "without", "with   "),
    ("RF",  accuracy_rf_undersampled_unoptimized,  accuracy_rf_undersampled_optimized,  "before",  "after "),
    ("NB",  accuracy_nb_undersampled_unoptimized,  accuracy_nb_undersampled_optimized,  "without", "with   "),
    ("SVM", accuracy_svm_undersampled_unoptimized, accuracy_svm_undersampled_optimized, "before",  "after "),
    ("KNN", accuracy_knn_undersampled_unoptimized, accuracy_knn_undersampled_optimized, "before",  "after "),
    ("MLP", accuracy_mlp_undersampled_unoptimized, accuracy_mlp_undersampled_optimized, "before",  "after "),
    ("GB",  accuracy_gb_undersampled_unoptimized,  accuracy_gb_undersampled_optimized,  "before",  "after "),
    ("XGB", accuracy_xgb_undersampled_unoptimized, accuracy_xgb_undersampled_optimized, "before",  "after "),
]

for model_label, acc_untuned, acc_tuned, wording_untuned, wording_tuned in individual_model_accuracies:
    print(f"{model_label:<3} accuracy on undersampled balanced data, {wording_untuned} hyperparameter optimization: {acc_untuned*100:.2f}%")
    print(f"{model_label:<3} accuracy on undersampled balanced data, {wording_tuned} hyperparameter optimization: {acc_tuned*100:.2f}%")
    print('\n')   # blank lines between models

# Ensemble methods only have a single (post-optimization) accuracy each
ensemble_accuracies = [
    ("voting",   accuracy_ensemble_voting),
    ("stacking", accuracy_ensemble_stacking),
    ("bagging",  accuracy_ensemble_bagging),
    ("boosting", accuracy_ensemble_boosting),
]

for ensemble_label, ensemble_acc in ensemble_accuracies:
    print(f"Ensemble {ensemble_label:<8} accuracy on undersampled balanced data, after hyperparameter optimization: {ensemble_acc*100:.2f}%")



In [None]:
#accuracy_lr_undersampled_optimized = 0.8740
#accuracy_nb_undersampled_optimized = 0.7679
#accuracy_svm_undersampled_optimized = 0.8777
#accuracy_knn_undersampled_optimized = 0.8803
#accuracy_mlp_undersampled_optimized = 0.8884
#accuracy_ensemble_voting = 0.8856
#accuracy_ensemble_stacking = 0.8930
#accuracy_ensemble_bagging = 0.9115
#accuracy_ensemble_boosting = 0.9458

In [None]:
# Vertical bar graph of test-set accuracy for the base classifiers and the ensemble classifiers.
# Base classifiers are drawn in light green, ensemble methods in dark green.

# (label, accuracy) pairs in plotting order: 5 base classifiers followed by 4 ensembles
model_accuracies = [
    ("LR",       accuracy_lr_undersampled_optimized),
    ("NB",       accuracy_nb_undersampled_optimized),
    ("SVM",      accuracy_svm_undersampled_optimized),
    ("KNN",      accuracy_knn_undersampled_optimized),
    ("MLP",      accuracy_mlp_undersampled_optimized),
    ("Voting",   accuracy_ensemble_voting),
    ("Stacking", accuracy_ensemble_stacking),
    ("Bagging",  accuracy_ensemble_bagging),
    ("Boosting", accuracy_ensemble_boosting),
]

# Show the values that will be used in the graph
print(f"The following accuracy values will be used for visualization:")
for model_label, model_accuracy in model_accuracies:
    print(f"   {model_label:<8} {model_accuracy:.4f}")

labels = [pair[0] for pair in model_accuracies]
values = [pair[1]*100 for pair in model_accuracies]   # plotted as percentages

# Wide figure so all nine bars and their annotations fit
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust the figsize as needed

bar_width = 0.6  # Adjust the width as needed
bar_positions = range(len(labels))

# Create a bar graph; the last 4 bars (ensembles) are darkgreen
bars = ax.bar(bar_positions, values, width=bar_width, color=['lightgreen']*5 + ['darkgreen']*4)

# Dynamically set y-axis limits with 5 points of padding on each side.
# (Previously this used min(values*100); on a Python list that repeats the list 100 times rather
#  than scaling it, so it only worked by accident.  `values` is already in percent.)
ax.set_ylim(min(values) - 5, max(values) + 5)

# Add labels and title
ax.set_xlabel('')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Model Accuracies for Edge-IIoTset2023 dataset')

# Set x-axis ticks and labels
ax.set_xticks(bar_positions)
ax.set_xticklabels(labels)

# Annotate each bar with its respective value, centred just above the bar
for bar, value in zip(bars, values):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, f'{value:.2f}%', ha='center', va='bottom')

# Display the bar graph
plt.show()

In [None]:
# Horizontal variant of the accuracy bar graph: one bar per classifier, accuracy (%) along the x-axis.
# Base classifiers are light green, ensemble methods are dark green.

import matplotlib.pyplot as plt

# Echo the raw accuracy values before plotting them
print(f"The following accuracy values will be used for visualization:")
print(f"   LR       {accuracy_lr_undersampled_optimized:.4f}")
print(f"   NB       {accuracy_nb_undersampled_optimized:.4f}")
print(f"   SVM      {accuracy_svm_undersampled_optimized:.4f}")
print(f"   KNN      {accuracy_knn_undersampled_optimized:.4f}")
print(f"   MLP      {accuracy_mlp_undersampled_optimized:.4f}")
print(f"   Voting   {accuracy_ensemble_voting:.4f}")
print(f"   Stacking {accuracy_ensemble_stacking:.4f}")
print(f"   Bagging  {accuracy_ensemble_bagging:.4f}")
print(f"   Boosting {accuracy_ensemble_boosting:.4f}")

# Classifier names and their accuracies expressed as percentages (same order in both lists)
labels = ["LR", "NB", "SVM", "KNN", "MLP", "Voting", "Stacking", "Bagging", "Boosting"]
values = [
    accuracy_lr_undersampled_optimized*100,
    accuracy_nb_undersampled_optimized*100,
    accuracy_svm_undersampled_optimized*100,
    accuracy_knn_undersampled_optimized*100,
    accuracy_mlp_undersampled_optimized*100,
    accuracy_ensemble_voting*100,
    accuracy_ensemble_stacking*100,
    accuracy_ensemble_bagging*100,
    accuracy_ensemble_boosting*100,
]

# Bar geometry and colours: 5 base classifiers followed by 4 ensembles
bar_height = 0.6
bar_positions = range(len(labels))
bar_colors = ['lightgreen']*5 + ['darkgreen']*4

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(bar_positions, values, height=bar_height, color=bar_colors)

# x-range follows the data, padded by 5 percentage points on both sides
ax.set_xlim(min(values) - 5, max(values) + 5)

# Titles and axis text
ax.set_title('Model Accuracies for Edge-IIoTset2023 dataset')
ax.set_xlabel('Accuracy (%)')
ax.set_ylabel('')

# One tick per bar, labelled with the classifier name
ax.set_yticks(bar_positions)
ax.set_yticklabels(labels)

# Write each percentage just to the right of the end of its bar
for rect, pct in zip(bars, values):
    text_x = rect.get_width() + 1
    text_y = rect.get_y() + rect.get_height() / 2
    ax.text(text_x, text_y, f'{pct:.2f}%', va='center', ha='left')

plt.show()


In [None]:
# Box plot comparing the cross-validation accuracy of each classifier.
#
# NOTE: this cell works from the saved mean and standard deviation of each classifier's
# cross-validation scores, not from the individual fold scores.  Each box is therefore drawn from
# 100 SYNTHETIC samples of a normal distribution with that mean / std-dev; it illustrates the spread
# and is not a plot of the raw fold results.
# The random generator is seeded so the figure is identical on every run (it previously used the
# unseeded global NumPy RNG, so the boxes changed on each execution).
# showmeans=True together with meanline=True draws the mean as a line across each box, and sym=''
# hides the outlier markers for a cleaner representation.

# (label, mean cross-validation score, std-dev of cross-validation scores)
crossval_summary = [
    ("LR",       lr_crossval_score_mean,       lr_crossval_score_std),
    ("NB",       nb_crossval_score_mean,       nb_crossval_score_std),
    ("SVM",      svm_crossval_score_mean,      svm_crossval_score_std),
    ("KNN",      knn_crossval_score_mean,      knn_crossval_score_std),
    ("MLP",      mlp_crossval_score_mean,      mlp_crossval_score_std),
    ("Voting",   voting_crossval_score_mean,   voting_crossval_score_std),
    ("Stacking", stacking_crossval_score_mean, stacking_crossval_score_std),
    ("Bagging",  bagging_crossval_score_mean,  bagging_crossval_score_std),
    ("Boosting", boosting_crossval_score_mean, boosting_crossval_score_std),
]

# Show the values that will be used in the graph
print(f"The following Mean and Standard Deviation values will be used for visualization:")
for model_label, score_mean, score_std in crossval_summary:
    print(f"   {model_label:<8} {score_mean:.4f}   {score_std:.4f}")

# Prepare the data (converted to percentages)
labels   = [row[0] for row in crossval_summary]
means    = [row[1]*100 for row in crossval_summary]
std_devs = [row[2]*100 for row in crossval_summary]

# Create the box plot
fig, ax = plt.subplots()

# Draw the synthetic samples from a seeded generator so the plot is reproducible
boxplot_rng = np.random.default_rng(42)
box_data = [boxplot_rng.normal(mean, std, 100) for mean, std in zip(means, std_devs)]
ax.boxplot(box_data, labels=labels, showmeans=True, meanline=True, sym='')

# Rotate the x-axis labels by 45 degrees to make them fit
plt.xticks(rotation=45, ha='right')

# Set labels and title
ax.set_xlabel('Classifiers')
ax.set_ylabel('Mean Accuracy % with Std-Dev')
ax.set_title('Classifier Performance for Edge-IIoT2023 dataset')

# Show the plot
plt.show()




In [None]:
# Combined chart: accuracy as bars and F1-score as a line, one entry per classifier.
#
# plt.subplots() creates a single Axes (ax1) that carries the accuracy bars and the left-hand y-axis.
# ax1.twinx() adds a second Axes (ax2) that shares the same x-axis and has its own y-axis on the right,
# which carries the F1-score line.  Both y-axes are given identical limits so that the bars and the
# line can be compared directly.

# Echo the raw values before plotting them
print(f"The following Accuracy and F1-Score values will be used for visualization:")
print(f"   LR       Accuracy:{accuracy_lr_undersampled_optimized:.4f}   F1-Score:{f1_lr_undersampled_optimized:.4f}")
print(f"   NB       Accuracy:{accuracy_nb_undersampled_optimized:.4f}   F1-Score:{f1_nb_undersampled_optimized:.4f}")
print(f"   SVM      Accuracy:{accuracy_svm_undersampled_optimized:.4f}   F1-Score:{f1_svm_undersampled_optimized:.4f}")
print(f"   KNN      Accuracy:{accuracy_knn_undersampled_optimized:.4f}   F1-Score:{f1_knn_undersampled_optimized:.4f}")
print(f"   MLP      Accuracy:{accuracy_mlp_undersampled_optimized:.4f}   F1-Score:{f1_mlp_undersampled_optimized:.4f}")
print(f"   Voting   Accuracy:{accuracy_ensemble_voting:.4f}   F1-Score:{f1_ensemble_voting:.4f}")
print(f"   Stacking Accuracy:{accuracy_ensemble_stacking:.4f}   F1-Score:{f1_ensemble_stacking:.4f}")
print(f"   Bagging  Accuracy:{accuracy_ensemble_bagging:.4f}   F1-Score:{f1_ensemble_bagging:.4f}")
print(f"   Boosting Accuracy:{accuracy_ensemble_boosting:.4f}   F1-Score:{f1_ensemble_boosting:.4f}")

# Classifier names plus both metrics expressed as percentages (same order in all three lists)
labels = ["LR", "NB", "SVM", "KNN","MLP", "Voting", "Stacking", "Bagging", "Boosting"]
accuracy = [
    accuracy_lr_undersampled_optimized*100,
    accuracy_nb_undersampled_optimized*100,
    accuracy_svm_undersampled_optimized*100,
    accuracy_knn_undersampled_optimized*100,
    accuracy_mlp_undersampled_optimized*100,
    accuracy_ensemble_voting*100,
    accuracy_ensemble_stacking*100,
    accuracy_ensemble_bagging*100,
    accuracy_ensemble_boosting*100,
]
f1score = [
    f1_lr_undersampled_optimized*100,
    f1_nb_undersampled_optimized*100,
    f1_svm_undersampled_optimized*100,
    f1_knn_undersampled_optimized*100,
    f1_mlp_undersampled_optimized*100,
    f1_ensemble_voting*100,
    f1_ensemble_stacking*100,
    f1_ensemble_bagging*100,
    f1_ensemble_boosting*100,
]

# Common y-range covering both series, padded by 5 percentage points on each side
all_points = accuracy + f1score
min_y = min(all_points)
max_y = max(all_points)

fig, ax1 = plt.subplots(figsize=(10, 5))

# Left y-axis: accuracy bars
ax1.bar(labels, accuracy, color='skyblue', label='Accuracy')
ax1.set_title('Classifier Accuracy and F1 Score for Edge-IIoT2023 dataset')
ax1.set_xlabel('Classifiers')
ax1.set_ylabel('Accuracy', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')

# Right y-axis: F1-score line drawn over the same x positions
ax2 = ax1.twinx()
ax2.plot(labels, f1score, marker='o', color='orange', label='F1 Score')
ax2.set_ylabel('F1 Score', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')

# Apply the common y-range to both axes
for axis in (ax1, ax2):
    axis.set_ylim(min_y-5, max_y+5)

# Each axis gets its own legend, placed in opposite corners
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.show()


In [None]:
# Line graph comparing every evaluation metric (accuracy, sensitivity, specificity, geometric mean,
# precision, recall, F1 score) across all classifiers; one line per metric.
#
# Fix: the printed table previously showed the SENSITIVITY value under the "Spec:" label for the
# four ensemble rows (Voting / Stacking / Bagging / Boosting).  Every row is now produced from the
# same table, so the specificity column always shows specificity.

# One row per classifier:
#   (label, accuracy, sensitivity, specificity, geometric mean, precision, recall, F1)
metric_rows = [
    ("LR",       accuracy_lr_undersampled_optimized,  sensitivity_lr_undersampled_optimized,  specificity_lr_undersampled_optimized,  geometricmean_lr_undersampled_optimized,  precision_lr_undersampled_optimized,  recall_lr_undersampled_optimized,  f1_lr_undersampled_optimized),
    ("NB",       accuracy_nb_undersampled_optimized,  sensitivity_nb_undersampled_optimized,  specificity_nb_undersampled_optimized,  geometricmean_nb_undersampled_optimized,  precision_nb_undersampled_optimized,  recall_nb_undersampled_optimized,  f1_nb_undersampled_optimized),
    ("SVM",      accuracy_svm_undersampled_optimized, sensitivity_svm_undersampled_optimized, specificity_svm_undersampled_optimized, geometricmean_svm_undersampled_optimized, precision_svm_undersampled_optimized, recall_svm_undersampled_optimized, f1_svm_undersampled_optimized),
    ("KNN",      accuracy_knn_undersampled_optimized, sensitivity_knn_undersampled_optimized, specificity_knn_undersampled_optimized, geometricmean_knn_undersampled_optimized, precision_knn_undersampled_optimized, recall_knn_undersampled_optimized, f1_knn_undersampled_optimized),
    ("MLP",      accuracy_mlp_undersampled_optimized, sensitivity_mlp_undersampled_optimized, specificity_mlp_undersampled_optimized, geometricmean_mlp_undersampled_optimized, precision_mlp_undersampled_optimized, recall_mlp_undersampled_optimized, f1_mlp_undersampled_optimized),
    ("Voting",   accuracy_ensemble_voting,   sensitivity_ensemble_voting,   specificity_ensemble_voting,   geometricmean_ensemble_voting,   precision_ensemble_voting,   recall_ensemble_voting,   f1_ensemble_voting),
    ("Stacking", accuracy_ensemble_stacking, sensitivity_ensemble_stacking, specificity_ensemble_stacking, geometricmean_ensemble_stacking, precision_ensemble_stacking, recall_ensemble_stacking, f1_ensemble_stacking),
    ("Bagging",  accuracy_ensemble_bagging,  sensitivity_ensemble_bagging,  specificity_ensemble_bagging,  geometricmean_ensemble_bagging,  precision_ensemble_bagging,  recall_ensemble_bagging,  f1_ensemble_bagging),
    ("Boosting", accuracy_ensemble_boosting, sensitivity_ensemble_boosting, specificity_ensemble_boosting, geometricmean_ensemble_boosting, precision_ensemble_boosting, recall_ensemble_boosting, f1_ensemble_boosting),
]

# Show the values that will be used in the graph
print(f"The following values will be used for visualization:")
for row_label, acc_val, sens_val, spec_val, gm_val, prec_val, rec_val, f1_val in metric_rows:
    print(f"   {row_label:<8} Acc:{acc_val:.4f}   Sens:{sens_val:.4f}  Spec:{spec_val:.4f}  GM:{gm_val:.4f}  Prec:{prec_val:.4f}  Rec:{rec_val:.4f}  F1:{f1_val:.4f}")

# Prepare the data: one list per metric, converted to percentages, in the same order as `labels`
labels        = [row[0] for row in metric_rows]
accuracy      = [row[1]*100 for row in metric_rows]
sensitivity   = [row[2]*100 for row in metric_rows]
specificity   = [row[3]*100 for row in metric_rows]
geometricmean = [row[4]*100 for row in metric_rows]
precision     = [row[5]*100 for row in metric_rows]
recall        = [row[6]*100 for row in metric_rows]
f1score       = [row[7]*100 for row in metric_rows]

# Create a line graph with multiple lines
plt.figure(figsize=(12, 6))

# Plot one line per metric
metric_series = [
    ('Accuracy', accuracy),
    ('Sensitivity', sensitivity),
    ('Specificity', specificity),
    ('Geometric Mean', geometricmean),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1score),
]
for series_name, series_values in metric_series:
    plt.plot(labels, series_values, label=series_name, marker='o')

# Set labels and title
plt.xlabel('Classifiers')
plt.ylabel('Percentage (%)')
plt.title('Performance Metrics for Classifiers for Edge-IIoT2023 dataset')
plt.legend()  # Show legend so each line can be identified

# Rotate x-axis labels for better visibility
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
# Line graph restricted to four metrics: Accuracy, Sensitivity, Specificity and Geometric Mean,
# compared across all classifiers (one line per metric).
#
# Fix: the printed table previously showed the SENSITIVITY value under the "Spec:" label for the
# four ensemble rows (Voting / Stacking / Bagging / Boosting).  Every row is now produced from the
# same table, so the specificity column always shows specificity.
# The precision / recall / F1 lists that this cell used to build but never plotted were removed.

# One row per classifier: (label, accuracy, sensitivity, specificity, geometric mean)
metric_rows = [
    ("LR",       accuracy_lr_undersampled_optimized,  sensitivity_lr_undersampled_optimized,  specificity_lr_undersampled_optimized,  geometricmean_lr_undersampled_optimized),
    ("NB",       accuracy_nb_undersampled_optimized,  sensitivity_nb_undersampled_optimized,  specificity_nb_undersampled_optimized,  geometricmean_nb_undersampled_optimized),
    ("SVM",      accuracy_svm_undersampled_optimized, sensitivity_svm_undersampled_optimized, specificity_svm_undersampled_optimized, geometricmean_svm_undersampled_optimized),
    ("KNN",      accuracy_knn_undersampled_optimized, sensitivity_knn_undersampled_optimized, specificity_knn_undersampled_optimized, geometricmean_knn_undersampled_optimized),
    ("MLP",      accuracy_mlp_undersampled_optimized, sensitivity_mlp_undersampled_optimized, specificity_mlp_undersampled_optimized, geometricmean_mlp_undersampled_optimized),
    ("Voting",   accuracy_ensemble_voting,   sensitivity_ensemble_voting,   specificity_ensemble_voting,   geometricmean_ensemble_voting),
    ("Stacking", accuracy_ensemble_stacking, sensitivity_ensemble_stacking, specificity_ensemble_stacking, geometricmean_ensemble_stacking),
    ("Bagging",  accuracy_ensemble_bagging,  sensitivity_ensemble_bagging,  specificity_ensemble_bagging,  geometricmean_ensemble_bagging),
    ("Boosting", accuracy_ensemble_boosting, sensitivity_ensemble_boosting, specificity_ensemble_boosting, geometricmean_ensemble_boosting),
]

# Show the values that will be used in the graph
print(f"The following values will be used for visualization:")
for row_label, acc_val, sens_val, spec_val, gm_val in metric_rows:
    print(f"   {row_label:<8} Acc:{acc_val:.4f}   Sens:{sens_val:.4f}  Spec:{spec_val:.4f}  GM:{gm_val:.4f}")

# Prepare the data: one list per metric, converted to percentages, in the same order as `labels`
labels        = [row[0] for row in metric_rows]
accuracy      = [row[1]*100 for row in metric_rows]
sensitivity   = [row[2]*100 for row in metric_rows]
specificity   = [row[3]*100 for row in metric_rows]
geometricmean = [row[4]*100 for row in metric_rows]

# Create a line graph with multiple lines
plt.figure(figsize=(12, 6))

# Plot one line per metric
metric_series = [
    ('Accuracy', accuracy),
    ('Sensitivity', sensitivity),
    ('Specificity', specificity),
    ('Geometric Mean', geometricmean),
]
for series_name, series_values in metric_series:
    plt.plot(labels, series_values, label=series_name, marker='o')

# Set labels and title
plt.xlabel('Classifiers')
plt.ylabel('Percentage (%)')
plt.title('Performance Metrics for Classifiers for Edge-IIoT2023 dataset')
plt.legend()  # Show legend so each line can be identified

# Rotate x-axis labels for better visibility
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
# Line graph restricted to four metrics: Accuracy, Precision, Recall and F1 score,
# compared across all classifiers (one line per metric).
#
# The sensitivity / specificity / geometric-mean lists that this cell used to build but never
# plotted were removed, and the printed table and plotted lines now come from a single table.

# One row per classifier: (label, accuracy, precision, recall, F1)
metric_rows = [
    ("LR",       accuracy_lr_undersampled_optimized,  precision_lr_undersampled_optimized,  recall_lr_undersampled_optimized,  f1_lr_undersampled_optimized),
    ("NB",       accuracy_nb_undersampled_optimized,  precision_nb_undersampled_optimized,  recall_nb_undersampled_optimized,  f1_nb_undersampled_optimized),
    ("SVM",      accuracy_svm_undersampled_optimized, precision_svm_undersampled_optimized, recall_svm_undersampled_optimized, f1_svm_undersampled_optimized),
    ("KNN",      accuracy_knn_undersampled_optimized, precision_knn_undersampled_optimized, recall_knn_undersampled_optimized, f1_knn_undersampled_optimized),
    ("MLP",      accuracy_mlp_undersampled_optimized, precision_mlp_undersampled_optimized, recall_mlp_undersampled_optimized, f1_mlp_undersampled_optimized),
    ("Voting",   accuracy_ensemble_voting,   precision_ensemble_voting,   recall_ensemble_voting,   f1_ensemble_voting),
    ("Stacking", accuracy_ensemble_stacking, precision_ensemble_stacking, recall_ensemble_stacking, f1_ensemble_stacking),
    ("Bagging",  accuracy_ensemble_bagging,  precision_ensemble_bagging,  recall_ensemble_bagging,  f1_ensemble_bagging),
    ("Boosting", accuracy_ensemble_boosting, precision_ensemble_boosting, recall_ensemble_boosting, f1_ensemble_boosting),
]

# Show the values that will be used in the graph
print(f"The following values will be used for visualization:")
for row_label, acc_val, prec_val, rec_val, f1_val in metric_rows:
    print(f"   {row_label:<8} Acc:{acc_val:.4f}   Prec:{prec_val:.4f}  Rec:{rec_val:.4f}  F1:{f1_val:.4f}")

# Prepare the data: one list per metric, converted to percentages, in the same order as `labels`
labels    = [row[0] for row in metric_rows]
accuracy  = [row[1]*100 for row in metric_rows]
precision = [row[2]*100 for row in metric_rows]
recall    = [row[3]*100 for row in metric_rows]
f1score   = [row[4]*100 for row in metric_rows]

# Create a line graph with multiple lines
plt.figure(figsize=(12, 6))

# Plot one line per metric
metric_series = [
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1score),
]
for series_name, series_values in metric_series:
    plt.plot(labels, series_values, label=series_name, marker='o')

# Set labels and title
plt.xlabel('Classifiers')
plt.ylabel('Percentage (%)')
plt.title('Performance Metrics for Classifiers for Edge-IIoT2023 dataset')
plt.legend()  # Show legend so each line can be identified

# Rotate x-axis labels for better visibility
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
# Final cell: show a running total of elapsed time for the entire notebook.
# show_elapsed_time() is a helper defined in an earlier cell (not visible in this section).
show_elapsed_time() 