<a href="https://colab.research.google.com/github/njgeorge000158/Spam-Detection-with-Supervised-Machine-Learning-Models/blob/main/spam_detector_hyperparameters_optimization_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#*******************************************************************************************
 #
 #  File Name:  spam_detector_hyperparameters_optimization_colab.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, spam_detector_hyperparameters_optimization_colab.ipynb,
 #      reads a csv file, spam-data.csv, and uses Python and the scikit-learn module to find
 #      the best hyperparameters for supervised learning models (binary classification)
 #      that detect spam in e-mails. Here is a list of the models:
 #
 #      logistic regression
 #      decision tree
 #      random forest
 #      support vector machine
 #      k-nearest neighbor
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/22/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

# Mount Google Drive so the project's modules and its log/image folders are reachable.
from google.colab import drive
drive.mount('/content/gdrive/')

# Put the project folder first on the module search path so the project-local modules
# imported below (logx, pandasx, classificationsx, spam_detector_constants) resolve.
# The path is absolute, matching the mount point above, so the imports keep working
# even if the notebook's working directory is not /content.
import sys
sys.path.insert(0, '/content/gdrive/MyDrive/spam_detection')

# Set before the plotting libraries are imported, because the variable is read from
# the environment.
import os
os.environ['HV_DOC_HTML'] = 'true'

!apt-get update
!apt install firefox
!pip install -U geckodriver
!pip install -U dataframe_image
!pip install -U selenium
!pip install -U kaleido
!pip install -U hvplot
!pip install -U plotly
!pip install -U panel
!pip install -U bokeh
!pip install -U imblearn

import hvplot
import hvplot.pandas

import pandas as pd

import holoviews as hv
# Select Bokeh as the HoloViews plotting backend.
hv.extension('bokeh')

import logx
# Point the project's logx helper at the Drive folders used for logs and images.
# NOTE(review): these are relative paths, so they resolve against the current working
# directory -- presumably /content, given the Drive mount point above; confirm.
logx.set_logs_directory_path('./gdrive/MyDrive/spam_detection/logs')
logx.set_images_directory_path('./gdrive/MyDrive/spam_detection/images')

import pandasx
# Tell the project's pandasx helper that the notebook is running in Google Colab.
pandasx.set_google_colab(True)

import classificationsx
import logx
import pandasx
import spam_detector_constants

import copy
import pickle

import numpy as np
import pandas as pd

from IPython.display import clear_output

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

Mounted at /content/gdrive/
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [825 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,036 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,756 kB]
Hit

In [None]:
# Name of this notebook file.
CONSTANT_LOCAL_FILE_NAME = 'spam_detector_hyperparameters_optimization_colab.ipynb'


# Switch both logx modes off.
# NOTE(review): presumably this disables writing log text and image files -- confirm in logx.
logx.set_log_mode(False)

logx.set_image_mode(False)


# Mark the start of the run; the cell output shows "Program execution begins...".
logx.begin_program('spam_detector_hyperparameters_optimization_colab')

Program execution begins...



# <br> **Section 1: Extraction and Transformation**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [None]:
# Column name -> dtype mapping handed to pd.read_csv for the spam data set.
# Every frequency column and the average capital run length are floats; the two
# remaining capital-run-length columns and the 'spam' label are integers.
# Key order matches the column order of the CSV header.
data_type_dictionary = {
    # 48 word-frequency columns: 'word_freq_<token>'
    **{f'word_freq_{token}': float
       for token in (
           'make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
           'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses', 'free',
           'business', 'email', 'you', 'credit', 'your', 'font', '000', 'money',
           'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
           'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
           'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'conference',
       )},
    # 6 character-frequency columns: 'char_freq_<symbol>'
    **{f'char_freq_{symbol}': float
       for symbol in (';', '(', '[', '!', '$', '#')},
    # capital-letter run statistics and the target label
    'capital_run_length_average': float,
    'capital_run_length_longest': int,
    'capital_run_length_total': int,
    'spam': int,
}

# Read the spam CSV from the path defined in spam_detector_constants, forcing each
# column to the dtype listed in data_type_dictionary.
spam_dataframe = pd.read_csv(
    spam_detector_constants.CONSTANT_INPUT_FILE_PATH,
    dtype = data_type_dictionary,
)

logx.log_write_object(spam_dataframe)

## **1.2: Display Spam DataFrame**

In [None]:
# Show the spam DataFrame under the caption 'Table 1.2: Spam Data Table'; the rendering
# is done by the project's pandasx helper (defined outside this notebook).
pandasx.return_formatted_table(spam_dataframe, 'Table 1.2: Spam Data Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278,1
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028,1
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54,1
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112,1
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49,1
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257,1
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749,1


## **1.3: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, The Labels**

In [None]:
# Labels (y): the 'spam' column of the loaded DataFrame.
y_series = spam_dataframe['spam']

logx.log_write_object(y_series)

### **Review the Y Series**

In [None]:
# Convert the label Series to a one-column DataFrame and show it under the caption
# 'Table 1.3.1: Spam Target Series'.
pandasx.return_formatted_table(y_series.to_frame(), 'Table 1.3.1: Spam Target Series')

spam
1
1
1
1
1
1
1
1
1
1


### **Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [None]:
# Count the rows per label value to see how balanced the two classes are.
y_series.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [None]:
# Features (X): every column except the 'spam' label. `columns=` already names the
# axis being dropped from, so no additional `axis` argument is passed.
x_dataframe = spam_dataframe.drop(columns = 'spam')

logx.log_write_object(x_dataframe)

### **Review the X DataFrame**

In [None]:
# Show the features DataFrame under the caption 'Table 1.3.2: Spam Features DataFrame'.
pandasx.return_formatted_table(x_dataframe, 'Table 1.3.2: Spam Features DataFrame')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749


## **1.4: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [None]:
# Split features and labels into training and testing subsets. The split is seeded
# with CONSTANT_ML_RANDOM_STATE_1 so that re-running the notebook gives the same
# partition; the train/test proportions are scikit-learn's defaults.
(x_train_dataframe,
 x_test_dataframe,
 y_train_series,
 y_test_series) = train_test_split(
    x_dataframe,
    y_series,
    random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
)

In [None]:
# Pass each of the four split objects (train/test features, train/test labels) to the
# project's logx helper, one call per object.
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **1.5: Use the StandardScaler to Scale the X Variables**

### **Scale Training and Test Data as Numpy Arrays**

In [None]:
# Fit a StandardScaler on the training features and standardize them in one step.
# The result is kept as a plain array here; a later cell wraps it in a DataFrame
# with the original column labels and index.
x_train_scaled_nparray = StandardScaler().fit_transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [None]:
# Standardize the test features with the mean and standard deviation learned from the
# TRAINING features. Fitting a separate scaler on the test set would put the two splits
# on different scales and let test-set statistics influence the evaluation.
x_test_scaled_nparray = StandardScaler().fit(x_train_dataframe).transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

### **Create Scaled X Variable DataFrames**

In [None]:
# Wrap the scaled training array in a DataFrame that carries the same column labels
# and row index as the unscaled training features.
x_train_scaled_dataframe = pd.DataFrame(
    x_train_scaled_nparray,
    index = x_train_dataframe.index,
    columns = x_train_dataframe.columns,
)

logx.log_write_object(x_train_scaled_dataframe)

In [None]:
# Wrap the scaled test array in a DataFrame that carries the same column labels
# and row index as the unscaled test features.
x_test_scaled_dataframe = pd.DataFrame(
    x_test_scaled_nparray,
    index = x_test_dataframe.index,
    columns = x_test_dataframe.columns,
)

logx.log_write_object(x_test_scaled_dataframe)

### **Display Scaled Training and Testing Data**

In [None]:
# Show the scaled training features as Table 1.5.1.
pandasx.return_formatted_table(
    x_train_scaled_dataframe,
    'Table 1.5.1: Spam Scaled Features Training Data',
)

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [None]:
# Show the scaled test features as Table 1.5.2.
pandasx.return_formatted_table(
    x_test_scaled_dataframe,
    'Table 1.5.2: Spam Scaled Features Test Data',
)

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.02,-0.17,-0.0,-0.05,0.06,-0.02,-0.3,0.18,-0.01,-0.38,-0.29,0.23,-0.35,0.89,-0.2,-0.27,-0.36,-0.36,0.24,-0.18,2.26,-0.12,0.78,0.02,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.05,-0.19,-0.08,-0.13,-0.1,-0.44,0.05,-0.04,-0.19,0.0,-0.02,0.2,0.48
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.51,-0.18,-0.69,-0.12,-0.3,-0.26,-0.13,0.1,-0.22,1.23,0.37,0.62,-0.18,-0.15,-0.18,-0.15,0.68,-0.25,-0.33,-0.07,1.29,-0.21,-0.1,-0.16,-0.2,0.55,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.19,-0.3,-0.3
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,2.42,-0.29,0.85,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,0.99,1.11,0.14,3.68,0.77,1.27,1.72,2.2,-0.18,2.21,1.38,2.58,-0.33,-0.07,1.12,1.97,-0.1,-0.16,2.68,-0.14,0.26,-0.19,-0.08,-0.13,0.63,2.93,-0.13,-0.43,-0.32,-0.16,-0.11,-0.26,-0.2
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,0.31,-0.29,1.56,-0.35,-0.18,-0.2,-0.27,-0.36,0.46,-0.12,-0.18,-0.28,-0.12,-0.3,-0.26,0.66,1.26,0.05,1.7,0.54,0.88,2.64,5.09,-0.18,5.1,2.16,0.8,-0.33,-0.07,-0.18,3.03,-0.1,1.73,-0.2,-0.14,0.12,-0.19,-0.08,4.55,-0.17,1.06,-0.13,-0.43,-0.32,-0.16,0.0,-0.1,-0.21
-0.31,-0.17,0.04,-0.05,0.83,0.15,-0.3,-0.28,2.78,-0.38,-0.29,0.24,0.15,-0.18,-0.2,-0.27,0.43,-0.36,0.35,-0.18,0.1,-0.12,0.12,0.66,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,1.28,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.2,-0.13,0.2,-0.11,-0.03,-0.13,0.03,0.27
-0.31,-0.17,0.75,-0.05,-0.46,1.88,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,0.37,-0.36,-0.36,0.58,-0.18,-0.69,-0.12,1.61,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,1.3,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,0.23,0.24,1.58,0.64,-0.32,-0.16,0.42,0.74,0.02
0.76,-0.17,2.82,-0.05,0.99,-0.34,0.76,-0.28,2.01,1.61,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.73,2.13,-0.1,-0.12,-0.3,-0.26,-0.14,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,0.96,0.12,-0.16,23.74,9.53,1.94
-0.31,-0.17,-0.56,-0.05,3.85,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,2.64,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,1.33,1.02,-0.13,0.72,-0.32,-0.16,-0.01,-0.3,-0.46
0.76,-0.17,-0.56,-0.05,-0.46,0.81,-0.3,-0.28,-0.33,-0.38,-0.29,0.58,2.07,-0.18,-0.2,-0.27,1.45,-0.36,0.84,-0.18,1.14,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.17,-0.22,-0.38
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,0.68,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,2.74,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.22,-0.39,-0.49


# <br> **Section 2: Undersampled and Oversampled Spam Data**

## **2.1: Instantiate the Random Undersampler Instance**

In [None]:
# Balance the scaled training set by randomly undersampling it with imblearn's
# RandomUnderSampler (seeded for reproducibility).
x_train_scaled_undersampled_dataframe, y_train_undersampled_series = (
    RandomUnderSampler(
        random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
    )
    .fit_resample(x_train_scaled_dataframe, y_train_series)
)

In [None]:
# Pass the undersampled features and labels to the project's logx helper.
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.2: Instantiate the Random Oversampler Instance**

In [None]:
# Balance the scaled training set by randomly oversampling it with imblearn's
# RandomOverSampler (seeded for reproducibility).
x_train_scaled_oversampled_dataframe, y_train_oversampled_series = (
    RandomOverSampler(
        random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
    )
    .fit_resample(x_train_scaled_dataframe, y_train_series)
)

In [None]:
# Pass the OVERSAMPLED features and labels produced by the RandomOverSampler cell
# above to the project's logx helper (the undersampled pair has its own logging cell
# in section 2.1).
logx.log_write_object(x_train_scaled_oversampled_dataframe)

logx.log_write_object(y_train_oversampled_series)

## **2.3: Instantiate the Cluster Centroids Instance**

In [None]:
# Resample the scaled training set with imblearn's ClusterCentroids, using a KMeans
# estimator. The sampler and its KMeans estimator are seeded separately
# (CONSTANT_ML_RANDOM_STATE_1 and CONSTANT_ML_RANDOM_STATE_2 respectively).
x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series = (
    ClusterCentroids(
        estimator = KMeans(
            n_init = 'auto',
            random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_2,
        ),
        random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
    )
    .fit_resample(x_train_scaled_dataframe, y_train_series)
)

In [None]:
# Pass the ClusterCentroids-resampled features and labels to the project's logx helper.
logx.log_write_object(x_train_scaled_cluster_centroids_dataframe)

logx.log_write_object(y_train_cluster_centroids_series)

## **2.4: Instantiate the SMOTE Instance**

In [None]:
# Resample the scaled training set with imblearn's SMOTE (seeded; the sampling
# strategy is passed explicitly as 'auto').
x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series = (
    SMOTE(
        random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
        sampling_strategy = 'auto',
    )
    .fit_resample(x_train_scaled_dataframe, y_train_series)
)

In [None]:
# Pass the SMOTE-resampled features and labels to the project's logx helper.
logx.log_write_object(x_train_scaled_SMOTE_dataframe)

logx.log_write_object(y_train_SMOTE_series)

## **2.5: Instantiate the SMOTEENN Instance**

In [None]:
# Resample the scaled training set with imblearn's SMOTEENN combined sampler (seeded
# for reproducibility).
x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series = (
    SMOTEENN(
        random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
    )
    .fit_resample(x_train_scaled_dataframe, y_train_series)
)

In [None]:
# Pass the SMOTEENN-resampled features and labels to the project's logx helper.
logx.log_write_object(x_train_scaled_SMOTEENN_dataframe)

logx.log_write_object(y_train_SMOTEENN_series)

## **2.6: Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [None]:
# Class counts after random undersampling.
y_train_undersampled_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [None]:
# Class counts after random oversampling.
y_train_oversampled_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [None]:
# Class counts after ClusterCentroids resampling.
y_train_cluster_centroids_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [None]:
# Class counts after SMOTE resampling.
y_train_SMOTE_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [None]:
# Class counts after SMOTEENN resampling.
y_train_SMOTEENN_series.value_counts()

spam
1    1785
0    1653
Name: count, dtype: int64

## **2.7: Display Normalized Resampled Training and Testing Data**

In [None]:
# Show the undersampled, scaled training features as Table 2.7.1.
pandasx.return_formatted_table(
    x_train_scaled_undersampled_dataframe,
    'Table 2.7.1: Scaled Features Training Undersampled Data',
)

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,1.27,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,7.49,4.85,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,1.88,-0.29,-0.26,-0.32,0.57,-0.3,0.06,1.64,-0.17,-0.19,-0.32,-0.31,-0.35,0.76,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,1.06,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.32,3.18,-0.07,-0.11,0.23,-0.48,-0.19,-0.2,0.45,-0.1,-0.1,-0.18,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,3.86,-0.37,-0.3,0.71,3.46,-0.17,-0.19,-0.32,-0.31,-0.35,-0.29,-0.16,0.27,-0.12,-0.29,-0.21,-0.34,-0.3,0.12,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,6.21,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,3.12,-0.16,0.8,-0.12,-0.29,-0.21,-0.34,-0.3,0.31,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,0.62,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,1.34,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.09,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,1.43,1.8,0.33,1.44,1.47,1.75,2.03,2.53,-0.17,2.52,1.43,2.14,1.82,-0.06,2.03,2.34,-0.13,1.02,3.9,-0.12,0.66,-0.2,-0.07,-0.11,-0.16,1.11,1.67,-0.31,-0.3,-0.1,-0.1,-0.19,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,8.64,-0.18,-0.13,4.57,-0.21,-0.12,3.51,-0.2,-0.07,-0.11,-0.16,-0.48,8.23,-0.31,-0.3,-0.1,-0.1,-0.23,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,4.65,-0.3,-0.17,-0.19,-0.32,1.64,-0.35,-0.94,-0.16,0.07,-0.12,-0.29,-0.21,0.82,-0.3,-0.23,-0.23,-0.18,1.71,-0.16,-0.14,-0.17,-0.15,-0.19,2.09,-0.32,-0.06,1.99,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.05,-0.19,-0.02,-0.3,-0.1,-0.08,-0.19,-0.35
-0.35,0.47,2.51,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.27,-0.3,-0.17,-0.19,-0.32,-0.31,2.65,2.11,-0.16,2.5,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.01,-0.19,0.01,-0.3,-0.1,-0.11,-0.21,-0.39
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,0.09,-0.05,0.52,-0.36,-0.29,0.56,-0.32,1.7,-0.3,-0.63,-0.3,0.81,-0.19,-0.32,0.39,-0.35,-0.2,-0.16,-0.4,-0.12,-0.29,-0.21,0.91,0.44,-0.03,-0.23,-0.18,-0.22,-0.16,-0.14,0.41,-0.15,-0.19,-0.24,1.96,-0.06,-0.18,-0.18,-0.13,-0.18,1.25,0.38,0.04,-0.2,-0.07,-0.11,0.53,0.88,-0.19,-0.05,1.03,-0.1,-0.04,-0.06,0.06


In [None]:
# Show the oversampled, scaled training features as Table 2.7.2.
pandasx.return_formatted_table(
    x_train_scaled_oversampled_dataframe,
    'Table 2.7.2: Scaled Features Training Oversampled Data',
)

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [None]:
# Show the cluster-centroids, scaled training features as a captioned table.
pandasx.return_formatted_table(
    x_train_scaled_cluster_centroids_dataframe,
    'Table 2.7.3: Scaled Features Training Cluster Centroids Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,0.64,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.02,-0.23,0.57,-0.22,-0.16,-0.14,0.62,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.06,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.32,1.66,-0.31,-0.3,0.05,-0.04,-0.11,-0.13
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,4.9,4.39,1.02,7.22,7.21,8.61,9.61,11.83,-0.17,11.78,7.07,10.43,-0.32,-0.06,-0.18,11.09,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,3.47,-0.19,-0.31,-0.3,-0.1,-0.06,-0.2,-0.41
-0.21,-0.04,-0.52,-0.05,-0.41,-0.29,-0.29,-0.26,-0.26,-0.1,-0.3,0.23,-0.19,-0.17,-0.19,-0.3,-0.31,-0.28,0.4,-0.16,-0.11,-0.12,-0.29,-0.11,-0.32,-0.27,-0.06,-0.23,-0.18,0.04,-0.16,-0.06,-0.17,-0.06,-0.19,-0.24,-0.19,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,0.02,-0.21,-0.13,-0.07,-0.11,-0.13,-0.19,-0.12,-0.29,-0.27,-0.07,-0.11,-0.18,-0.22
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.04,1.58,-0.17,-0.19,-0.32,-0.31,-0.35,-0.29,-0.16,1.24,-0.12,-0.29,-0.21,0.4,0.35,0.12,0.81,0.85,1.01,1.2,1.53,-0.17,1.52,0.82,1.25,1.02,-0.06,1.2,1.39,-0.13,-0.18,2.35,0.76,0.3,-0.2,-0.07,-0.11,-0.16,0.56,-0.19,-0.31,-0.3,-0.1,-0.11,-0.2,-0.33
-0.35,-0.16,-0.56,-0.05,-0.46,1.21,-0.29,-0.26,-0.32,-0.37,-0.3,0.34,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.23,-0.16,-0.67,6.14,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,0.82,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,16.65,-0.48,-0.19,-0.23,-0.3,0.81,0.02,-0.05,-0.03
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.64,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,1.85,-0.06,-0.18,-0.18,5.06,-0.18,-0.21,-0.12,0.67,9.24,-0.07,-0.11,-0.16,0.44,-0.19,-0.31,-0.3,-0.1,-0.09,-0.18,-0.35
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,4.59,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,31.33,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,4.6,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,15.96,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,11.49,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.11,-0.23,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.39,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.78,-0.16,-0.67,-0.12,-0.29,-0.21,0.77,0.03,-0.23,0.81,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,0.31,0.5,1.02,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.0,-0.2,-0.07,-0.11,-0.16,-0.17,-0.19,-0.26,-0.3,-0.1,-0.09,-0.18,-0.18


In [None]:
# Show the SMOTE-resampled, scaled training features as a captioned table.
pandasx.return_formatted_table(
    x_train_scaled_SMOTE_dataframe,
    'Table 2.7.4: Scaled Features Training SMOTE Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [None]:
# Show the SMOTEENN-resampled, scaled training features as a captioned table.
# The caption previously read "SMOTEEN"; it now matches the technique name used
# by the dataframe variable and by the log messages in this notebook.
pandasx.return_formatted_table \
    (x_train_scaled_SMOTEENN_dataframe,
     'Table 2.7.5: Scaled Features Training SMOTEENN Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.79,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.38,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,1.34,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.11,-0.23,-0.42


# <br> **Section 3: Model Optimization**

## **3.1: Logistic Regression**

### **Original**

In [None]:
# Hyperparameter grid searched for every logistic regression variant.
# NOTE(review): scikit-learn rejects some solver/multi_class pairs (for example
# 'liblinear' with 'multinomial'); under GridSearchCV's default error_score those
# candidates are scored NaN instead of aborting the search -- confirm intended.
parameters_grid_dictionary \
    = {'class_weight': ['balanced', None],
       'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
       'multi_class': ['auto', 'ovr', 'multinomial']}

# Template search; GridSearchCV is left at its default cv and scoring settings.
lr_grid_search_model \
    = GridSearchCV \
        (LogisticRegression \
             (random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1,
              max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS),
         parameters_grid_dictionary)

# One unfitted search per resampling strategy. A deep copy is used because a
# shallow copy.copy would leave every search pointing at the same estimator
# instance and the same parameter-grid dictionary.
lr_undersampled_grid_search_model = copy.deepcopy(lr_grid_search_model)

lr_oversampled_grid_search_model = copy.deepcopy(lr_grid_search_model)

lr_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model = copy.deepcopy(lr_grid_search_model)

lr_SMOTE_grid_search_model = copy.deepcopy(lr_grid_search_model)

lr_SMOTEENN_grid_search_model = copy.deepcopy(lr_grid_search_model)

In [None]:
# Run the logistic regression grid search on the scaled training set.
lr_grid_search_model.fit(
    x_train_scaled_dataframe,
    y_train_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination;
# the ANSI codes \033[1m and \033[0m turn bold on and off.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The logistic regression model best accuracy score is {:.2f}%'
             .format(100 * lr_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(lr_grid_search_model.best_params_),
         '\033[0m']))

[1mThe logistic regression model best accuracy score is 92.52%

The optimal model hyperparameters are:
{'class_weight': None, 'multi_class': 'multinomial', 'solver': 'lbfgs'}[0m


### **Random Undersampling**

In [None]:
# Run the logistic regression grid search on the undersampled training set.
lr_undersampled_grid_search_model.fit(
    x_train_scaled_undersampled_dataframe,
    y_train_undersampled_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the undersampled search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The logistic regression model with random undersampling best accuracy score is {:.2f}%'
             .format(100 * lr_undersampled_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(lr_undersampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe logistic regression model with random undersampling best accuracy score is 92.21%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'liblinear'}[0m


### **Random Oversampling**

In [None]:
# Run the logistic regression grid search on the oversampled training set.
lr_oversampled_grid_search_model.fit(
    x_train_scaled_oversampled_dataframe,
    y_train_oversampled_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the oversampled search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The logistic regression model with random oversampling best accuracy score is {:.2f}%'
             .format(100 * lr_oversampled_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(lr_oversampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe logistic regression model with random oversampling best accuracy score is 92.41%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'saga'}[0m


### **Cluster Centroids**

In [None]:
# Run the logistic regression grid search on the cluster-centroids training set.
lr_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.fit(
    x_train_scaled_cluster_centroids_dataframe,
    y_train_cluster_centroids_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the cluster-centroids search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The logistic regression model with cluster centroid best accuracy score is {:.2f}%'
             .format(100 * lr_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(lr_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_params_),
         '\033[0m']))

[1mThe logistic regression model with cluster centroid best accuracy score is 92.32%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'multinomial', 'solver': 'sag'}[0m


### **SMOTE**

In [None]:
# Run the logistic regression grid search on the SMOTE training set.
lr_SMOTE_grid_search_model.fit(
    x_train_scaled_SMOTE_dataframe,
    y_train_SMOTE_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the SMOTE search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The logistic regression model with SMOTE best accuracy score is {:.2f}%'
             .format(100 * lr_SMOTE_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(lr_SMOTE_grid_search_model.best_params_),
         '\033[0m']))

[1mThe logistic regression model with SMOTE best accuracy score is 92.56%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'saga'}[0m


### **SMOTEENN**

In [None]:
# Run the logistic regression grid search on the SMOTEENN training set.
lr_SMOTEENN_grid_search_model.fit(
    x_train_scaled_SMOTEENN_dataframe,
    y_train_SMOTEENN_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the SMOTEENN search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The logistic regression model with SMOTEENN best accuracy score is {:.2f}%'
             .format(100 * lr_SMOTEENN_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(lr_SMOTEENN_grid_search_model.best_params_),
         '\033[0m']))

[1mThe logistic regression model with SMOTEENN best accuracy score is 97.82%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'lbfgs'}[0m


## **3.2: Decision Tree**

### **Original**

In [None]:
# Hyperparameter grid searched for every decision tree variant.
parameters_grid_dictionary \
    = {'criterion': ['gini', 'entropy', 'log_loss'],
       'splitter': ['best', 'random'],
       'class_weight': ['balanced', None]}

# Template search; GridSearchCV is left at its default cv and scoring settings.
dt_grid_search_model \
    = GridSearchCV \
        (DecisionTreeClassifier(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1),
         parameters_grid_dictionary)

# One unfitted search per resampling strategy. A deep copy is used because a
# shallow copy.copy would leave every search pointing at the same estimator
# instance and the same parameter-grid dictionary.
dt_undersampled_grid_search_model = copy.deepcopy(dt_grid_search_model)

dt_oversampled_grid_search_model = copy.deepcopy(dt_grid_search_model)

dt_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model = copy.deepcopy(dt_grid_search_model)

dt_SMOTE_grid_search_model = copy.deepcopy(dt_grid_search_model)

dt_SMOTEENN_grid_search_model = copy.deepcopy(dt_grid_search_model)

In [None]:
# Run the decision tree grid search on the scaled training set.
dt_grid_search_model.fit(
    x_train_scaled_dataframe,
    y_train_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination;
# the ANSI codes \033[1m and \033[0m turn bold on and off.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The decision tree model best accuracy score is {:.2f}%'
             .format(100 * dt_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(dt_grid_search_model.best_params_),
         '\033[0m']))

[1mThe decision tree model best accuracy score is 91.51%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Random Undersampling**

In [None]:
# Run the decision tree grid search on the undersampled training set.
dt_undersampled_grid_search_model.fit(
    x_train_scaled_undersampled_dataframe,
    y_train_undersampled_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the undersampled search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The decision tree model with random undersampling best accuracy score is {:.2f}%'
             .format(100 * dt_undersampled_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(dt_undersampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe decision tree model with random undersampling best accuracy score is 90.70%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Random Oversampling**

In [None]:
# Run the decision tree grid search on the oversampled training set.
dt_oversampled_grid_search_model.fit(
    x_train_scaled_oversampled_dataframe,
    y_train_oversampled_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the oversampled search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The decision tree model with random oversampling best accuracy score is {:.2f}%'
             .format(100 * dt_oversampled_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(dt_oversampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe decision tree model with random oversampling best accuracy score is 93.92%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Cluster Centroids**

In [None]:
# Run the decision tree grid search on the cluster-centroids training set.
dt_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.fit(
    x_train_scaled_cluster_centroids_dataframe,
    y_train_cluster_centroids_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the cluster-centroids search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The decision tree model with cluster centroid best accuracy score is {:.2f}%'
             .format(100 * dt_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(dt_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_params_),
         '\033[0m']))

[1mThe decision tree model with cluster centroid best accuracy score is 90.67%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


### **SMOTE**

In [None]:
# Run the decision tree grid search on the SMOTE training set.
dt_SMOTE_grid_search_model.fit(
    x_train_scaled_SMOTE_dataframe,
    y_train_SMOTE_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the SMOTE search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The decision tree model with SMOTE best accuracy score is {:.2f}%'
             .format(100 * dt_SMOTE_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(dt_SMOTE_grid_search_model.best_params_),
         '\033[0m']))

[1mThe decision tree model with SMOTE best accuracy score is 91.74%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'splitter': 'best'}[0m


### **SMOTEENN**

In [None]:
# Run the decision tree grid search on the SMOTEENN training set.
dt_SMOTEENN_grid_search_model.fit(
    x_train_scaled_SMOTEENN_dataframe,
    y_train_SMOTEENN_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the SMOTEENN search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The decision tree model with SMOTEENN best accuracy score is {:.2f}%'
             .format(100 * dt_SMOTEENN_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(dt_SMOTEENN_grid_search_model.best_params_),
         '\033[0m']))

[1mThe decision tree model with SMOTEENN best accuracy score is 96.22%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


## **3.3: Random Forest**

### **Original**

In [None]:
# Hyperparameter grid searched for every random forest variant.
parameters_grid_dictionary \
    = {'criterion': ['gini', 'entropy', 'log_loss'],
       'max_features': ['sqrt', 'log2', None],
       'class_weight': ['balanced', 'balanced_subsample', None]}

# Template search; GridSearchCV is left at its default cv and scoring settings.
rf_grid_search_model \
    = GridSearchCV \
        (RandomForestClassifier \
             (n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
              random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1),
         parameters_grid_dictionary)

# One unfitted search per resampling strategy. A deep copy is used because a
# shallow copy.copy would leave every search pointing at the same estimator
# instance and the same parameter-grid dictionary.
rf_undersampled_grid_search_model = copy.deepcopy(rf_grid_search_model)

rf_oversampled_grid_search_model = copy.deepcopy(rf_grid_search_model)

rf_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model = copy.deepcopy(rf_grid_search_model)

rf_SMOTE_grid_search_model = copy.deepcopy(rf_grid_search_model)

rf_SMOTEENN_grid_search_model = copy.deepcopy(rf_grid_search_model)

In [None]:
# Run the random forest grid search on the scaled training set.
rf_grid_search_model.fit(
    x_train_scaled_dataframe,
    y_train_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination;
# the ANSI codes \033[1m and \033[0m turn bold on and off.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The random forest model best accuracy score is {:.2f}%'
             .format(100 * rf_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(rf_grid_search_model.best_params_),
         '\033[0m']))

[1mThe random forest model best accuracy score is 95.07%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2'}[0m


### **Random Undersampling**

In [None]:
# Run the random forest grid search on the undersampled training set.
rf_undersampled_grid_search_model.fit(
    x_train_scaled_undersampled_dataframe,
    y_train_undersampled_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the undersampled search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The random forest model with random undersampling best accuracy score is {:.2f}%'
             .format(100 * rf_undersampled_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(rf_undersampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe random forest model with random undersampling best accuracy score is 94.71%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'gini', 'max_features': 'log2'}[0m


### **Random Oversampling**

In [None]:
# Run the random forest grid search on the oversampled training set.
rf_oversampled_grid_search_model.fit(
    x_train_scaled_oversampled_dataframe,
    y_train_oversampled_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the oversampled search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The random forest model with random oversampling best accuracy score is {:.2f}%'
             .format(100 * rf_oversampled_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(rf_oversampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe random forest model with random oversampling best accuracy score is 96.84%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'gini', 'max_features': 'log2'}[0m


### **Cluster Centroids**

In [None]:
# Run the random forest grid search on the cluster-centroids training set.
rf_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.fit(
    x_train_scaled_cluster_centroids_dataframe,
    y_train_cluster_centroids_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the cluster-centroids search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The random forest model with cluster centroid best accuracy score is {:.2f}%'
             .format(100 * rf_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(rf_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_params_),
         '\033[0m']))

[1mThe random forest model with cluster centroid best accuracy score is 94.49%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'max_features': 'log2'}[0m


### **SMOTE**

In [None]:
# Run the random forest grid search on the SMOTE training set.
rf_SMOTE_grid_search_model.fit(
    x_train_scaled_SMOTE_dataframe,
    y_train_SMOTE_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the SMOTE search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The random forest model with SMOTE best accuracy score is {:.2f}%'
             .format(100 * rf_SMOTE_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(rf_SMOTE_grid_search_model.best_params_),
         '\033[0m']))

[1mThe random forest model with SMOTE best accuracy score is 95.98%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2'}[0m


### **SMOTEENN**

In [None]:
# Run the random forest grid search on the SMOTEENN training set.
rf_SMOTEENN_grid_search_model.fit(
    x_train_scaled_SMOTEENN_dataframe,
    y_train_SMOTEENN_series)

# Wipe whatever the search printed into this cell's output.
clear_output()

In [None]:
# Report the best score (as a percentage) and the winning grid combination
# for the SMOTEENN search, wrapped in ANSI bold on/off codes.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The random forest model with SMOTEENN best accuracy score is {:.2f}%'
             .format(100 * rf_SMOTEENN_grid_search_model.best_score_),
         '\n\nThe optimal model hyperparameters are:\n',
         str(rf_SMOTEENN_grid_search_model.best_params_),
         '\033[0m']))

[1mThe random forest model with SMOTEENN best accuracy score is 98.81%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'max_features': 'log2'}[0m


## **3.4: Support Vector Machine (SVM)**

### **Original**

In [None]:
# Hyperparameter grid searched for every support vector machine variant.
# NOTE(review): the scikit-learn SVC documentation states that
# decision_function_shape is ignored for binary classification, so this entry
# doubles the number of candidates without changing the fitted models. It is
# kept here so best_params_ retains the same keys -- consider dropping it.
parameters_grid_dictionary \
    = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
       'gamma': ['scale', 'auto'],
       'class_weight': ['balanced', None],
       'decision_function_shape': ['ovo', 'ovr']}

# Template search; GridSearchCV is left at its default cv and scoring settings.
svm_grid_search_model \
    = GridSearchCV \
        (SVC \
             (probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
              random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1),
         parameters_grid_dictionary)

# One unfitted search per resampling strategy. A deep copy is used because a
# shallow copy.copy would leave every search pointing at the same estimator
# instance and the same parameter-grid dictionary.
svm_undersampled_grid_search_model = copy.deepcopy(svm_grid_search_model)

svm_oversampled_grid_search_model = copy.deepcopy(svm_grid_search_model)

svm_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model = copy.deepcopy(svm_grid_search_model)

svm_SMOTE_grid_search_model = copy.deepcopy(svm_grid_search_model)

svm_SMOTEENN_grid_search_model = copy.deepcopy(svm_grid_search_model)

In [None]:
# Run the SVM grid search on the original (non-resampled) training set, then
# clear the cell output.
svm_grid_search_model.fit(
    x_train_scaled_dataframe,
    y_train_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the baseline SVM search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The svm model best accuracy score is {:.2f}%'.format(
             svm_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(svm_grid_search_model.best_params_),
         '\033[0m']))

[1mThe svm model best accuracy score is 92.99%

The optimal model hyperparameters are:
{'class_weight': None, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


### **Random Undersampling**

In [None]:
# Run the SVM grid search for the random-undersampling variant, then clear
# the cell output.
svm_undersampled_grid_search_model.fit(
    x_train_scaled_undersampled_dataframe,
    y_train_undersampled_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the undersampled SVM search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The svm model with random undersampling best accuracy score is {:.2f}%'.format(
             svm_undersampled_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(svm_undersampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe svm model with random undersampling best accuracy score is 92.73%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}[0m


### **Random Oversampling**

In [None]:
# Run the SVM grid search for the random-oversampling variant, then clear
# the cell output.
svm_oversampled_grid_search_model.fit(
    x_train_scaled_oversampled_dataframe,
    y_train_oversampled_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the oversampled SVM search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The svm model with random oversampling best accuracy score is {:.2f}%'.format(
             svm_oversampled_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(svm_oversampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe svm model with random oversampling best accuracy score is 93.56%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


### **Cluster Centroids**

In [None]:
# Run the SVM grid search for the cluster-centroids variant, then clear the
# cell output.
svm_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.fit(
    x_train_scaled_cluster_centroids_dataframe,
    y_train_cluster_centroids_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the cluster-centroids SVM search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The svm model with cluster centroid best accuracy score is {:.2f}%'.format(
             svm_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(svm_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_params_),
         '\033[0m']))

[1mThe svm model with cluster centroid best accuracy score is 92.14%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}[0m


### **SMOTE**

In [None]:
# Run the SVM grid search for the SMOTE variant, then clear the cell output.
svm_SMOTE_grid_search_model.fit(
    x_train_scaled_SMOTE_dataframe,
    y_train_SMOTE_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the SMOTE SVM search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The svm model with SMOTE best accuracy score is {:.2f}%'.format(
             svm_SMOTE_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(svm_SMOTE_grid_search_model.best_params_),
         '\033[0m']))

[1mThe svm model with SMOTE best accuracy score is 93.54%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


### **SMOTEENN**

In [None]:
# Run the SVM grid search for the SMOTEENN variant, then clear the cell
# output.
svm_SMOTEENN_grid_search_model.fit(
    x_train_scaled_SMOTEENN_dataframe,
    y_train_SMOTEENN_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the SMOTEENN SVM search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The svm model with SMOTEENN best accuracy score is {:.2f}%'.format(
             svm_SMOTEENN_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(svm_SMOTEENN_grid_search_model.best_params_),
         '\033[0m']))

[1mThe svm model with SMOTEENN best accuracy score is 98.25%

The optimal model hyperparameters are:
{'class_weight': None, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


## **3.5: K-Nearest Neighbor (KNN)**

### **Original**

In [None]:
# Hyperparameter grid for the k-nearest-neighbor classifier.
#
# 'weights' lists only 'uniform' and 'distance': scikit-learn treats
# weights=None exactly like 'uniform', so also searching None re-fitted every
# uniform candidate a second time without adding a distinct configuration.
parameters_grid_dictionary \
    = {'n_neighbors': np.arange(5, 21, 1),
       'weights': ['uniform', 'distance'],
       'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
       'p': [1, 2]}

# Template grid search; the leaf size comes from the project constants module.
knn_grid_search_model \
    = GridSearchCV \
        (KNeighborsClassifier(leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE),
         parameters_grid_dictionary)

# One independent search object per resampling strategy.  copy.deepcopy is
# used instead of copy.copy because a shallow copy would make all six searches
# share the same estimator instance and the same parameter-grid dictionary.
knn_undersampled_grid_search_model = copy.deepcopy(knn_grid_search_model)

knn_oversampled_grid_search_model = copy.deepcopy(knn_grid_search_model)

knn_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model = copy.deepcopy(knn_grid_search_model)

knn_SMOTE_grid_search_model = copy.deepcopy(knn_grid_search_model)

knn_SMOTEENN_grid_search_model = copy.deepcopy(knn_grid_search_model)

In [None]:
# Run the KNN grid search on the original (non-resampled) training set, then
# clear the cell output.
knn_grid_search_model.fit(
    x_train_scaled_dataframe,
    y_train_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the baseline KNN search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The knn model best accuracy score is {:.2f}%'.format(
             knn_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(knn_grid_search_model.best_params_),
         '\033[0m']))

[1mThe knn model best accuracy score is 92.00%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}[0m


### **Random Undersampling**

In [None]:
# Run the KNN grid search for the random-undersampling variant, then clear
# the cell output.
knn_undersampled_grid_search_model.fit(
    x_train_scaled_undersampled_dataframe,
    y_train_undersampled_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the undersampled KNN search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The knn model with random undersampling best accuracy score is {:.2f}%'.format(
             knn_undersampled_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(knn_undersampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe knn model with random undersampling best accuracy score is 91.70%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 12, 'p': 1, 'weights': 'distance'}[0m


### **Random Oversampling**

In [None]:
# Run the KNN grid search for the random-oversampling variant, then clear
# the cell output.
knn_oversampled_grid_search_model.fit(
    x_train_scaled_oversampled_dataframe,
    y_train_oversampled_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the oversampled KNN search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The knn model with random oversampling best accuracy score is {:.2f}%'.format(
             knn_oversampled_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(knn_oversampled_grid_search_model.best_params_),
         '\033[0m']))

[1mThe knn model with random oversampling best accuracy score is 95.64%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 19, 'p': 1, 'weights': 'distance'}[0m


### **Cluster Centroids**

In [None]:
# Run the KNN grid search for the cluster-centroids variant, then clear the
# cell output.
knn_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.fit(
    x_train_scaled_cluster_centroids_dataframe,
    y_train_cluster_centroids_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the cluster-centroids KNN search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The knn model with cluster centroid best accuracy score is {:.2f}%'.format(
             knn_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(knn_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model.best_params_),
         '\033[0m']))

[1mThe knn model with cluster centroid best accuracy score is 90.26%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 6, 'p': 2, 'weights': 'distance'}[0m


### **SMOTE**

In [None]:
# Run the KNN grid search for the SMOTE variant, then clear the cell output.
knn_SMOTE_grid_search_model.fit(
    x_train_scaled_SMOTE_dataframe,
    y_train_SMOTE_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the SMOTE KNN search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The knn model with SMOTE best accuracy score is {:.2f}%'.format(
             knn_SMOTE_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(knn_SMOTE_grid_search_model.best_params_),
         '\033[0m']))

[1mThe knn model with SMOTE best accuracy score is 94.52%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 12, 'p': 1, 'weights': 'distance'}[0m


### **SMOTEENN**

In [None]:
# Run the KNN grid search for the SMOTEENN variant, then clear the cell
# output.
knn_SMOTEENN_grid_search_model.fit(
    x_train_scaled_SMOTEENN_dataframe,
    y_train_SMOTEENN_series)

clear_output()

In [None]:
# Print and log (in bold) the best grid-search score, as a percentage, and
# the hyperparameters that produced it for the SMOTEENN KNN search.
logx.print_and_log_text(
    ''.join(
        ['\033[1m',
         'The knn model with SMOTEENN best accuracy score is {:.2f}%'.format(
             knn_SMOTEENN_grid_search_model.best_score_ * 100),
         '\n\nThe optimal model hyperparameters are:\n',
         str(knn_SMOTEENN_grid_search_model.best_params_),
         '\033[0m']))

[1mThe knn model with SMOTEENN best accuracy score is 99.01%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 6, 'p': 2, 'weights': 'distance'}[0m


# <br> **Section 4: Save Models To Files**

## **4.1: Logistic Regression**

### **Original**

In [None]:
# Pickle lr_grid_search_model to its configured file path.  The with-block
# closes the file handle, and so flushes the data, as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_LR_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(lr_grid_search_model, model_file)

### **Random Undersampling**

In [None]:
# Pickle lr_undersampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_LR_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(lr_undersampled_grid_search_model, model_file)

### **Random Oversampling**

In [None]:
# Pickle lr_oversampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_LR_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(lr_oversampled_grid_search_model, model_file)

### **Cluster Centroids**

In [None]:
# Pickle lr_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model to its configured file
# path.  The with-block closes the file handle, and so flushes the data, as
# soon as the dump ends.
with open(spam_detector_constants.CONSTANT_LR_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(lr_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model, model_file)

### **SMOTE**

In [None]:
# Pickle lr_SMOTE_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_LR_SMOTE_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(lr_SMOTE_grid_search_model, model_file)

### **SMOTEENN**

In [None]:
# Pickle lr_SMOTEENN_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_LR_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(lr_SMOTEENN_grid_search_model, model_file)

## **4.2: Decision Tree**

### **Original**

In [None]:
# Pickle dt_grid_search_model to its configured file path.  The with-block
# closes the file handle, and so flushes the data, as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_DT_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(dt_grid_search_model, model_file)

### **Random Undersampling**

In [None]:
# Pickle dt_undersampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_DT_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(dt_undersampled_grid_search_model, model_file)

### **Random Oversampling**

In [None]:
# Pickle dt_oversampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_DT_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(dt_oversampled_grid_search_model, model_file)

### **Cluster Centroids**

In [None]:
# Pickle dt_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model to its configured file
# path.  The with-block closes the file handle, and so flushes the data, as
# soon as the dump ends.
with open(spam_detector_constants.CONSTANT_DT_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(dt_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model, model_file)

### **SMOTE**

In [None]:
# Pickle dt_SMOTE_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_DT_SMOTE_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(dt_SMOTE_grid_search_model, model_file)

### **SMOTEENN**

In [None]:
# Pickle dt_SMOTEENN_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_DT_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(dt_SMOTEENN_grid_search_model, model_file)

## **4.3: Random Forest**

### **Original**

In [None]:
# Pickle rf_grid_search_model to its configured file path.  The with-block
# closes the file handle, and so flushes the data, as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_RF_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(rf_grid_search_model, model_file)

### **Random Undersampling**

In [None]:
# Pickle rf_undersampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_RF_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(rf_undersampled_grid_search_model, model_file)

### **Random Oversampling**

In [None]:
# Pickle rf_oversampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_RF_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(rf_oversampled_grid_search_model, model_file)

### **Cluster Centroids**

In [None]:
# Pickle rf_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model to its configured file
# path.  The with-block closes the file handle, and so flushes the data, as
# soon as the dump ends.
with open(spam_detector_constants.CONSTANT_RF_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(rf_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model, model_file)

### **SMOTE**

In [None]:
# Pickle rf_SMOTE_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_RF_SMOTE_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(rf_SMOTE_grid_search_model, model_file)

### **SMOTEENN**

In [None]:
# Pickle rf_SMOTEENN_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_RF_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(rf_SMOTEENN_grid_search_model, model_file)

## **4.4: Support Vector Machine (SVM)**

### **Original**

In [None]:
# Pickle svm_grid_search_model to its configured file path.  The with-block
# closes the file handle, and so flushes the data, as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_SVM_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(svm_grid_search_model, model_file)

### **Random Undersampling**

In [None]:
# Pickle svm_undersampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_SVM_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(svm_undersampled_grid_search_model, model_file)

### **Random Oversampling**

In [None]:
# Pickle svm_oversampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_SVM_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(svm_oversampled_grid_search_model, model_file)

### **Cluster Centroids**

In [None]:
# Pickle svm_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model to its configured
# file path.  The with-block closes the file handle, and so flushes the data,
# as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_SVM_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(svm_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model, model_file)

### **SMOTE**

In [None]:
# Pickle svm_SMOTE_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_SVM_SMOTE_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(svm_SMOTE_grid_search_model, model_file)

### **SMOTEENN**

In [None]:
# Pickle svm_SMOTEENN_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_SVM_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(svm_SMOTEENN_grid_search_model, model_file)

## **4.5: K-Nearest Neighbor (KNN)**

### **Original**

In [None]:
# Pickle knn_grid_search_model to its configured file path.  The with-block
# closes the file handle, and so flushes the data, as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_KNN_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(knn_grid_search_model, model_file)

### **Random Undersampling**

In [None]:
# Pickle knn_undersampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_KNN_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(knn_undersampled_grid_search_model, model_file)

### **Random Oversampling**

In [None]:
# Pickle knn_oversampled_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_KNN_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(knn_oversampled_grid_search_model, model_file)

### **Cluster Centroids**

In [None]:
# Pickle knn_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model to its configured
# file path.  The with-block closes the file handle, and so flushes the data,
# as soon as the dump ends.
with open(spam_detector_constants.CONSTANT_KNN_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(knn_cluster_CLUSTER_CENTROIDS_GRID_SEARCH_model, model_file)

### **SMOTE**

In [None]:
# Pickle knn_SMOTE_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_KNN_SMOTE_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(knn_SMOTE_grid_search_model, model_file)

### **SMOTEENN**

In [None]:
# Pickle knn_SMOTEENN_grid_search_model to its configured file path.  The
# with-block closes the file handle, and so flushes the data, as soon as the
# dump ends.
with open(spam_detector_constants.CONSTANT_KNN_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH,
          'wb') as model_file:
    pickle.dump(knn_SMOTEENN_grid_search_model, model_file)

In [None]:
# logx.end_program()