<a href="https://colab.research.google.com/github/njgeorge000158/Spam-Detection-with-Supervised-Machine-Learning-Models/blob/main/spam_detector_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#*******************************************************************************************
 #
 #  File Name:  spam_detector_colab.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, spam_detector_colab.ipynb, reads a csv file,
 #      spam-data.csv and uses Python and scikit-learn module to find the best
 #      supervised learning model (binary classification) for detecting spam in
 #      e-mails. Here is a list of the models:
 #
 #      logistic regression
 #      decision tree
 #      random forest
 #      support vector machine
 #      k-nearest neighbor
 #      gaussian naive bayes
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/22/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

from google.colab import drive
drive.mount('/content/gdrive/')

import sys
sys.path.insert(0,'./gdrive/MyDrive/spam_detection')

import os
os.environ['HV_DOC_HTML'] = 'true'

!apt-get update
!apt install firefox
!pip install -U geckodriver
!pip install -U dataframe_image
!pip install -U selenium
!pip install -U kaleido
!pip install -U hvplot
!pip install -U plotly
!pip install -U panel
!pip install -U bokeh
!pip install -U imblearn

import hvplot
import hvplot.pandas

import pandas as pd

import holoviews as hv
hv.extension('bokeh')

import logx
logx.set_logs_directory_path('./gdrive/MyDrive/spam_detection/logs')
logx.set_images_directory_path('./gdrive/MyDrive/spam_detection/images')
logx.set_models_directory_path('./gdrive/MyDrive/spam_detection/models')
logx.create_directory(logx.MODELS_DIRECTORY_PATH)

import pandasx
pandasx.set_google_colab(True)

import classificationsx
import logx
import pandasx
import spam_detector_constants

import copy
import pickle

import numpy as np
import pandas as pd

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

Mounted at /content/gdrive/
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,077 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,756 kB]
Get:13 https://develo

In [3]:
CONSTANT_LOCAL_FILE_NAME = 'spam_detector_colab.ipynb'


logx.set_log_mode(False)

logx.set_image_mode(False)


logx.begin_program('spam_detector_colab')

Program execution begins...



# <br> **Section 1: Extraction and Transformation**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [4]:
data_type_dictionary \
    = {'word_freq_make': float,
       'word_freq_address': float,
       'word_freq_all': float,
       'word_freq_3d': float,
       'word_freq_our': float,
       'word_freq_over': float,
       'word_freq_remove': float,
       'word_freq_internet': float,
       'word_freq_order': float,
       'word_freq_mail': float,
       'word_freq_receive': float,
       'word_freq_will': float,
       'word_freq_people': float,
       'word_freq_report': float,
       'word_freq_addresses': float,
       'word_freq_free': float,
       'word_freq_business': float,
       'word_freq_email': float,
       'word_freq_you': float,
       'word_freq_credit': float,
       'word_freq_your': float,
       'word_freq_font': float,
       'word_freq_000': float,
       'word_freq_money': float,
       'word_freq_hp': float,
       'word_freq_hpl': float,
       'word_freq_george': float,
       'word_freq_650': float,
       'word_freq_lab': float,
       'word_freq_labs': float,
       'word_freq_telnet': float,
       'word_freq_857': float,
       'word_freq_data': float,
       'word_freq_415': float,
       'word_freq_85': float,
       'word_freq_technology': float,
       'word_freq_1999': float,
       'word_freq_parts': float,
       'word_freq_pm': float,
       'word_freq_direct': float,
       'word_freq_cs': float,
       'word_freq_meeting': float,
       'word_freq_original': float,
       'word_freq_project': float,
       'word_freq_re': float,
       'word_freq_edu': float,
       'word_freq_table': float,
       'word_freq_conference': float,
       'char_freq_;': float,
       'char_freq_(': float,
       'char_freq_[': float,
       'char_freq_!': float,
       'char_freq_$': float,
       'char_freq_#': float,
       'capital_run_length_average': float,
       'capital_run_length_longest': int,
       'capital_run_length_total': int,
       'spam': int}

spam_dataframe \
    = pd.read_csv(spam_detector_constants.CONSTANT_INPUT_FILE_PATH, dtype = data_type_dictionary)

logx.log_write_object(spam_dataframe)

## **1.2: Display Spam DataFrame**

In [5]:
pandasx.return_formatted_table(spam_dataframe, 'Table 1.2: Spam Data Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278,1
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028,1
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54,1
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112,1
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49,1
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257,1
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749,1


## **1.3: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, The Labels**

In [6]:
y_series = spam_dataframe['spam']

logx.log_write_object(y_series)

### **Review the Y Series**

In [7]:
pandasx.return_formatted_table(y_series.to_frame(), 'Table 1.3.1: Spam Target Series')

spam
1
1
1
1
1
1
1
1
1
1


### **Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [8]:
y_series.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [9]:
x_dataframe = spam_dataframe.drop(columns = 'spam', axis = 1)

logx.log_write_object(x_dataframe)

### **Review the X DataFrame**

In [10]:
pandasx.return_formatted_table(x_dataframe, 'Table 1.3.2: Spam Features DataFrame')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749


## **1.4: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [11]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split \
        (x_dataframe, y_series,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1)

In [12]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **1.5: Use the StandardScaler to Scale the X Variables**

### **Scale Training and Test Data as Numpy Arrays**

In [13]:
x_train_scaled_nparray = StandardScaler().fit_transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [14]:
x_test_scaled_nparray = StandardScaler().fit_transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

### **Create Scaled X Variable DataFrames**

In [15]:
x_train_scaled_dataframe \
    = pd.DataFrame \
        (x_train_scaled_nparray,
         columns = x_train_dataframe.columns,
         index = x_train_dataframe.index)

logx.log_write_object(x_train_scaled_dataframe)

In [16]:
x_test_scaled_dataframe \
    = pd.DataFrame \
        (x_test_scaled_nparray,
         columns = x_test_dataframe.columns,
         index = x_test_dataframe.index)

logx.log_write_object(x_test_scaled_dataframe)

### **Display Scaled Training and Testing Data**

In [17]:
pandasx.return_formatted_table \
    (x_train_scaled_dataframe,
     'Table 1.5.1: Spam Scaled Features Training Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [18]:
pandasx.return_formatted_table \
    (x_test_scaled_dataframe,
     'Table 1.5.2: Spam Scaled Features Test Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.02,-0.17,-0.0,-0.05,0.06,-0.02,-0.3,0.18,-0.01,-0.38,-0.29,0.23,-0.35,0.89,-0.2,-0.27,-0.36,-0.36,0.24,-0.18,2.26,-0.12,0.78,0.02,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.05,-0.19,-0.08,-0.13,-0.1,-0.44,0.05,-0.04,-0.19,0.0,-0.02,0.2,0.48
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.51,-0.18,-0.69,-0.12,-0.3,-0.26,-0.13,0.1,-0.22,1.23,0.37,0.62,-0.18,-0.15,-0.18,-0.15,0.68,-0.25,-0.33,-0.07,1.29,-0.21,-0.1,-0.16,-0.2,0.55,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.19,-0.3,-0.3
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,2.42,-0.29,0.85,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,0.99,1.11,0.14,3.68,0.77,1.27,1.72,2.2,-0.18,2.21,1.38,2.58,-0.33,-0.07,1.12,1.97,-0.1,-0.16,2.68,-0.14,0.26,-0.19,-0.08,-0.13,0.63,2.93,-0.13,-0.43,-0.32,-0.16,-0.11,-0.26,-0.2
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,0.31,-0.29,1.56,-0.35,-0.18,-0.2,-0.27,-0.36,0.46,-0.12,-0.18,-0.28,-0.12,-0.3,-0.26,0.66,1.26,0.05,1.7,0.54,0.88,2.64,5.09,-0.18,5.1,2.16,0.8,-0.33,-0.07,-0.18,3.03,-0.1,1.73,-0.2,-0.14,0.12,-0.19,-0.08,4.55,-0.17,1.06,-0.13,-0.43,-0.32,-0.16,0.0,-0.1,-0.21
-0.31,-0.17,0.04,-0.05,0.83,0.15,-0.3,-0.28,2.78,-0.38,-0.29,0.24,0.15,-0.18,-0.2,-0.27,0.43,-0.36,0.35,-0.18,0.1,-0.12,0.12,0.66,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,1.28,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.2,-0.13,0.2,-0.11,-0.03,-0.13,0.03,0.27
-0.31,-0.17,0.75,-0.05,-0.46,1.88,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,0.37,-0.36,-0.36,0.58,-0.18,-0.69,-0.12,1.61,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,1.3,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,0.23,0.24,1.58,0.64,-0.32,-0.16,0.42,0.74,0.02
0.76,-0.17,2.82,-0.05,0.99,-0.34,0.76,-0.28,2.01,1.61,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.73,2.13,-0.1,-0.12,-0.3,-0.26,-0.14,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,0.96,0.12,-0.16,23.74,9.53,1.94
-0.31,-0.17,-0.56,-0.05,3.85,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,2.64,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,1.33,1.02,-0.13,0.72,-0.32,-0.16,-0.01,-0.3,-0.46
0.76,-0.17,-0.56,-0.05,-0.46,0.81,-0.3,-0.28,-0.33,-0.38,-0.29,0.58,2.07,-0.18,-0.2,-0.27,1.45,-0.36,0.84,-0.18,1.14,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.17,-0.22,-0.38
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,0.68,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,2.74,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.22,-0.39,-0.49


# <br> **Section 2: Undersampled and OverSampled Spam Data**

## **2.1: Instantiate the Random Undersampler Instance**

In [19]:
x_train_scaled_undersampled_dataframe, y_train_undersampled_series \
    = RandomUnderSampler(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [20]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.2: Instantiate the Random Oversampler Instance**

In [21]:
x_train_scaled_oversampled_dataframe, y_train_oversampled_series \
    = RandomOverSampler(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [22]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.3: Instantiate the Cluster Centroids Instance**

In [23]:
x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series \
    = ClusterCentroids \
        (estimator \
             = KMeans(n_init = 'auto', random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_2),
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [24]:
logx.log_write_object(x_train_scaled_cluster_centroids_dataframe)

logx.log_write_object(y_train_cluster_centroids_series)

## **2.4: Instantiate the SMOTE Instance**

In [25]:
x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series \
    = SMOTE(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1, sampling_strategy = 'auto') \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [26]:
logx.log_write_object(x_train_scaled_SMOTE_dataframe)

logx.log_write_object(y_train_SMOTE_series)

## **2.5: Instantiate the SMOTEENN Instance**

In [27]:
x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series \
    = SMOTEENN(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [28]:
logx.log_write_object(x_train_scaled_SMOTEENN_dataframe)

logx.log_write_object(y_train_SMOTEENN_series)

## **2.6: Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [29]:
y_train_undersampled_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [30]:
y_train_oversampled_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [31]:
y_train_cluster_centroids_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [32]:
y_train_SMOTE_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [33]:
y_train_SMOTEENN_series.value_counts()

spam
1    1785
0    1653
Name: count, dtype: int64

## **2.7: Display Normalized Resampled Training and Testing Data**

In [34]:
pandasx.return_formatted_table \
    (x_train_scaled_undersampled_dataframe,
     'Table 2.7.1: Scaled Features Training Undersampled Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,1.27,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,7.49,4.85,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,1.88,-0.29,-0.26,-0.32,0.57,-0.3,0.06,1.64,-0.17,-0.19,-0.32,-0.31,-0.35,0.76,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,1.06,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.32,3.18,-0.07,-0.11,0.23,-0.48,-0.19,-0.2,0.45,-0.1,-0.1,-0.18,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,3.86,-0.37,-0.3,0.71,3.46,-0.17,-0.19,-0.32,-0.31,-0.35,-0.29,-0.16,0.27,-0.12,-0.29,-0.21,-0.34,-0.3,0.12,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,6.21,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,3.12,-0.16,0.8,-0.12,-0.29,-0.21,-0.34,-0.3,0.31,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,0.62,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,1.34,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.09,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,1.43,1.8,0.33,1.44,1.47,1.75,2.03,2.53,-0.17,2.52,1.43,2.14,1.82,-0.06,2.03,2.34,-0.13,1.02,3.9,-0.12,0.66,-0.2,-0.07,-0.11,-0.16,1.11,1.67,-0.31,-0.3,-0.1,-0.1,-0.19,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,8.64,-0.18,-0.13,4.57,-0.21,-0.12,3.51,-0.2,-0.07,-0.11,-0.16,-0.48,8.23,-0.31,-0.3,-0.1,-0.1,-0.23,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,4.65,-0.3,-0.17,-0.19,-0.32,1.64,-0.35,-0.94,-0.16,0.07,-0.12,-0.29,-0.21,0.82,-0.3,-0.23,-0.23,-0.18,1.71,-0.16,-0.14,-0.17,-0.15,-0.19,2.09,-0.32,-0.06,1.99,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.05,-0.19,-0.02,-0.3,-0.1,-0.08,-0.19,-0.35
-0.35,0.47,2.51,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.27,-0.3,-0.17,-0.19,-0.32,-0.31,2.65,2.11,-0.16,2.5,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.01,-0.19,0.01,-0.3,-0.1,-0.11,-0.21,-0.39
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,0.09,-0.05,0.52,-0.36,-0.29,0.56,-0.32,1.7,-0.3,-0.63,-0.3,0.81,-0.19,-0.32,0.39,-0.35,-0.2,-0.16,-0.4,-0.12,-0.29,-0.21,0.91,0.44,-0.03,-0.23,-0.18,-0.22,-0.16,-0.14,0.41,-0.15,-0.19,-0.24,1.96,-0.06,-0.18,-0.18,-0.13,-0.18,1.25,0.38,0.04,-0.2,-0.07,-0.11,0.53,0.88,-0.19,-0.05,1.03,-0.1,-0.04,-0.06,0.06


In [35]:
pandasx.return_formatted_table \
    (x_train_scaled_oversampled_dataframe,
     'Table 2.7.2: Scaled Features Training Oversampled Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [36]:
pandasx.return_formatted_table \
    (x_train_scaled_cluster_centroids_dataframe,
     'Table 2.7.3: Scaled Features Training Cluster Centroids Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,0.64,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.02,-0.23,0.57,-0.22,-0.16,-0.14,0.62,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.06,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.32,1.66,-0.31,-0.3,0.05,-0.04,-0.11,-0.13
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,4.9,4.39,1.02,7.22,7.21,8.61,9.61,11.83,-0.17,11.78,7.07,10.43,-0.32,-0.06,-0.18,11.09,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,3.47,-0.19,-0.31,-0.3,-0.1,-0.06,-0.2,-0.41
-0.21,-0.04,-0.52,-0.05,-0.41,-0.29,-0.29,-0.26,-0.26,-0.1,-0.3,0.23,-0.19,-0.17,-0.19,-0.3,-0.31,-0.28,0.4,-0.16,-0.11,-0.12,-0.29,-0.11,-0.32,-0.27,-0.06,-0.23,-0.18,0.04,-0.16,-0.06,-0.17,-0.06,-0.19,-0.24,-0.19,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,0.02,-0.21,-0.13,-0.07,-0.11,-0.13,-0.19,-0.12,-0.29,-0.27,-0.07,-0.11,-0.18,-0.22
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.04,1.58,-0.17,-0.19,-0.32,-0.31,-0.35,-0.29,-0.16,1.24,-0.12,-0.29,-0.21,0.4,0.35,0.12,0.81,0.85,1.01,1.2,1.53,-0.17,1.52,0.82,1.25,1.02,-0.06,1.2,1.39,-0.13,-0.18,2.35,0.76,0.3,-0.2,-0.07,-0.11,-0.16,0.56,-0.19,-0.31,-0.3,-0.1,-0.11,-0.2,-0.33
-0.35,-0.16,-0.56,-0.05,-0.46,1.21,-0.29,-0.26,-0.32,-0.37,-0.3,0.34,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.23,-0.16,-0.67,6.14,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,0.82,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,16.65,-0.48,-0.19,-0.23,-0.3,0.81,0.02,-0.05,-0.03
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.64,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,1.85,-0.06,-0.18,-0.18,5.06,-0.18,-0.21,-0.12,0.67,9.24,-0.07,-0.11,-0.16,0.44,-0.19,-0.31,-0.3,-0.1,-0.09,-0.18,-0.35
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,4.59,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,31.33,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,4.6,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,15.96,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,11.49,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.11,-0.23,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.39,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.78,-0.16,-0.67,-0.12,-0.29,-0.21,0.77,0.03,-0.23,0.81,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,0.31,0.5,1.02,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.0,-0.2,-0.07,-0.11,-0.16,-0.17,-0.19,-0.26,-0.3,-0.1,-0.09,-0.18,-0.18


In [37]:
pandasx.return_formatted_table \
    (x_train_scaled_SMOTE_dataframe,
     'Table 2.7.4: Scaled Features Training SMOTE Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [38]:
pandasx.return_formatted_table \
    (x_train_scaled_SMOTEENN_dataframe,
     'Table 2.7.5: Scaled Features Training SMOTEENN Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.79,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.38,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,1.34,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.11,-0.23,-0.42


# <br> **Section 3: Logistic Regression Models**

## **3.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [39]:
lr_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_LR_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_model \
    = LogisticRegression \
        (class_weight = lr_grid_search_model.best_params_['class_weight'],
         solver = lr_grid_search_model.best_params_['solver'],
         multi_class = lr_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [40]:
lr_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_undersampled_model \
    = LogisticRegression \
        (class_weight = lr_undersampled_grid_search_model.best_params_['class_weight'],
         solver = lr_undersampled_grid_search_model.best_params_['solver'],
         multi_class = lr_undersampled_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [41]:
lr_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_oversampled_model \
    = LogisticRegression \
        (class_weight = lr_oversampled_grid_search_model.best_params_['class_weight'],
         solver = lr_oversampled_grid_search_model.best_params_['solver'],
         multi_class = lr_oversampled_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [42]:
lr_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_cluster_centroids_model \
    = LogisticRegression \
        (class_weight = lr_cluster_centroids_grid_search_model.best_params_['class_weight'],
         solver = lr_cluster_centroids_grid_search_model.best_params_['solver'],
         multi_class = lr_cluster_centroids_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **SMOTE**

In [43]:
lr_SMOTE_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_SMOTE_model \
    = LogisticRegression \
        (class_weight = lr_SMOTE_grid_search_model.best_params_['class_weight'],
         solver = lr_SMOTE_grid_search_model.best_params_['solver'],
         multi_class = lr_SMOTE_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

### **SMOTEENN**

In [44]:
lr_SMOTEENN_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_SMOTEENN_model \
    = LogisticRegression \
        (class_weight = lr_SMOTEENN_grid_search_model.best_params_['class_weight'],
         solver = lr_SMOTEENN_grid_search_model.best_params_['solver'],
         multi_class = lr_SMOTEENN_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

## **3.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [45]:
accuracy_score_train_float \
    = logistic_regression_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe logistic regression model score from scaled training data is 92.81%[0m


In [46]:
accuracy_score_test_float \
    = logistic_regression_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe logistic regression model score from scaled test data is 90.88%[0m


### **Random Undersampling**

In [47]:
accuracy_score_train_float \
    = logistic_regression_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe logistic regression model score from undersampled scaled training data is 93.51%[0m


In [48]:
accuracy_score_test_float \
    = logistic_regression_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe logistic regression model score from undersampled scaled test data is 92.44%[0m


### **Random Oversampling**

In [49]:
accuracy_score_train_float \
    = logistic_regression_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe logistic regression model score from overersampled scaled training data is 93.19%[0m


In [50]:
accuracy_score_test_float \
    = logistic_regression_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe logistic regression model score from oversampled scaled test data is 91.57%[0m


### **Cluster Centroids**

In [51]:
accuracy_score_train_float \
    = logistic_regression_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe logistic regression model score from cluster centroids scaled training data is 89.48%[0m


In [52]:
accuracy_score_test_float \
    = logistic_regression_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe logistic regression model score from cluster centroids scaled test data is 91.31%[0m


### **SMOTE**

In [53]:
accuracy_score_train_float \
    = logistic_regression_SMOTE_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe logistic regression model score from SMOTE scaled training data is 93.10%[0m


In [54]:
accuracy_score_test_float \
    = logistic_regression_SMOTE_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe logistic regression model score from SMOTE scaled test data is 92.09%[0m


### **SMOTEEN**

In [55]:
accuracy_score_train_float \
    = logistic_regression_SMOTEENN_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from SMOTEENN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe logistic regression model score from SMOTEENN scaled training data is 93.33%[0m


In [56]:
accuracy_score_test_float \
    = logistic_regression_SMOTEENN_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from SMOTEENN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe logistic regression model score from SMOTEENN scaled test data is 92.01%[0m


## **3.3: Calculate Training and Test Predictions.**

### **Original**

In [57]:
lr_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_predictions_dataframe)

In [58]:
lr_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_predictions_dataframe)

### **Random Undersampling**

In [59]:
lr_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_undersampled_predictions_dataframe)

In [60]:
lr_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [61]:
lr_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_oversampled_predictions_dataframe)

In [62]:
lr_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [63]:
lr_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_cluster_centroids_predictions_dataframe)

In [64]:
lr_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [65]:
lr_train_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_SMOTE_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_SMOTE_predictions_dataframe)

In [66]:
lr_test_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_SMOTE_predictions_dataframe)

### **SMOTEEN**

In [67]:
lr_train_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_SMOTEENN_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_SMOTEENN_predictions_dataframe)

In [68]:
lr_test_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_SMOTEENN_predictions_dataframe)

# <br> **Section 4: Decision Tree Models**

## **4.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [69]:
dt_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_DT_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_model \
    = DecisionTreeClassifier \
        (criterion = dt_grid_search_model.best_params_['criterion'],
         splitter = dt_grid_search_model.best_params_['splitter'],
         class_weight = dt_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [70]:
dt_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_undersampled_model \
    = DecisionTreeClassifier \
        (criterion = dt_undersampled_grid_search_model.best_params_['criterion'],
         splitter = dt_undersampled_grid_search_model.best_params_['splitter'],
         class_weight = dt_undersampled_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [71]:
dt_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_oversampled_model \
    = DecisionTreeClassifier \
        (criterion = dt_oversampled_grid_search_model.best_params_['criterion'],
         splitter = dt_oversampled_grid_search_model.best_params_['splitter'],
         class_weight = dt_oversampled_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [72]:
dt_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_cluster_centroids_model \
    = DecisionTreeClassifier \
        (criterion = dt_cluster_centroids_grid_search_model.best_params_['criterion'],
         splitter = dt_cluster_centroids_grid_search_model.best_params_['splitter'],
         class_weight = dt_cluster_centroids_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **SMOTE**

In [73]:
dt_SMOTE_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_SMOTE_model \
    = DecisionTreeClassifier \
        (criterion = dt_SMOTE_grid_search_model.best_params_['criterion'],
         splitter = dt_SMOTE_grid_search_model.best_params_['splitter'],
         class_weight = dt_SMOTE_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

### **SMOTEENN**

In [74]:
dt_SMOTEENN_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_SMOTEENN_model \
    = DecisionTreeClassifier \
        (criterion = dt_SMOTEENN_grid_search_model.best_params_['criterion'],
         splitter = dt_SMOTEENN_grid_search_model.best_params_['splitter'],
         class_weight = dt_SMOTEENN_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

## **4.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [75]:
accuracy_score_train_float \
    = decision_tree_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe decision tree model score from scaled training data is 99.91%[0m


In [76]:
accuracy_score_test_float \
    = decision_tree_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe decision tree model score from scaled test data is 89.66%[0m


### **Random Undersampling**

In [77]:
accuracy_score_train_float \
    = decision_tree_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe decision tree model score from undersampled scaled training data is 97.54%[0m


In [78]:
accuracy_score_test_float \
    = decision_tree_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe decision tree model score from undersampled scaled test data is 89.66%[0m


### **Random Oversampling**

In [79]:
accuracy_score_train_float \
    = decision_tree_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe decision tree model score from overersampled scaled training data is 99.91%[0m


In [80]:
accuracy_score_test_float \
    = decision_tree_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe decision tree model score from oversampled scaled test data is 89.75%[0m


### **Cluster Centroids**

In [81]:
accuracy_score_train_float \
    = decision_tree_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe decision tree model score from cluster centroids scaled training data is 98.23%[0m


In [82]:
accuracy_score_test_float \
    = decision_tree_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe decision tree model score from cluster centroids scaled test data is 90.10%[0m


### **SMOTE**

In [83]:
accuracy_score_train_float \
    = decision_tree_SMOTE_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe decision tree model score from SMOTE scaled training data is 99.91%[0m


In [84]:
accuracy_score_test_float \
    = decision_tree_SMOTE_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe decision tree model score from SMOTE scaled test data is 90.96%[0m


### **SMOTEEN**

In [85]:
accuracy_score_train_float \
    = decision_tree_SMOTEENN_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from SMOTEENN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe decision tree model score from SMOTEENN scaled training data is 95.13%[0m


In [86]:
accuracy_score_test_float \
    = decision_tree_SMOTEENN_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from SMOTEENN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe decision tree model score from SMOTEENN scaled test data is 57.95%[0m


## **4.3: Calculate Training and Test Predictions.**

### **Original**

In [87]:
dt_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_predictions_dataframe)

In [88]:
dt_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_predictions_dataframe)

### **Random Undersampling**

In [89]:
dt_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_undersampled_predictions_dataframe)

In [90]:
dt_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [91]:
dt_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_oversampled_predictions_dataframe)

In [92]:
dt_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [93]:
dt_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_cluster_centroids_predictions_dataframe)

In [94]:
dt_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [95]:
dt_train_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_SMOTE_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_SMOTE_predictions_dataframe)

In [96]:
dt_test_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_SMOTE_predictions_dataframe)

### **SMOTEEN**

In [97]:
dt_train_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_SMOTEENN_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_SMOTEENN_predictions_dataframe)

In [98]:
dt_test_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_SMOTEENN_predictions_dataframe)

# <br> **Section 5: Random Forest Models**

## **5.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [99]:
rf_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_RF_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_model \
    = RandomForestClassifier \
        (criterion = rf_grid_search_model.best_params_['criterion'],
         max_features = rf_grid_search_model.best_params_['max_features'],
         class_weight = rf_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [100]:
rf_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_undersampled_model \
    = RandomForestClassifier \
        (criterion = rf_undersampled_grid_search_model.best_params_['criterion'],
         max_features = rf_undersampled_grid_search_model.best_params_['max_features'],
         class_weight = rf_undersampled_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [101]:
rf_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_oversampled_model \
    = RandomForestClassifier \
        (criterion = rf_oversampled_grid_search_model.best_params_['criterion'],
         max_features = rf_oversampled_grid_search_model.best_params_['max_features'],
         class_weight = rf_oversampled_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [102]:
rf_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_cluster_centroids_model \
    = RandomForestClassifier \
        (criterion = rf_cluster_centroids_grid_search_model.best_params_['criterion'],
         max_features = rf_cluster_centroids_grid_search_model.best_params_['max_features'],
         class_weight = rf_cluster_centroids_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **SMOTE**

In [103]:
rf_SMOTE_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_SMOTE_model \
    = RandomForestClassifier \
        (criterion = rf_SMOTE_grid_search_model.best_params_['criterion'],
         max_features = rf_SMOTE_grid_search_model.best_params_['max_features'],
         class_weight = rf_SMOTE_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

### **SMOTEENN**

In [104]:
rf_SMOTEENN_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_SMOTEENN_model \
    = RandomForestClassifier \
        (criterion = rf_SMOTEENN_grid_search_model.best_params_['criterion'],
         max_features = rf_SMOTEENN_grid_search_model.best_params_['max_features'],
         class_weight = rf_SMOTEENN_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

## **5.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [105]:
accuracy_score_train_float \
    = random_forest_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe random forest model score from scaled training data is 99.91%[0m


In [106]:
accuracy_score_test_float \
    = random_forest_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe random forest model score from scaled test data is 92.96%[0m


### **Random Undersampling**

In [107]:
accuracy_score_train_float \
    = random_forest_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe random forest model score from undersampled scaled training data is 99.07%[0m


In [108]:
accuracy_score_test_float \
    = random_forest_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe random forest model score from undersampled scaled test data is 94.18%[0m


### **Random Oversampling**

In [109]:
accuracy_score_train_float \
    = random_forest_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe random forest model score from overersampled scaled training data is 99.91%[0m


In [110]:
accuracy_score_test_float \
    = random_forest_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe random forest model score from oversampled scaled test data is 93.05%[0m


### **Cluster Centroids**

In [111]:
accuracy_score_train_float \
    = random_forest_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe random forest model score from cluster centroids scaled training data is 99.04%[0m


In [112]:
accuracy_score_test_float \
    = random_forest_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe random forest model score from cluster centroids scaled test data is 93.14%[0m


### **SMOTE**

In [113]:
accuracy_score_train_float \
    = random_forest_SMOTE_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe random forest model score from SMOTE scaled training data is 99.91%[0m


In [114]:
accuracy_score_test_float \
    = random_forest_SMOTE_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe random forest model score from SMOTE scaled test data is 93.74%[0m


### **SMOTEEN**

In [115]:
accuracy_score_train_float \
    = random_forest_SMOTEENN_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from SMOTEENN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe random forest model score from SMOTEENN scaled training data is 95.88%[0m


In [116]:
accuracy_score_test_float \
    = random_forest_SMOTEENN_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from SMOTEENN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe random forest model score from SMOTEENN scaled test data is 93.66%[0m


## **5.3: Calculate Training and Test Predictions.**

### **Original**

In [117]:
rf_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_predictions_dataframe)

In [118]:
rf_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_predictions_dataframe)

### **Random Undersampling**

In [119]:
rf_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_undersampled_predictions_dataframe)

In [120]:
rf_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [121]:
rf_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_oversampled_predictions_dataframe)

In [122]:
rf_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [123]:
rf_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_cluster_centroids_predictions_dataframe)

In [124]:
rf_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [125]:
rf_train_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_SMOTE_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_SMOTE_predictions_dataframe)

In [126]:
rf_test_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_SMOTE_predictions_dataframe)

### **SMOTEEN**

In [127]:
rf_train_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_SMOTEENN_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_SMOTEENN_predictions_dataframe)

In [128]:
rf_test_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_SMOTEENN_predictions_dataframe)

# <br> **Section 6: Support Vector Machine (SVM) Models**

## **6.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [129]:
svm_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_SVM_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_model \
    = SVC \
        (kernel = svm_grid_search_model.best_params_['kernel'],
         gamma = svm_grid_search_model.best_params_['gamma'],
         class_weight = svm_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [130]:
svm_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_undersampled_model \
    = SVC \
        (kernel = svm_undersampled_grid_search_model.best_params_['kernel'],
         gamma = svm_undersampled_grid_search_model.best_params_['gamma'],
         class_weight = svm_undersampled_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_undersampled_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [131]:
svm_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_oversampled_model \
    = SVC \
        (kernel = svm_oversampled_grid_search_model.best_params_['kernel'],
         gamma = svm_oversampled_grid_search_model.best_params_['gamma'],
         class_weight = svm_oversampled_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_oversampled_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [132]:
svm_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_cluster_centroids_model \
    = SVC \
        (kernel = svm_cluster_centroids_grid_search_model.best_params_['kernel'],
         gamma = svm_cluster_centroids_grid_search_model.best_params_['gamma'],
         class_weight = svm_cluster_centroids_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_cluster_centroids_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **SMOTE**

In [133]:
svm_SMOTE_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_SMOTE_model \
    = SVC \
        (kernel = svm_SMOTE_grid_search_model.best_params_['kernel'],
         gamma = svm_SMOTE_grid_search_model.best_params_['gamma'],
         class_weight = svm_SMOTE_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_SMOTE_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

### **SMOTEENN**

In [134]:
svm_SMOTEENN_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_SMOTEENN_model \
    = SVC \
        (kernel = svm_SMOTEENN_grid_search_model.best_params_['kernel'],
         gamma = svm_SMOTEENN_grid_search_model.best_params_['gamma'],
         class_weight = svm_SMOTEENN_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_SMOTEENN_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

## **6.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [135]:
accuracy_score_train_float \
    = svm_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe svm model score from scaled training data is 94.81%[0m


In [136]:
accuracy_score_test_float \
    = svm_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe svm model score from scaled test data is 92.53%[0m


### **Random Undersampling**

In [137]:
accuracy_score_train_float \
    = svm_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe svm model score from undersampled scaled training data is 93.68%[0m


In [138]:
accuracy_score_test_float \
    = svm_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe svm model score from undersampled scaled test data is 93.05%[0m


### **Random Oversampling**

In [139]:
accuracy_score_train_float \
    = svm_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe svm model score from overersampled scaled training data is 95.10%[0m


In [140]:
accuracy_score_test_float \
    = svm_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe svm model score from oversampled scaled test data is 92.44%[0m


### **Cluster Centroids**

In [141]:
accuracy_score_train_float \
    = svm_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe svm model score from cluster centroids scaled training data is 89.39%[0m


In [142]:
accuracy_score_test_float \
    = svm_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe svm model score from cluster centroids scaled test data is 91.75%[0m


### **SMOTE**

In [143]:
accuracy_score_train_float \
    = svm_SMOTE_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe svm model score from SMOTE scaled training data is 95.13%[0m


In [144]:
accuracy_score_test_float \
    = svm_SMOTE_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe svm model score from SMOTE scaled test data is 92.35%[0m


### **SMOTEEN**

In [145]:
accuracy_score_train_float \
    = svm_SMOTEENN_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from SMOTEENN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe svm model score from SMOTEENN scaled training data is 93.48%[0m


In [146]:
accuracy_score_test_float \
    = svm_SMOTEENN_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from SMOTEENN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe svm model score from SMOTEENN scaled test data is 92.44%[0m


## **6.3: Calculate Training and Test Predictions.**

### **Original**

In [147]:
svm_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_predictions_dataframe)

In [148]:
svm_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_predictions_dataframe)

### **Random Undersampling**

In [149]:
svm_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_undersampled_predictions_dataframe)

In [150]:
svm_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [151]:
svm_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_oversampled_predictions_dataframe)

In [152]:
svm_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [153]:
svm_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_cluster_centroids_predictions_dataframe)

In [154]:
svm_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [155]:
svm_train_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_SMOTE_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_SMOTE_predictions_dataframe)

In [156]:
svm_test_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_SMOTE_predictions_dataframe)

### **SMOTEEN**

In [157]:
svm_train_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_SMOTEENN_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_SMOTEENN_predictions_dataframe)

In [158]:
svm_test_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_SMOTEENN_predictions_dataframe)

# <br> **Section 7: K-Nearest Neighbor (KNN) Models**

## **7.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [159]:
knn_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_KNN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_grid_search_model.best_params_['n_neighbors'],
         weights = knn_grid_search_model.best_params_['weights'],
         algorithm = knn_grid_search_model.best_params_['algorithm'],
         p = knn_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [160]:
knn_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_undersampled_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_undersampled_grid_search_model.best_params_['n_neighbors'],
         weights = knn_undersampled_grid_search_model.best_params_['weights'],
         algorithm = knn_undersampled_grid_search_model.best_params_['algorithm'],
         p = knn_undersampled_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [161]:
knn_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_oversampled_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_oversampled_grid_search_model.best_params_['n_neighbors'],
         weights = knn_oversampled_grid_search_model.best_params_['weights'],
         algorithm = knn_oversampled_grid_search_model.best_params_['algorithm'],
         p = knn_oversampled_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [162]:
knn_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_cluster_centroids_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_cluster_centroids_grid_search_model.best_params_['n_neighbors'],
         weights = knn_cluster_centroids_grid_search_model.best_params_['weights'],
         algorithm = knn_cluster_centroids_grid_search_model.best_params_['algorithm'],
         p = knn_cluster_centroids_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **SMOTE**

In [163]:
knn_SMOTE_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_SMOTE_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_SMOTE_grid_search_model.best_params_['n_neighbors'],
         weights = knn_SMOTE_grid_search_model.best_params_['weights'],
         algorithm = knn_SMOTE_grid_search_model.best_params_['algorithm'],
         p = knn_SMOTE_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

### **SMOTEENN**

In [164]:
knn_SMOTEENN_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_SMOTEENN_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_SMOTEENN_grid_search_model.best_params_['n_neighbors'],
         weights = knn_SMOTEENN_grid_search_model.best_params_['weights'],
         algorithm = knn_SMOTEENN_grid_search_model.best_params_['algorithm'],
         p = knn_SMOTEENN_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

## **7.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [165]:
accuracy_score_train_float \
    = knn_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe knn model score from scaled training data is 99.91%[0m


In [166]:
accuracy_score_test_float \
    = knn_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe knn model score from scaled test data is 92.35%[0m


### **Random Undersampling**

In [167]:
accuracy_score_train_float \
    = knn_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe knn model score from undersampled scaled training data is 98.90%[0m


In [168]:
accuracy_score_test_float \
    = knn_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe knn model score from undersampled scaled test data is 91.57%[0m


### **Random Oversampling**

In [169]:
accuracy_score_train_float \
    = knn_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe knn model score from overersampled scaled training data is 99.91%[0m


In [170]:
accuracy_score_test_float \
    = knn_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe knn model score from oversampled scaled test data is 91.92%[0m


### **Cluster Centroids**

In [171]:
accuracy_score_train_float \
    = knn_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe knn model score from cluster centroids scaled training data is 95.07%[0m


In [172]:
accuracy_score_test_float \
    = knn_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe knn model score from cluster centroids scaled test data is 87.92%[0m


### **SMOTE**

In [173]:
accuracy_score_train_float \
    = knn_SMOTE_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe knn model score from SMOTE scaled training data is 99.91%[0m


In [174]:
accuracy_score_test_float \
    = knn_SMOTE_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe knn model score from SMOTE scaled test data is 92.62%[0m


### **SMOTEEN**

In [175]:
accuracy_score_train_float \
    = knn_SMOTEENN_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from SMOTEENN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe knn model score from SMOTEENN scaled training data is 91.65%[0m


In [176]:
accuracy_score_test_float \
    = knn_SMOTEENN_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from SMOTEENN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe knn model score from SMOTEENN scaled test data is 89.23%[0m


## **7.3: Calculate Training and Test Predictions.**

### **Original**

In [177]:
knn_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_predictions_dataframe)

In [178]:
knn_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_predictions_dataframe)

### **Random Undersampling**

In [179]:
knn_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_undersampled_predictions_dataframe)

In [180]:
knn_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [181]:
knn_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_oversampled_predictions_dataframe)

In [182]:
knn_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [183]:
knn_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_cluster_centroids_predictions_dataframe)

In [184]:
knn_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [185]:
knn_train_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_SMOTE_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_SMOTE_predictions_dataframe)

In [186]:
knn_test_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_SMOTE_predictions_dataframe)

### **SMOTEEN**

In [187]:
knn_train_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_SMOTEENN_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_SMOTEENN_predictions_dataframe)

In [188]:
knn_test_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_SMOTEENN_predictions_dataframe)

# <br> **Section 8: Gaussian Naive Bayes (GNB) Models**

## **8.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [189]:
gnb_model \
    = GaussianNB().fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [190]:
gnb_undersampled_model \
    = GaussianNB().fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [191]:
gnb_oversampled_model \
    = GaussianNB().fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [192]:
gnb_cluster_centroids_model \
    = GaussianNB().fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **SMOTE**

In [193]:
gnb_SMOTE_model \
    = GaussianNB().fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

### **SMOTEENN**

In [194]:
gnb_SMOTEENN_model \
    = GaussianNB().fit(x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

## **8.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [195]:
accuracy_score_train_float \
    = gnb_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe gnb model score from scaled training data is 81.62%[0m


In [196]:
accuracy_score_test_float \
    = gnb_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe gnb model score from scaled test data is 81.15%[0m


### **Random Undersampling**

In [197]:
accuracy_score_train_float \
    = gnb_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe gnb model score from undersampled scaled training data is 81.71%[0m


In [198]:
accuracy_score_test_float \
    = gnb_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe gnb model score from undersampled scaled test data is 81.15%[0m


### **Random Oversampling**

In [199]:
accuracy_score_train_float \
    = gnb_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe gnb model score from overersampled scaled training data is 81.25%[0m


In [200]:
accuracy_score_test_float \
    = gnb_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe gnb model score from oversampled scaled test data is 80.97%[0m


### **Cluster Centroids**

In [201]:
accuracy_score_train_float \
    = gnb_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe gnb model score from cluster centroids scaled training data is 81.28%[0m


In [202]:
accuracy_score_test_float \
    = gnb_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe gnb model score from cluster centroids scaled test data is 81.23%[0m


### **SMOTE**

In [203]:
accuracy_score_train_float \
    = gnb_SMOTE_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe gnb model score from SMOTE scaled training data is 81.91%[0m


In [204]:
accuracy_score_test_float \
    = gnb_SMOTE_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe gnb model score from SMOTE scaled test data is 81.49%[0m


### **SMOTEEN**

In [205]:
accuracy_score_train_float \
    = gnb_SMOTEENN_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from SMOTEENN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float)
     + '\033[0m')

[1mThe gnb model score from SMOTEENN scaled training data is 83.25%[0m


In [206]:
accuracy_score_test_float \
    = gnb_SMOTEENN_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from SMOTEENN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float)
     + '\033[0m')

[1mThe gnb model score from SMOTEENN scaled test data is 84.19%[0m


## **8.3: Calculate Training and Test Predictions.**

### **Original**

In [207]:
gnb_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_predictions_dataframe)

In [208]:
gnb_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_predictions_dataframe)

### **Random Undersampling**

In [209]:
gnb_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_undersampled_predictions_dataframe)

In [210]:
gnb_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [211]:
gnb_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_oversampled_predictions_dataframe)

In [212]:
gnb_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [213]:
gnb_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_cluster_centroids_predictions_dataframe)

In [214]:
gnb_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [215]:
gnb_train_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_SMOTE_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_SMOTE_predictions_dataframe)

In [216]:
gnb_test_SMOTE_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_SMOTE_predictions_dataframe)

### **SMOTEEN**

In [217]:
gnb_train_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_SMOTEENN_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_SMOTEENN_predictions_dataframe)

In [218]:
gnb_test_SMOTEENN_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_SMOTE_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_SMOTEENN_predictions_dataframe)

# <br> **Section 9: Evaluate Model Performance**

## **9.1: Logistic Regression**

### **Original**

In [219]:
logistic_regression_model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [220]:
lr_predictions_nparray = logistic_regression_model.predict(x_test_scaled_dataframe)

lr_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The balanced accuracy score for logistic regression from actual vs. test predictions is {:.2f}%' \
         .format(lr_balanced_accuracy_score_float)
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression from actual vs. test predictions is 90.88%[0m


In [221]:
lr_accuracy_score_float, \
lr_confusion_matrix_dataframe, \
lr_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         lr_predictions_nparray,
         'LOGISTIC REGRESSION MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary \
    = {'logistic_regression': [lr_accuracy_score_float * 100]}

model_performance_ranking_dictionary \
    = {'logistic_regression': lr_accuracy_score_float * 100}

[1mLOGISTIC REGRESSION MODEL
[0m
1) [1mOverall Accuracy Score: [0m89.67%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 666                  33
Actual Not Spam              72                 380

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.95      0.93       699
    not spam       0.92      0.84      0.88       452

    accuracy                           0.91      1151
   macro avg       0.91      0.90      0.90      1151
weighted avg       0.91      0.91      0.91      1151




### **Random Undersampling**

In [222]:
logistic_regression_undersampled_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [223]:
lr_undersampled_predictions_nparray \
    = logistic_regression_undersampled_model.predict(x_test_scaled_dataframe)

lr_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression undersampled from actual vs. test predictions is {:.2f}%' \
         .format(lr_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression undersampled from actual vs. test predictions is 92.44%[0m


In [224]:
lr_undersampled_accuracy_score_float, \
lr_undersampled_confusion_matrix_dataframe, \
lr_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         lr_undersampled_predictions_nparray,
         'LOGISTIC REGRESSION MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_undersampled'] \
    = lr_undersampled_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m92.02%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              45                 407

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.94      0.94       699
    not spam       0.91      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **Random Oversampling**

In [225]:
logistic_regression_oversampled_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [226]:
lr_oversampled_predictions_nparray \
    = logistic_regression_oversampled_model.predict(x_test_scaled_dataframe)

lr_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression oversampled from actual vs. test predictions is {:.2f}%' \
         .format(lr_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression oversampled from actual vs. test predictions is 91.57%[0m


In [227]:
lr_oversampled_accuracy_score_float, \
lr_oversampled_confusion_matrix_dataframe, \
lr_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         lr_oversampled_predictions_nparray,
         'LOGISTIC REGRESSION MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_oversampled'] \
    = lr_oversampled_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m91.22%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 649                  50
Actual Not Spam              47                 405

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.93      0.93       699
    not spam       0.89      0.90      0.89       452

    accuracy                           0.92      1151
   macro avg       0.91      0.91      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **Cluster Centroids**

In [228]:
logistic_regression_cluster_centroids_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'sag',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [229]:
lr_cluster_centroids_predictions_nparray \
    = logistic_regression_cluster_centroids_model.predict(x_test_scaled_dataframe)

lr_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(lr_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression cluster centroids from actual vs. test predictions is 91.31%[0m


In [230]:
lr_cluster_centroids_accuracy_score_float, \
lr_cluster_centroids_confusion_matrix_dataframe, \
lr_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         lr_cluster_centroids_predictions_nparray,
         'LOGISTIC REGRESSION MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_cluster_centroids'] \
    = lr_cluster_centroids_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m91.01%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 646                  53
Actual Not Spam              47                 405

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.92      0.93       699
    not spam       0.88      0.90      0.89       452

    accuracy                           0.91      1151
   macro avg       0.91      0.91      0.91      1151
weighted avg       0.91      0.91      0.91      1151




### **SMOTE**

In [231]:
logistic_regression_SMOTE_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [232]:
lr_SMOTE_predictions_nparray \
    = logistic_regression_SMOTE_model.predict(x_test_scaled_dataframe)

lr_SMOTE_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_SMOTE_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(lr_SMOTE_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression SMOTE from actual vs. test predictions is 92.09%[0m


In [233]:
lr_SMOTE_accuracy_score_float, \
lr_SMOTE_confusion_matrix_dataframe, \
lr_SMOTE_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         lr_SMOTE_predictions_nparray,
         'LOGISTIC REGRESSION MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_SMOTE_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_smote'] \
    = lr_SMOTE_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m91.69%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 654                  45
Actual Not Spam              46                 406

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.94      0.93       699
    not spam       0.90      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **SMOTEEN**

In [234]:
logistic_regression_SMOTEENN_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [235]:
lr_SMOTEENN_predictions_nparray \
    = logistic_regression_SMOTEENN_model.predict(x_test_scaled_dataframe)

lr_SMOTEENN_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_SMOTEENN_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression SMOTEENN from actual vs. test predictions is {:.2f}%' \
         .format(lr_SMOTEENN_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression SMOTEENN from actual vs. test predictions is 92.01%[0m


In [236]:
lr_SMOTEENN_accuracy_score_float, \
lr_SMOTEENN_confusion_matrix_dataframe, \
lr_SMOTEENN_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         lr_SMOTEENN_predictions_nparray,
         'LOGISTIC REGRESSION MODEL (SMOTEENN)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_SMOTEENN_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_smoteen'] \
    = lr_SMOTEENN_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (SMOTEENN)
[0m
1) [1mOverall Accuracy Score: [0m91.82%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 648                  51
Actual Not Spam              41                 411

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.93      0.93       699
    not spam       0.89      0.91      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




## **9.2: Decision Tree**

### **Original**

In [237]:
decision_tree_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 21,
 'splitter': 'best'}

In [238]:
dt_predictions_nparray \
    = decision_tree_model.predict(x_test_scaled_dataframe)

dt_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The balanced accuracy score for decision tree from actual vs. test predictions is {:.2f}%' \
         .format(dt_balanced_accuracy_score_float)
     + '\033[0m')

[1mThe balanced accuracy score for decision tree from actual vs. test predictions is 89.66%[0m


In [239]:
dt_accuracy_score_float, \
dt_confusion_matrix_dataframe, \
dt_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         dt_predictions_nparray,
         'DECISION TREE MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    = [dt_accuracy_score_float * 100]

model_performance_ranking_dictionary['decision_tree'] \
    = dt_accuracy_score_float * 100

[1mDECISION TREE MODEL
[0m
1) [1mOverall Accuracy Score: [0m87.77%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 675                  24
Actual Not Spam              95                 357

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.97      0.92       699
    not spam       0.94      0.79      0.86       452

    accuracy                           0.90      1151
   macro avg       0.91      0.88      0.89      1151
weighted avg       0.90      0.90      0.89      1151




### **Random Undersampling**

In [240]:
decision_tree_undersampled_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 21,
 'splitter': 'best'}

In [241]:
dt_undersampled_predictions_nparray \
    = decision_tree_undersampled_model.predict(x_test_scaled_dataframe)

dt_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree undersampled from actual vs. test predictions is {:.2f}%' \
         .format(dt_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree undersampled from actual vs. test predictions is 89.66%[0m


In [242]:
dt_undersampled_accuracy_score_float, \
dt_undersampled_confusion_matrix_dataframe, \
dt_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         dt_undersampled_predictions_nparray,
         'DECISION TREE MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_undersampling'] \
    = dt_undersampled_accuracy_score_float * 100

[1mDECISION TREE MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m88.01%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 669                  30
Actual Not Spam              89                 363

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.96      0.92       699
    not spam       0.92      0.80      0.86       452

    accuracy                           0.90      1151
   macro avg       0.90      0.88      0.89      1151
weighted avg       0.90      0.90      0.90      1151




### **Random Oversampling**

In [243]:
decision_tree_oversampled_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 21,
 'splitter': 'best'}

In [244]:
dt_oversampled_predictions_nparray \
    = decision_tree_oversampled_model.predict(x_test_scaled_dataframe)

dt_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree oversampled from actual vs. test predictions is {:.2f}%' \
         .format(dt_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree oversampled from actual vs. test predictions is 89.75%[0m


In [245]:
dt_oversampled_accuracy_score_float, \
dt_oversampled_confusion_matrix_dataframe, \
dt_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         dt_oversampled_predictions_nparray,
         'DECISION TREE MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_oversampling'] \
    = dt_oversampled_accuracy_score_float * 100

[1mDECISION TREE MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m88.04%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 671                  28
Actual Not Spam              90                 362

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.96      0.92       699
    not spam       0.93      0.80      0.86       452

    accuracy                           0.90      1151
   macro avg       0.90      0.88      0.89      1151
weighted avg       0.90      0.90      0.90      1151




### **Cluster Centroids**

In [246]:
decision_tree_cluster_centroids_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 21,
 'splitter': 'best'}

In [247]:
dt_cluster_centroids_predictions_nparray \
    = decision_tree_cluster_centroids_model.predict(x_test_scaled_dataframe)

dt_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(dt_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree cluster centroids from actual vs. test predictions is 90.10%[0m


In [248]:
dt_cluster_centroids_accuracy_score_float, \
dt_cluster_centroids_confusion_matrix_dataframe, \
dt_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         dt_cluster_centroids_predictions_nparray,
         'DECISION TREE MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_cluster_centroids'] \
    = dt_cluster_centroids_accuracy_score_float * 100

[1mDECISION TREE MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m88.84%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 662                  37
Actual Not Spam              77                 375

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.95      0.92       699
    not spam       0.91      0.83      0.87       452

    accuracy                           0.90      1151
   macro avg       0.90      0.89      0.89      1151
weighted avg       0.90      0.90      0.90      1151




### **SMOTE**

In [249]:
decision_tree_SMOTE_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 21,
 'splitter': 'best'}

In [250]:
dt_SMOTE_predictions_nparray \
    = decision_tree_SMOTE_model.predict(x_test_scaled_dataframe)

dt_SMOTE_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_SMOTE_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(dt_SMOTE_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree SMOTE from actual vs. test predictions is 90.96%[0m


In [251]:
dt_SMOTE_accuracy_score_float, \
dt_SMOTE_confusion_matrix_dataframe, \
dt_SMOTE_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         dt_SMOTE_predictions_nparray,
         'DECISION TREE MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_SMOTE_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_smote'] \
    = dt_SMOTE_accuracy_score_float * 100

[1mDECISION TREE MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m89.67%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 669                  30
Actual Not Spam              74                 378

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.96      0.93       699
    not spam       0.93      0.84      0.88       452

    accuracy                           0.91      1151
   macro avg       0.91      0.90      0.90      1151
weighted avg       0.91      0.91      0.91      1151




### **SMOTEEN**

In [252]:
decision_tree_SMOTEENN_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 21,
 'splitter': 'best'}

In [253]:
dt_SMOTEENN_predictions_nparray \
    = decision_tree_SMOTEENN_model.predict(x_test_scaled_dataframe)

dt_SMOTEENN_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_SMOTEENN_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree SMOTEENN from actual vs. test predictions is {:.2f}%' \
         .format(dt_SMOTEENN_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree SMOTEENN from actual vs. test predictions is 57.95%[0m


In [254]:
dt_SMOTEENN_accuracy_score_float, \
dt_SMOTEENN_confusion_matrix_dataframe, \
dt_SMOTEENN_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         dt_SMOTEENN_predictions_nparray,
         'DECISION TREE MODEL (SMOTEENN)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_SMOTEENN_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_smoteen'] \
    = dt_SMOTEENN_accuracy_score_float * 100

[1mDECISION TREE MODEL (SMOTEENN)
[0m
1) [1mOverall Accuracy Score: [0m63.97%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 251                 448
Actual Not Spam              36                 416

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.87      0.36      0.51       699
    not spam       0.48      0.92      0.63       452

    accuracy                           0.58      1151
   macro avg       0.68      0.64      0.57      1151
weighted avg       0.72      0.58      0.56      1151




## **9.3: Random Forest**

### **Original**

In [255]:
random_forest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [256]:
rt_predictions_nparray \
    = random_forest_model.predict(x_test_scaled_dataframe)

rf_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rt_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest from actual vs. test predictions is {:.2f}%' \
         .format(rf_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest from actual vs. test predictions is 92.96%[0m


In [257]:
rf_accuracy_score_float, rf_confusion_matrix_dataframe, rf_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         rt_predictions_nparray,
         'RANDOM FOREST MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    = [rf_accuracy_score_float * 100]

model_performance_ranking_dictionary['random_forest'] \
    = rf_accuracy_score_float * 100

[1mRANDOM FOREST MODEL
[0m
1) [1mOverall Accuracy Score: [0m91.35%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 691                   8
Actual Not Spam              73                 379

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.99      0.94       699
    not spam       0.98      0.84      0.90       452

    accuracy                           0.93      1151
   macro avg       0.94      0.91      0.92      1151
weighted avg       0.93      0.93      0.93      1151




### **Random Undersampling**

In [258]:
random_forest_undersampled_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [259]:
rt_undersampled_predictions_nparray \
    = random_forest_undersampled_model.predict(x_test_scaled_dataframe)

rf_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rt_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest undersampled from actual vs. test predictions is {:.2f}%' \
         .format(rf_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest undersampled from actual vs. test predictions is 94.18%[0m


In [260]:
rf_undersampled_accuracy_score_float, \
rf_undersampled_confusion_matrix_dataframe, \
rf_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         rt_undersampled_predictions_nparray,
         'RANDOM FOREST MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_undersampled'] \
    = rf_undersampled_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m93.02%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 688                  11
Actual Not Spam              56                 396

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.98      0.95       699
    not spam       0.97      0.88      0.92       452

    accuracy                           0.94      1151
   macro avg       0.95      0.93      0.94      1151
weighted avg       0.94      0.94      0.94      1151




### **Random Oversampling**

In [261]:
random_forest_oversampled_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [262]:
rt_oversampled_predictions_nparray \
    = random_forest_oversampled_model.predict(x_test_scaled_dataframe)

rf_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rt_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest oversampled from actual vs. test predictions is {:.2f}%' \
         .format(rf_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest oversampled from actual vs. test predictions is 93.05%[0m


In [263]:
rf_oversampled_accuracy_score_float, \
rf_oversampled_confusion_matrix_dataframe, \
rf_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         rt_oversampled_predictions_nparray,
         'RANDOM FOREST MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_oversampled'] \
    = rf_oversampled_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m91.62%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 687                  12
Actual Not Spam              68                 384

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.98      0.94       699
    not spam       0.97      0.85      0.91       452

    accuracy                           0.93      1151
   macro avg       0.94      0.92      0.93      1151
weighted avg       0.93      0.93      0.93      1151




### **Cluster Centroids**

In [264]:
random_forest_cluster_centroids_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [265]:
rf_cluster_centroids_predictions_nparray \
    = random_forest_cluster_centroids_model.predict(x_test_scaled_dataframe)

rf_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rf_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(rf_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest cluster centroids from actual vs. test predictions is 93.14%[0m


In [266]:
rf_cluster_centroids_accuracy_score_float, \
rf_cluster_centroids_confusion_matrix_dataframe, \
rf_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         rf_cluster_centroids_predictions_nparray,
         'RANDOM FOREST MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_cluster_centroids'] \
    = rf_cluster_centroids_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m91.57%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 691                   8
Actual Not Spam              71                 381

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.99      0.95       699
    not spam       0.98      0.84      0.91       452

    accuracy                           0.93      1151
   macro avg       0.94      0.92      0.93      1151
weighted avg       0.94      0.93      0.93      1151




### **SMOTE**

In [267]:
random_forest_SMOTE_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [268]:
rf_SMOTE_predictions_nparray \
    = random_forest_SMOTE_model.predict(x_test_scaled_dataframe)

rf_SMOTE_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rf_SMOTE_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(rf_SMOTE_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest SMOTE from actual vs. test predictions is 93.74%[0m


In [269]:
rf_SMOTE_accuracy_score_float, \
rf_SMOTE_confusion_matrix_dataframe, \
rf_SMOTE_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         rf_SMOTE_predictions_nparray,
         'RANDOM FOREST MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_SMOTE_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_smote'] \
    = rf_SMOTE_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m92.5%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 687                  12
Actual Not Spam              60                 392

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.98      0.95       699
    not spam       0.97      0.87      0.92       452

    accuracy                           0.94      1151
   macro avg       0.94      0.93      0.93      1151
weighted avg       0.94      0.94      0.94      1151




### **SMOTEEN**

In [270]:
random_forest_SMOTEENN_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [271]:
rf_SMOTEENN_predictions_nparray \
    = random_forest_SMOTEENN_model.predict(x_test_scaled_dataframe)

rf_SMOTEENN_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rf_SMOTEENN_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest SMOTEENN from actual vs. test predictions is {:.2f}%' \
         .format(rf_SMOTEENN_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest SMOTEENN from actual vs. test predictions is 93.66%[0m


In [272]:
rf_SMOTEENN_accuracy_score_float, \
rf_SMOTEENN_confusion_matrix_dataframe, \
rf_SMOTEENN_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         rf_SMOTEENN_predictions_nparray,
         'RANDOM FOREST MODEL (SMOTEENN)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_SMOTEENN_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_smoteen'] \
    = rf_SMOTEENN_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (SMOTEENN)
[0m
1) [1mOverall Accuracy Score: [0m92.82%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 676                  23
Actual Not Spam              50                 402

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.97      0.95       699
    not spam       0.95      0.89      0.92       452

    accuracy                           0.94      1151
   macro avg       0.94      0.93      0.93      1151
weighted avg       0.94      0.94      0.94      1151




## **9.4: Support Vector Machine (SVM)**

### **Original**

In [273]:
svm_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [274]:
svm_predictions_nparray \
    = svm_model.predict(x_test_scaled_dataframe)

svm_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for SVM from actual vs. test predictions is {:.2f}%' \
         .format(svm_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for SVM from actual vs. test predictions is 92.53%[0m


In [275]:
svm_accuracy_score_float, svm_confusion_matrix_dataframe, svm_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         svm_predictions_nparray,
         'SVM MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    = [svm_accuracy_score_float * 100]

model_performance_ranking_dictionary['svm'] \
    = svm_accuracy_score_float * 100

[1mSVM MODEL
[0m
1) [1mOverall Accuracy Score: [0m91.7%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 668                  31
Actual Not Spam              55                 397

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.96      0.94       699
    not spam       0.93      0.88      0.90       452

    accuracy                           0.93      1151
   macro avg       0.93      0.92      0.92      1151
weighted avg       0.93      0.93      0.92      1151




### **Random Undersampling**

In [276]:
svm_undersampled_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [277]:
svm_undersampled_predictions_nparray \
    = svm_undersampled_model.predict(x_test_scaled_dataframe)

svm_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The balanced accuracy score for svm undersampled from actual vs. test predictions is {:.2f}%' \
         .format(svm_undersampled_balanced_accuracy_score_float)
     + '\033[0m')

[1mThe balanced accuracy score for svm undersampled from actual vs. test predictions is 93.05%[0m


In [278]:
svm_undersampled_accuracy_score_float, \
svm_undersampled_confusion_matrix_dataframe, \
svm_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         svm_undersampled_predictions_nparray,
         'SVM MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_undersampled'] \
    = svm_undersampled_accuracy_score_float * 100

[1mSVM MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m92.79%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              38                 414

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.94      0.94       699
    not spam       0.91      0.92      0.91       452

    accuracy                           0.93      1151
   macro avg       0.93      0.93      0.93      1151
weighted avg       0.93      0.93      0.93      1151




### **Random Oversampling**

In [279]:
svm_oversampled_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [280]:
svm_oversampled_predictions_nparray \
    = svm_oversampled_model.predict(x_test_scaled_dataframe)

svm_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The balanced accuracy score for svm oversampled from actual vs. test predictions is {:.2f}%' \
         .format(svm_oversampled_balanced_accuracy_score_float)
     + '\033[0m')

[1mThe balanced accuracy score for svm oversampled from actual vs. test predictions is 92.44%[0m


In [281]:
svm_oversampled_accuracy_score_float, \
svm_oversampled_confusion_matrix_dataframe, \
svm_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         svm_oversampled_predictions_nparray,
         'SVM MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_oversampled'] \
    = svm_oversampled_accuracy_score_float * 100

[1mSVM MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m91.86%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 661                  38
Actual Not Spam              49                 403

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.95      0.94       699
    not spam       0.91      0.89      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **Cluster Centroids**

In [282]:
svm_cluster_centroids_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [283]:
svm_cluster_centroids_predictions_nparray \
    = svm_cluster_centroids_model.predict(x_test_scaled_dataframe)

svm_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for svm cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(svm_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for svm cluster centroids from actual vs. test predictions is 91.75%[0m


In [284]:
svm_cluster_centroids_accuracy_score_float, \
svm_cluster_centroids_confusion_matrix_dataframe, \
svm_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         svm_cluster_centroids_predictions_nparray,
         'SVM MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_cluster_centroids'] \
    = svm_cluster_centroids_accuracy_score_float * 100

[1mSVM MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m91.68%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 643                  56
Actual Not Spam              39                 413

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.92      0.93       699
    not spam       0.88      0.91      0.90       452

    accuracy                           0.92      1151
   macro avg       0.91      0.92      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **SMOTE**

In [285]:
svm_SMOTE_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [286]:
svm_SMOTE_predictions_nparray \
    = svm_SMOTE_model.predict(x_test_scaled_dataframe)

svm_SMOTE_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_SMOTE_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for svm SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(svm_SMOTE_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for svm SMOTE from actual vs. test predictions is 92.35%[0m


In [287]:
svm_SMOTE_accuracy_score_float, \
svm_SMOTE_confusion_matrix_dataframe, \
svm_SMOTE_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         svm_SMOTE_predictions_nparray,
         'SVM MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_SMOTE_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_smote'] \
    = svm_SMOTE_accuracy_score_float * 100

[1mSVM MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m91.75%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 661                  38
Actual Not Spam              50                 402

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.95      0.94       699
    not spam       0.91      0.89      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **SMOTEEN**

In [288]:
svm_SMOTEENN_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [289]:
svm_SMOTEENN_predictions_nparray \
    = svm_SMOTEENN_model.predict(x_test_scaled_dataframe)

svm_SMOTEENN_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_SMOTEENN_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for svm SMOTEENN from actual vs. test predictions is {:.2f}%' \
         .format(svm_SMOTEENN_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for svm SMOTEENN from actual vs. test predictions is 92.44%[0m


In [290]:
svm_SMOTEENN_accuracy_score_float, \
svm_SMOTEENN_confusion_matrix_dataframe, \
svm_SMOTEENN_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         svm_SMOTEENN_predictions_nparray,
         'SVM MODEL (SMOTEENN)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_SMOTEENN_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_smoteen'] \
    = svm_SMOTEENN_accuracy_score_float * 100

[1mSVM MODEL (SMOTEENN)
[0m
1) [1mOverall Accuracy Score: [0m92.1%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 655                  44
Actual Not Spam              43                 409

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.94      0.94       699
    not spam       0.90      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




## **9.5: K-Nearest Neighbor (KNN)**

### **Original**

In [291]:
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'distance'}

In [292]:
knn_predictions_nparray = knn_model.predict(x_test_scaled_dataframe)

knn_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for KNN from actual vs. test predictions is {:.2f}%' \
         .format(knn_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for KNN from actual vs. test predictions is 92.35%[0m


In [293]:
knn_accuracy_score_float, \
knn_confusion_matrix_dataframe, \
knn_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         knn_predictions_nparray,
         'KNN MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    = [knn_accuracy_score_float * 100]

model_performance_ranking_dictionary['knn'] \
    = knn_accuracy_score_float * 100

[1mKNN MODEL
[0m
1) [1mOverall Accuracy Score: [0m91.91%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              46                 406

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.94      0.94       699
    not spam       0.91      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **Random Undersampling**

In [294]:
knn_undersampled_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 12,
 'p': 1,
 'weights': 'distance'}

In [295]:
knn_undersampled_predictions_nparray \
    = knn_undersampled_model.predict(x_test_scaled_dataframe)

knn_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn undersampled from actual vs. test predictions is {:.2f}%' \
         .format(knn_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn undersampled from actual vs. test predictions is 91.57%[0m


In [296]:
knn_undersampled_accuracy_score_float, \
knn_undersampled_confusion_matrix_dataframe, \
knn_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         knn_undersampled_predictions_nparray,
         'KNN MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_undersampled'] \
    = knn_undersampled_accuracy_score_float * 100

[1mKNN MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m90.64%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 664                  35
Actual Not Spam              62                 390

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.95      0.93       699
    not spam       0.92      0.86      0.89       452

    accuracy                           0.92      1151
   macro avg       0.92      0.91      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **Random Oversampling**

In [297]:
knn_oversampled_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 19,
 'p': 1,
 'weights': 'distance'}

In [298]:
knn_oversampled_predictions_nparray \
    = knn_oversampled_model.predict(x_test_scaled_dataframe)

knn_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The balanced accuracy score for knn oversampled from actual vs. test predictions is {:.2f}%' \
         .format(knn_oversampled_balanced_accuracy_score_float)
     + '\033[0m')

[1mThe balanced accuracy score for knn oversampled from actual vs. test predictions is 91.92%[0m


In [299]:
knn_oversampled_accuracy_score_float, \
knn_oversampled_confusion_matrix_dataframe, \
knn_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         knn_oversampled_predictions_nparray,
         'KNN MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_oversampled'] \
    = knn_oversampled_accuracy_score_float * 100

[1mKNN MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m90.89%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 669                  30
Actual Not Spam              63                 389

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.96      0.94       699
    not spam       0.93      0.86      0.89       452

    accuracy                           0.92      1151
   macro avg       0.92      0.91      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **Cluster Centroids**

In [300]:
knn_cluster_centroids_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 6,
 'p': 2,
 'weights': 'distance'}

In [301]:
knn_cluster_centroids_predictions_nparray \
    = knn_cluster_centroids_model.predict(x_test_scaled_dataframe)

knn_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(knn_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn cluster centroids from actual vs. test predictions is 87.92%[0m


In [302]:
knn_cluster_centroids_accuracy_score_float, \
knn_cluster_centroids_confusion_matrix_dataframe, \
knn_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         knn_cluster_centroids_predictions_nparray,
         'KNN MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_cluster_centroids'] \
    = knn_cluster_centroids_accuracy_score_float * 100

[1mKNN MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m89.0%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 587                 112
Actual Not Spam              27                 425

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.84      0.89       699
    not spam       0.79      0.94      0.86       452

    accuracy                           0.88      1151
   macro avg       0.87      0.89      0.88      1151
weighted avg       0.89      0.88      0.88      1151




### **SMOTE**

In [303]:
knn_SMOTE_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 12,
 'p': 1,
 'weights': 'distance'}

In [304]:
knn_SMOTE_predictions_nparray \
    = knn_SMOTE_model.predict(x_test_scaled_dataframe)

knn_SMOTE_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_SMOTE_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(knn_SMOTE_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn SMOTE from actual vs. test predictions is 92.62%[0m


In [305]:
knn_SMOTE_accuracy_score_float, \
knn_SMOTE_confusion_matrix_dataframe, \
knn_SMOTE_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         knn_SMOTE_predictions_nparray,
         'KNN MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_SMOTE_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_smote'] \
    = knn_SMOTE_accuracy_score_float * 100

[1mKNN MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m92.0%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 663                  36
Actual Not Spam              49                 403

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.95      0.94       699
    not spam       0.92      0.89      0.90       452

    accuracy                           0.93      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.93      0.93      0.93      1151




### **SMOTEEN**

In [306]:
knn_SMOTEENN_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 6,
 'p': 2,
 'weights': 'distance'}

In [307]:
knn_SMOTEENN_predictions_nparray \
    = knn_SMOTEENN_model.predict(x_test_scaled_dataframe)

knn_SMOTEENN_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_SMOTEENN_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn SMOTEENN from actual vs. test predictions is {:.2f}%' \
         .format(knn_SMOTEENN_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn SMOTEENN from actual vs. test predictions is 89.23%[0m


In [308]:
knn_SMOTEENN_accuracy_score_float, \
knn_SMOTEENN_confusion_matrix_dataframe, \
knn_SMOTEENN_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         knn_SMOTEENN_predictions_nparray,
         'KNN MODEL (SMOTEENN)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_SMOTEENN_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_smoteen'] \
    = knn_SMOTEENN_accuracy_score_float * 100

[1mKNN MODEL (SMOTEENN)
[0m
1) [1mOverall Accuracy Score: [0m89.57%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 615                  84
Actual Not Spam              40                 412

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.88      0.91       699
    not spam       0.83      0.91      0.87       452

    accuracy                           0.89      1151
   macro avg       0.88      0.90      0.89      1151
weighted avg       0.90      0.89      0.89      1151




## **9.6: Gaussian Naive Bayes (GNB)**

### **Original**

In [309]:
gnb_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [310]:
gnb_predictions_nparray = gnb_model.predict(x_test_scaled_dataframe)

gnb_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for GNB from actual vs. test predictions is {:.2f}%' \
         .format(gnb_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for GNB from actual vs. test predictions is 81.15%[0m


In [311]:
gnb_accuracy_score_float, \
gnb_confusion_matrix_dataframe, \
gnb_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         gnb_predictions_nparray,
         'GNB MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    = [gnb_accuracy_score_float * 100]

model_performance_ranking_dictionary['gnb'] \
    = gnb_accuracy_score_float * 100

[1mGNB MODEL
[0m
1) [1mOverall Accuracy Score: [0m83.66%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 503                 196
Actual Not Spam              21                 431

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.72      0.82       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **Random Undersampling**

In [312]:
gnb_undersampled_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [313]:
gnb_undersampled_predictions_nparray \
    = gnb_undersampled_model.predict(x_test_scaled_dataframe)

gnb_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb undersampled from actual vs. test predictions is {:.2f}%' \
         .format(gnb_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb undersampled from actual vs. test predictions is 81.15%[0m


In [314]:
gnb_undersampled_accuracy_score_float, \
gnb_undersampled_confusion_matrix_dataframe, \
gnb_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         gnb_undersampled_predictions_nparray,
         'GNB MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_undersampled'] \
    = gnb_undersampled_accuracy_score_float * 100

[1mGNB MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m83.62%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 504                 195
Actual Not Spam              22                 430

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.72      0.82       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **Random Oversampling**

In [315]:
gnb_oversampled_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [316]:
gnb_oversampled_predictions_nparray \
    = gnb_oversampled_model.predict(x_test_scaled_dataframe)

gnb_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb oversampled from actual vs. test predictions is {:.2f}%' \
         .format(gnb_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb oversampled from actual vs. test predictions is 80.97%[0m


In [317]:
gnb_oversampled_accuracy_score_float, \
gnb_oversampled_confusion_matrix_dataframe, \
gnb_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         gnb_oversampled_predictions_nparray,
         'GNB MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_oversampled'] \
    = gnb_oversampled_accuracy_score_float * 100

[1mGNB MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m83.4%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 504                 195
Actual Not Spam              24                 428

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.72      0.82       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.83      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **Cluster Centroids**

In [318]:
gnb_cluster_centroids_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [319]:
gnb_cluster_centroids_predictions_nparray \
    = gnb_cluster_centroids_model.predict(x_test_scaled_dataframe)

gnb_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(gnb_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn cluster centroids from actual vs. test predictions is 81.23%[0m


In [320]:
gnb_cluster_centroids_accuracy_score_float, \
gnb_cluster_centroids_confusion_matrix_dataframe, \
gnb_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         gnb_cluster_centroids_predictions_nparray,
         'GNB MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_cluster_centroids'] \
    = gnb_cluster_centroids_accuracy_score_float * 100

[1mGNB MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m83.77%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 503                 196
Actual Not Spam              20                 432

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.72      0.82       699
    not spam       0.69      0.96      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **SMOTE**

In [321]:
gnb_SMOTE_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [322]:
gnb_SMOTE_predictions_nparray \
    = gnb_SMOTE_model.predict(x_test_scaled_dataframe)

gnb_SMOTE_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_SMOTE_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(gnb_SMOTE_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb SMOTE from actual vs. test predictions is 81.49%[0m


In [323]:
gnb_SMOTE_accuracy_score_float, \
gnb_SMOTE_confusion_matrix_dataframe, \
gnb_SMOTE_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         gnb_SMOTE_predictions_nparray,
         'GNB MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_SMOTE_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_smote'] \
    = gnb_SMOTE_accuracy_score_float * 100

[1mGNB MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m83.86%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 509                 190
Actual Not Spam              23                 429

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.73      0.83       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.82      1151




### **SMOTEEN**

In [324]:
gnb_SMOTEENN_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [325]:
gnb_SMOTEENN_predictions_nparray \
    = gnb_SMOTEENN_model.predict(x_test_scaled_dataframe)

gnb_SMOTEENN_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_SMOTEENN_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb SMOTEENN from actual vs. test predictions is {:.2f}%' \
         .format(gnb_SMOTEENN_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb SMOTEENN from actual vs. test predictions is 84.19%[0m


In [326]:
gnb_SMOTEENN_accuracy_score_float, \
gnb_SMOTEENN_confusion_matrix_dataframe, \
gnb_SMOTEENN_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series,
         gnb_SMOTEENN_predictions_nparray,
         'GNB MODEL (SMOTEENN)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_SMOTEENN_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_smoteen'] \
    = gnb_SMOTEENN_accuracy_score_float * 100

[1mGNB MODEL (SMOTEENN)
[0m
1) [1mOverall Accuracy Score: [0m85.81%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 547                 152
Actual Not Spam              30                 422

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.78      0.86       699
    not spam       0.74      0.93      0.82       452

    accuracy                           0.84      1151
   macro avg       0.84      0.86      0.84      1151
weighted avg       0.86      0.84      0.84      1151




## **9.7: Model Performance Results**

### **Performance Matrix**

In [327]:
index_string_list \
    = ['original', 'undersampled', 'oversampled', 'cluster centroids', 'smote', 'smoteen']

model_performace_dataframe \
    = pd.DataFrame.from_dict \
        (model_performance_dictionary, orient = 'index').transpose()

model_performace_dataframe['resampling_method'] = index_string_list

model_performace_dataframe.set_index('resampling_method', drop = True, inplace = True)

logx.log_write_object(model_performace_dataframe)

In [328]:
pandasx.return_formatted_table \
    (model_performace_dataframe,
     'Table 9.7.1: Model Performance Matrix',
     line_count_integer = 36,
     hide_index_boolean = False) \
        .format('{:,.1f}%')

Unnamed: 0_level_0,logistic_regression,decision_tree,random_forest,svm,knn,gnb
resampling_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
original,89.7%,87.8%,91.4%,91.7%,91.9%,83.7%
undersampled,92.0%,88.0%,93.0%,92.8%,90.6%,83.6%
oversampled,91.2%,88.0%,91.6%,91.9%,90.9%,83.4%
cluster centroids,91.0%,88.8%,91.6%,91.7%,89.0%,83.8%
smote,91.7%,89.7%,92.5%,91.8%,92.0%,83.9%
smoteen,91.8%,64.0%,92.8%,92.1%,89.6%,85.8%


### **Performance Ranking**

In [329]:
temp_dictionary \
    = dict(sorted(model_performance_ranking_dictionary.items(), key = lambda x: x[1], reverse = True))

model_performace_rankings_dataframe \
    = pd.DataFrame.from_dict \
        (temp_dictionary, orient = 'index', columns = ['accuracy'])

model_performace_rankings_dataframe.index.name = 'model'

logx.log_write_object(model_performace_rankings_dataframe)

In [330]:
pandasx.return_formatted_table \
    (model_performace_rankings_dataframe,
     'Table 9.7.2: Model Performance Rankings',
     line_count_integer = 36,
     hide_index_boolean = False) \
        .format({'accuracy': '{:,.1f}%'})

Unnamed: 0_level_0,accuracy
model,Unnamed: 1_level_1
random_forest_undersampled,93.0%
random_forest_smoteen,92.8%
svm_undersampled,92.8%
random_forest_smote,92.5%
svm_smoteen,92.1%
logistic_regression_undersampled,92.0%
knn_smote,92.0%
knn,91.9%
svm_oversampled,91.9%
logistic_regression_smoteen,91.8%


# <br> **Section 10: Save Models To Files**

## **10.1: Logistic Regression**

### **Original**

In [331]:
pickle.dump \
    (logistic_regression_model,
     open(spam_detector_constants.CONSTANT_LR_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [332]:
pickle.dump \
    (logistic_regression_undersampled_model,
     open(spam_detector_constants.CONSTANT_LR_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [333]:
pickle.dump \
    (logistic_regression_oversampled_model,
     open(spam_detector_constants.CONSTANT_LR_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [334]:
pickle.dump \
    (logistic_regression_cluster_centroids_model,
     open(spam_detector_constants.CONSTANT_LR_CLUSTER_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [335]:
pickle.dump \
    (logistic_regression_SMOTE_model,
     open(spam_detector_constants.CONSTANT_LR_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [336]:
pickle.dump \
    (logistic_regression_SMOTEENN_model,
     open(spam_detector_constants.CONSTANT_LR_SMOTEENN_MODEL_FILE_PATH, 'wb'))

## **10.2: Decision Tree**

### **Original**

In [337]:
pickle.dump \
    (decision_tree_model,
     open(spam_detector_constants.CONSTANT_DT_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [338]:
pickle.dump \
    (decision_tree_undersampled_model,
     open(spam_detector_constants.CONSTANT_DT_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [339]:
pickle.dump \
    (decision_tree_oversampled_model,
     open(spam_detector_constants.CONSTANT_DT_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [340]:
pickle.dump \
    (decision_tree_cluster_centroids_model,
     open(spam_detector_constants.CONSTANT_DT_CLUSTER_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [341]:
pickle.dump \
    (decision_tree_SMOTE_model,
     open(spam_detector_constants.CONSTANT_DT_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [342]:
pickle.dump \
    (decision_tree_SMOTEENN_model,
     open(spam_detector_constants.CONSTANT_DT_SMOTEENN_MODEL_FILE_PATH, 'wb'))

## **10.3: Random Forest**

### **Original**

In [343]:
pickle.dump \
    (random_forest_model,
     open(spam_detector_constants.CONSTANT_RF_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [344]:
pickle.dump \
    (random_forest_undersampled_model,
     open(spam_detector_constants.CONSTANT_RF_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [345]:
pickle.dump \
    (random_forest_oversampled_model,
     open(spam_detector_constants.CONSTANT_RF_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [346]:
pickle.dump \
    (random_forest_cluster_centroids_model,
     open(spam_detector_constants.CONSTANT_RF_CLUSTER_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [347]:
pickle.dump \
    (random_forest_SMOTE_model,
     open(spam_detector_constants.CONSTANT_RF_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [348]:
pickle.dump \
    (random_forest_SMOTEENN_model,
     open(spam_detector_constants.CONSTANT_RF_SMOTEENN_MODEL_FILE_PATH, 'wb'))

## **10.4: Support Vector Machine (SVM)**

### **Original**

In [349]:
pickle.dump \
    (svm_model,
     open(spam_detector_constants.CONSTANT_SVM_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [350]:
pickle.dump \
    (svm_undersampled_model,
     open(spam_detector_constants.CONSTANT_SVM_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [351]:
pickle.dump \
    (svm_oversampled_model,
     open(spam_detector_constants.CONSTANT_SVM_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [352]:
pickle.dump \
    (svm_cluster_centroids_model,
     open(spam_detector_constants.CONSTANT_SVM_CLUSTER_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [353]:
pickle.dump \
    (svm_SMOTE_model,
     open(spam_detector_constants.CONSTANT_SVM_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [354]:
pickle.dump \
    (svm_SMOTEENN_model,
     open(spam_detector_constants.CONSTANT_SVM_SMOTEENN_MODEL_FILE_PATH, 'wb'))

## **10.5: K-Nearest Neighbor (KNN)**

### **Original**

In [355]:
pickle.dump \
    (knn_model,
     open(spam_detector_constants.CONSTANT_KNN_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [356]:
pickle.dump \
    (knn_undersampled_model,
     open(spam_detector_constants.CONSTANT_KNN_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [357]:
pickle.dump \
    (knn_oversampled_model,
     open(spam_detector_constants.CONSTANT_KNN_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [358]:
pickle.dump \
    (knn_cluster_centroids_model,
     open(spam_detector_constants.CONSTANT_KNN_CLUSTER_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [359]:
pickle.dump \
    (knn_SMOTE_model,
     open(spam_detector_constants.CONSTANT_KNN_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [360]:
pickle.dump \
    (knn_SMOTEENN_model,
     open(spam_detector_constants.CONSTANT_KNN_SMOTEENN_MODEL_FILE_PATH, 'wb'))

## **10.6: Gaussian Naive Bayes (GNB)**

### **Original**

In [361]:
pickle.dump \
    (gnb_model,
     open(spam_detector_constants.CONSTANT_GNB_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [362]:
pickle.dump \
    (gnb_undersampled_model,
     open(spam_detector_constants.CONSTANT_GNB_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [363]:
pickle.dump \
    (gnb_oversampled_model,
     open(spam_detector_constants.CONSTANT_GNB_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [364]:
pickle.dump \
    (gnb_cluster_centroids_model,
     open(spam_detector_constants.CONSTANT_GNB_CLUSTER_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [365]:
pickle.dump \
    (gnb_SMOTE_model,
     open(spam_detector_constants.CONSTANT_GNB_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [366]:
pickle.dump \
    (gnb_SMOTEENN_model,
     open(spam_detector_constants.CONSTANT_GNB_SMOTEENN_MODEL_FILE_PATH, 'wb'))

In [367]:
# logx.end_program()