In [1]:
#*******************************************************************************************
 #
 #  File Name:  spam_detector.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, spam_detector.ipynb, reads a csv file, 
 #      spam-data.csv and uses Python and scikit-learn module to find the best 
 #      supervised learning model (binary classification) for detecting spam in 
 #      e-mails. Here is a list of the models:
 #
 #      logistic regression
 #      decision tree
 #      random forest
 #      support vector machine
 #      k-nearest neighbor
 #      gaussian naive bayes
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/22/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

import classificationsx
import logx
import pandas_processx
import spam_detector_constants

import pickle

import numpy as np
import pandas as pd

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
CONSTANT_LOCAL_FILE_NAME = 'spam_detector.ipynb'


logx.set_log_mode(False)

logx.set_image_mode(False)


logx.begin_program('spam_detector')

# <br> **Section 1: Extraction and Transformation**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [3]:
data_type_dictionary \
    = {'word_freq_make': float,
       'word_freq_address': float,
       'word_freq_all': float,
       'word_freq_3d': float,
       'word_freq_our': float,
       'word_freq_over': float,
       'word_freq_remove': float,
       'word_freq_internet': float,
       'word_freq_order': float,
       'word_freq_mail': float,
       'word_freq_receive': float,
       'word_freq_will': float,
       'word_freq_people': float,
       'word_freq_report': float,
       'word_freq_addresses': float,
       'word_freq_free': float,
       'word_freq_business': float,
       'word_freq_email': float,
       'word_freq_you': float,
       'word_freq_credit': float,
       'word_freq_your': float,
       'word_freq_font': float,
       'word_freq_000': float,
       'word_freq_money': float,
       'word_freq_hp': float,
       'word_freq_hpl': float,
       'word_freq_george': float,
       'word_freq_650': float,
       'word_freq_lab': float,
       'word_freq_labs': float,
       'word_freq_telnet': float,
       'word_freq_857': float,
       'word_freq_data': float,
       'word_freq_415': float,
       'word_freq_85': float,
       'word_freq_technology': float,
       'word_freq_1999': float,
       'word_freq_parts': float,
       'word_freq_pm': float,
       'word_freq_direct': float,
       'word_freq_cs': float,
       'word_freq_meeting': float,
       'word_freq_original': float,
       'word_freq_project': float,
       'word_freq_re': float,
       'word_freq_edu': float,
       'word_freq_table': float,
       'word_freq_conference': float,
       'char_freq_;': float,
       'char_freq_(': float,
       'char_freq_[': float,
       'char_freq_!': float,
       'char_freq_$': float,
       'char_freq_#': float,
       'capital_run_length_average': float,
       'capital_run_length_longest': int,
       'capital_run_length_total': int,
       'spam': int}

spam_dataframe \
    = pd.read_csv(spam_detector_constants.CONSTANT_INPUT_FILE_PATH, dtype = data_type_dictionary)

logx.log_write_object(spam_dataframe)

## **1.2: Display Spam DataFrame**

In [4]:
pandas_processx.return_formatted_table(spam_dataframe, 'Table 1.2: Spam Data Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278,1
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028,1
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54,1
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112,1
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49,1
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257,1
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749,1


## **1.3: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, The Labels**

In [5]:
y_series = spam_dataframe['spam']

logx.log_write_object(y_series)

### **Review the Y Series**

In [6]:
pandas_processx.return_formatted_table(y_series.to_frame(), 'Table 1.3.1: Spam Target Series')

spam
1
1
1
1
1
1
1
1
1
1


### **Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [7]:
y_series.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [8]:
x_dataframe = spam_dataframe.drop(columns = 'spam', axis = 1)

logx.log_write_object(x_dataframe)

### **Review the X DataFrame**

In [9]:
pandas_processx.return_formatted_table(x_dataframe, 'Table 1.3.2: Spam Features DataFrame')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749


## **1.4: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [10]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split \
        (x_dataframe, y_series, 
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1)

In [11]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **1.5: Use the StandardScaler to Scale the X Variables**

### **Scale Training and Test Data as Numpy Arrays**

In [12]:
x_train_scaled_nparray = StandardScaler().fit_transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [13]:
x_test_scaled_nparray = StandardScaler().fit_transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

### **Create Scaled X Variable DataFrames**

In [14]:
x_train_scaled_dataframe \
    = pd.DataFrame \
        (x_train_scaled_nparray, 
         columns = x_train_dataframe.columns, 
         index = x_train_dataframe.index)
    
logx.log_write_object(x_train_scaled_dataframe)

In [15]:
x_test_scaled_dataframe \
    = pd.DataFrame \
        (x_test_scaled_nparray, 
         columns = x_test_dataframe.columns, 
         index = x_test_dataframe.index)
    
logx.log_write_object(x_test_scaled_dataframe)

### **Display Scaled Training and Testing Data**

In [16]:
pandas_processx.return_formatted_table \
    (x_train_scaled_dataframe, 
     'Table 1.5.1: Spam Scaled Features Training Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [17]:
pandas_processx.return_formatted_table \
    (x_test_scaled_dataframe, 
     'Table 1.5.2: Spam Scaled Features Test Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.02,-0.17,-0.0,-0.05,0.06,-0.02,-0.3,0.18,-0.01,-0.38,-0.29,0.23,-0.35,0.89,-0.2,-0.27,-0.36,-0.36,0.24,-0.18,2.26,-0.12,0.78,0.02,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.05,-0.19,-0.08,-0.13,-0.1,-0.44,0.05,-0.04,-0.19,0.0,-0.02,0.2,0.48
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.51,-0.18,-0.69,-0.12,-0.3,-0.26,-0.13,0.1,-0.22,1.23,0.37,0.62,-0.18,-0.15,-0.18,-0.15,0.68,-0.25,-0.33,-0.07,1.29,-0.21,-0.1,-0.16,-0.2,0.55,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.19,-0.3,-0.3
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,2.42,-0.29,0.85,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,0.99,1.11,0.14,3.68,0.77,1.27,1.72,2.2,-0.18,2.21,1.38,2.58,-0.33,-0.07,1.12,1.97,-0.1,-0.16,2.68,-0.14,0.26,-0.19,-0.08,-0.13,0.63,2.93,-0.13,-0.43,-0.32,-0.16,-0.11,-0.26,-0.2
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,0.31,-0.29,1.56,-0.35,-0.18,-0.2,-0.27,-0.36,0.46,-0.12,-0.18,-0.28,-0.12,-0.3,-0.26,0.66,1.26,0.05,1.7,0.54,0.88,2.64,5.09,-0.18,5.1,2.16,0.8,-0.33,-0.07,-0.18,3.03,-0.1,1.73,-0.2,-0.14,0.12,-0.19,-0.08,4.55,-0.17,1.06,-0.13,-0.43,-0.32,-0.16,0.0,-0.1,-0.21
-0.31,-0.17,0.04,-0.05,0.83,0.15,-0.3,-0.28,2.78,-0.38,-0.29,0.24,0.15,-0.18,-0.2,-0.27,0.43,-0.36,0.35,-0.18,0.1,-0.12,0.12,0.66,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,1.28,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.2,-0.13,0.2,-0.11,-0.03,-0.13,0.03,0.27
-0.31,-0.17,0.75,-0.05,-0.46,1.88,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,0.37,-0.36,-0.36,0.58,-0.18,-0.69,-0.12,1.61,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,1.3,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,0.23,0.24,1.58,0.64,-0.32,-0.16,0.42,0.74,0.02
0.76,-0.17,2.82,-0.05,0.99,-0.34,0.76,-0.28,2.01,1.61,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.73,2.13,-0.1,-0.12,-0.3,-0.26,-0.14,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,0.96,0.12,-0.16,23.74,9.53,1.94
-0.31,-0.17,-0.56,-0.05,3.85,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,2.64,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,1.33,1.02,-0.13,0.72,-0.32,-0.16,-0.01,-0.3,-0.46
0.76,-0.17,-0.56,-0.05,-0.46,0.81,-0.3,-0.28,-0.33,-0.38,-0.29,0.58,2.07,-0.18,-0.2,-0.27,1.45,-0.36,0.84,-0.18,1.14,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.17,-0.22,-0.38
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,0.68,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,2.74,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.22,-0.39,-0.49


# <br> **Section 2: Undersampled and OverSampled Spam Data**

## **2.1: Instantiate the Random Undersampler Instance**

In [18]:
x_train_scaled_undersampled_dataframe, y_train_undersampled_series \
    = RandomUnderSampler(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [19]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.2: Instantiate the Random Oversampler Instance**

In [20]:
x_train_scaled_oversampled_dataframe, y_train_oversampled_series \
    = RandomOverSampler(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [21]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.3: Instantiate the Cluster Centroids Instance**

In [22]:
x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series \
    = ClusterCentroids \
        (estimator \
             = KMeans(n_init = 'auto', random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_2), 
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [23]:
logx.log_write_object(x_train_scaled_cluster_centroids_dataframe)

logx.log_write_object(y_train_cluster_centroids_series)

## **2.4: Instantiate the SMOTE Instance**

In [24]:
x_train_scaled_smote_dataframe, y_train_smote_series \
    = SMOTE(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1, sampling_strategy = 'auto') \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [25]:
logx.log_write_object(x_train_scaled_smote_dataframe)

logx.log_write_object(y_train_smote_series)

## **2.5: Instantiate the SMOTEEN Instance**

In [26]:
x_train_scaled_smoteen_dataframe, y_train_smoteen_series \
    = SMOTEENN(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [27]:
logx.log_write_object(x_train_scaled_smoteen_dataframe)

logx.log_write_object(y_train_smoteen_series)

## **2.6: Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [28]:
y_train_undersampled_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [29]:
y_train_oversampled_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [30]:
y_train_cluster_centroids_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [31]:
y_train_smote_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [32]:
y_train_smoteen_series.value_counts()

spam
1    1785
0    1651
Name: count, dtype: int64

## **2.7: Display Normalized Resampled Training and Testing Data**

In [33]:
pandas_processx.return_formatted_table \
    (x_train_scaled_undersampled_dataframe, 
     'Table 2.7.1: Scaled Features Training Undersampled Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,1.27,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,7.49,4.85,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,1.88,-0.29,-0.26,-0.32,0.57,-0.3,0.06,1.64,-0.17,-0.19,-0.32,-0.31,-0.35,0.76,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,1.06,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.32,3.18,-0.07,-0.11,0.23,-0.48,-0.19,-0.2,0.45,-0.1,-0.1,-0.18,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,3.86,-0.37,-0.3,0.71,3.46,-0.17,-0.19,-0.32,-0.31,-0.35,-0.29,-0.16,0.27,-0.12,-0.29,-0.21,-0.34,-0.3,0.12,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,6.21,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,3.12,-0.16,0.8,-0.12,-0.29,-0.21,-0.34,-0.3,0.31,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,0.62,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,1.34,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.09,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,1.43,1.8,0.33,1.44,1.47,1.75,2.03,2.53,-0.17,2.52,1.43,2.14,1.82,-0.06,2.03,2.34,-0.13,1.02,3.9,-0.12,0.66,-0.2,-0.07,-0.11,-0.16,1.11,1.67,-0.31,-0.3,-0.1,-0.1,-0.19,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,8.64,-0.18,-0.13,4.57,-0.21,-0.12,3.51,-0.2,-0.07,-0.11,-0.16,-0.48,8.23,-0.31,-0.3,-0.1,-0.1,-0.23,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,4.65,-0.3,-0.17,-0.19,-0.32,1.64,-0.35,-0.94,-0.16,0.07,-0.12,-0.29,-0.21,0.82,-0.3,-0.23,-0.23,-0.18,1.71,-0.16,-0.14,-0.17,-0.15,-0.19,2.09,-0.32,-0.06,1.99,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.05,-0.19,-0.02,-0.3,-0.1,-0.08,-0.19,-0.35
-0.35,0.47,2.51,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.27,-0.3,-0.17,-0.19,-0.32,-0.31,2.65,2.11,-0.16,2.5,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.01,-0.19,0.01,-0.3,-0.1,-0.11,-0.21,-0.39
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,0.09,-0.05,0.52,-0.36,-0.29,0.56,-0.32,1.7,-0.3,-0.63,-0.3,0.81,-0.19,-0.32,0.39,-0.35,-0.2,-0.16,-0.4,-0.12,-0.29,-0.21,0.91,0.44,-0.03,-0.23,-0.18,-0.22,-0.16,-0.14,0.41,-0.15,-0.19,-0.24,1.96,-0.06,-0.18,-0.18,-0.13,-0.18,1.25,0.38,0.04,-0.2,-0.07,-0.11,0.53,0.88,-0.19,-0.05,1.03,-0.1,-0.04,-0.06,0.06


In [34]:
pandas_processx.return_formatted_table \
    (x_train_scaled_oversampled_dataframe, 
     'Table 2.7.2: Scaled Features Training Oversampled Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [35]:
pandas_processx.return_formatted_table \
    (x_train_scaled_cluster_centroids_dataframe, 
     'Table 2.7.3: Scaled Features Training Cluster Centroids Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.55,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.62,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.2,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.11,-0.3,-0.2,-0.07,-0.11,-0.14,-0.48,-0.19,-0.29,-0.3,-0.1,-0.11,-0.24,-0.44
-0.35,0.18,-0.42,-0.05,-0.25,-0.36,-0.29,-0.26,-0.32,0.3,-0.3,-0.3,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.55,-0.16,-0.61,7.64,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,0.1,-0.12,-0.3,-0.04,-0.07,-0.11,17.3,-0.48,-0.19,-0.23,-0.3,1.19,0.05,-0.05,0.24
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,4.9,4.39,1.02,7.22,7.21,8.61,9.61,11.83,-0.17,11.78,7.07,10.43,-0.32,-0.06,-0.18,11.09,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,3.47,-0.19,-0.31,-0.3,-0.1,-0.06,-0.2,-0.41
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.22,-0.3,-0.17,-0.19,-0.32,-0.31,1.08,-0.11,-0.16,-0.67,-0.12,-0.29,-0.21,2.48,2.21,-0.23,1.1,-0.18,1.35,1.58,-0.14,-0.17,-0.15,1.1,1.66,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.13,2.3,-0.19,-0.3,0.32,-0.1,-0.21,-0.32
-0.35,0.73,-0.56,-0.05,0.36,-0.36,-0.29,-0.26,-0.32,1.36,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,0.71,-0.94,-0.16,0.22,-0.12,-0.29,-0.21,0.01,0.32,0.43,-0.23,-0.18,-0.22,-0.16,1.44,-0.17,1.43,-0.19,-0.24,3.49,-0.06,1.13,-0.18,-0.13,-0.18,4.65,-0.12,0.27,-0.2,-0.07,-0.11,0.2,0.75,0.88,-0.11,-0.3,-0.1,-0.1,-0.15,-0.25
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,6.76,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
2.14,-0.16,0.94,-0.05,-0.46,1.06,-0.29,-0.26,-0.32,-0.37,-0.3,0.69,0.93,-0.17,-0.19,-0.32,-0.31,-0.35,0.34,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,0.72,-0.18,7.3,-0.18,-0.21,-0.12,0.09,0.22,-0.07,-0.11,-0.16,0.14,-0.19,0.31,-0.3,-0.1,-0.09,-0.18,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,3.21,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,10.64,-0.16,-0.48,-0.19,0.42,-0.3,-0.1,-0.1,-0.21,-0.43


In [36]:
pandas_processx.return_formatted_table \
    (x_train_scaled_smote_dataframe, 
     'Table 2.7.4: Scaled Features Training SMOTE Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [37]:
pandas_processx.return_formatted_table \
    (x_train_scaled_smoteen_dataframe, 
     'Table 2.7.5: Scaled Features Training SMOTEEN Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.79,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.38,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,1.34,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.11,-0.23,-0.42


# <br> **Section 3: Logistic Regression Models**

## **3.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [38]:
lr_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_LR_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_model \
    = LogisticRegression \
        (class_weight = lr_grid_search_model.best_params_['class_weight'],
         solver = lr_grid_search_model.best_params_['solver'],
         multi_class = lr_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [39]:
lr_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_undersampled_model \
    = LogisticRegression \
        (class_weight = lr_undersampled_grid_search_model.best_params_['class_weight'],
         solver = lr_undersampled_grid_search_model.best_params_['solver'],
         multi_class = lr_undersampled_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [40]:
lr_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_oversampled_model \
    = LogisticRegression \
        (class_weight = lr_oversampled_grid_search_model.best_params_['class_weight'],
         solver = lr_oversampled_grid_search_model.best_params_['solver'],
         multi_class = lr_oversampled_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [41]:
lr_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_cluster_centroids_model \
    = LogisticRegression \
        (class_weight = lr_cluster_centroids_grid_search_model.best_params_['class_weight'],
         solver = lr_cluster_centroids_grid_search_model.best_params_['solver'],
         multi_class = lr_cluster_centroids_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **Smote**

In [42]:
lr_smote_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_smote_model \
    = LogisticRegression \
        (class_weight = lr_smote_grid_search_model.best_params_['class_weight'],
         solver = lr_smote_grid_search_model.best_params_['solver'],
         multi_class = lr_smote_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smote_dataframe, y_train_smote_series)

### **Smoteen**

In [43]:
lr_smoteen_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_LR_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

logistic_regression_smoteen_model \
    = LogisticRegression \
        (class_weight = lr_smoteen_grid_search_model.best_params_['class_weight'],
         solver = lr_smoteen_grid_search_model.best_params_['solver'],
         multi_class = lr_smoteen_grid_search_model.best_params_['multi_class'],
         max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

## **3.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [44]:
accuracy_score_train_float \
    = logistic_regression_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe logistic regression model score from scaled training data is 92.81%[0m


In [45]:
accuracy_score_test_float \
    = logistic_regression_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe logistic regression model score from scaled test data is 90.88%[0m


### **Random Undersampling**

In [46]:
accuracy_score_train_float \
    = logistic_regression_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe logistic regression model score from undersampled scaled training data is 93.51%[0m


In [47]:
accuracy_score_test_float \
    = logistic_regression_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe logistic regression model score from undersampled scaled test data is 92.44%[0m


### **Random Oversampling**

In [48]:
accuracy_score_train_float \
    = logistic_regression_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe logistic regression model score from overersampled scaled training data is 93.19%[0m


In [49]:
accuracy_score_test_float \
    = logistic_regression_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe logistic regression model score from oversampled scaled test data is 91.57%[0m


### **Cluster Centroids**

In [50]:
accuracy_score_train_float \
    = logistic_regression_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe logistic regression model score from cluster centroids scaled training data is 91.19%[0m


In [51]:
accuracy_score_test_float \
    = logistic_regression_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe logistic regression model score from cluster centroids scaled test data is 91.40%[0m


### **SMOTE**

In [52]:
accuracy_score_train_float \
    = logistic_regression_smote_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe logistic regression model score from SMOTE scaled training data is 93.10%[0m


In [53]:
accuracy_score_test_float \
    = logistic_regression_smote_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe logistic regression model score from SMOTE scaled test data is 92.09%[0m


### **SMOTEEN**

In [54]:
accuracy_score_train_float \
    = logistic_regression_smoteen_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from SMOTEEN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe logistic regression model score from SMOTEEN scaled training data is 93.30%[0m


In [55]:
accuracy_score_test_float \
    = logistic_regression_smoteen_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model score from SMOTEEN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe logistic regression model score from SMOTEEN scaled test data is 92.18%[0m


## **3.3: Calculate Training and Test Predictions.**

### **Original**

In [56]:
lr_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_predictions_dataframe)

In [57]:
lr_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_predictions_dataframe)

### **Random Undersampling**

In [58]:
lr_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_undersampled_predictions_dataframe)

In [59]:
lr_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [60]:
lr_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_oversampled_predictions_dataframe)

In [61]:
lr_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [62]:
lr_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_cluster_centroids_predictions_dataframe)

In [63]:
lr_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [64]:
lr_train_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_smote_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_smote_predictions_dataframe)

In [65]:
lr_test_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_smote_predictions_dataframe)

### **SMOTEEN**

In [66]:
lr_train_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_smoteen_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(lr_train_smoteen_predictions_dataframe)

In [67]:
lr_test_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (logistic_regression_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(lr_test_smoteen_predictions_dataframe)

# <br> **Section 4: Decision Tree Models**

## **4.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [68]:
dt_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_DT_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_model \
    = DecisionTreeClassifier \
        (criterion = dt_grid_search_model.best_params_['criterion'],
         splitter = dt_grid_search_model.best_params_['splitter'],
         class_weight = dt_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [69]:
dt_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_undersampled_model \
    = DecisionTreeClassifier \
        (criterion = dt_undersampled_grid_search_model.best_params_['criterion'],
         splitter = dt_undersampled_grid_search_model.best_params_['splitter'],
         class_weight = dt_undersampled_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [70]:
dt_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_oversampled_model \
    = DecisionTreeClassifier \
        (criterion = dt_oversampled_grid_search_model.best_params_['criterion'],
         splitter = dt_oversampled_grid_search_model.best_params_['splitter'],
         class_weight = dt_oversampled_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [71]:
dt_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_cluster_centroids_model \
    = DecisionTreeClassifier \
        (criterion = dt_cluster_centroids_grid_search_model.best_params_['criterion'],
         splitter = dt_cluster_centroids_grid_search_model.best_params_['splitter'],
         class_weight = dt_cluster_centroids_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **Smote**

In [72]:
dt_smote_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_smote_model \
    = DecisionTreeClassifier \
        (criterion = dt_smote_grid_search_model.best_params_['criterion'],
         splitter = dt_smote_grid_search_model.best_params_['splitter'],
         class_weight = dt_smote_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smote_dataframe, y_train_smote_series)

### **Smoteen**

In [73]:
dt_smoteen_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_DT_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

decision_tree_smoteen_model \
    = DecisionTreeClassifier \
        (criterion = dt_smoteen_grid_search_model.best_params_['criterion'],
         splitter = dt_smoteen_grid_search_model.best_params_['splitter'],
         class_weight = dt_smoteen_grid_search_model.best_params_['class_weight'],
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

## **4.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [74]:
accuracy_score_train_float \
    = decision_tree_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe decision tree model score from scaled training data is 99.91%[0m


In [75]:
accuracy_score_test_float \
    = decision_tree_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe decision tree model score from scaled test data is 89.66%[0m


### **Random Undersampling**

In [76]:
accuracy_score_train_float \
    = decision_tree_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe decision tree model score from undersampled scaled training data is 97.54%[0m


In [77]:
accuracy_score_test_float \
    = decision_tree_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe decision tree model score from undersampled scaled test data is 89.66%[0m


### **Random Oversampling**

In [78]:
accuracy_score_train_float \
    = decision_tree_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe decision tree model score from overersampled scaled training data is 99.91%[0m


In [79]:
accuracy_score_test_float \
    = decision_tree_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe decision tree model score from oversampled scaled test data is 89.75%[0m


### **Cluster Centroids**

In [80]:
accuracy_score_train_float \
    = decision_tree_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe decision tree model score from cluster centroids scaled training data is 98.23%[0m


In [81]:
accuracy_score_test_float \
    = decision_tree_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe decision tree model score from cluster centroids scaled test data is 88.97%[0m


### **SMOTE**

In [82]:
accuracy_score_train_float \
    = decision_tree_smote_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe decision tree model score from SMOTE scaled training data is 99.91%[0m


In [83]:
accuracy_score_test_float \
    = decision_tree_smote_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe decision tree model score from SMOTE scaled test data is 90.96%[0m


### **SMOTEEN**

In [84]:
accuracy_score_train_float \
    = decision_tree_smoteen_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from SMOTEEN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe decision tree model score from SMOTEEN scaled training data is 94.96%[0m


In [85]:
accuracy_score_test_float \
    = decision_tree_smoteen_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model score from SMOTEEN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe decision tree model score from SMOTEEN scaled test data is 56.04%[0m


## **4.3: Calculate Training and Test Predictions.**

### **Original**

In [86]:
dt_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_predictions_dataframe)

In [87]:
dt_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_predictions_dataframe)

### **Random Undersampling**

In [88]:
dt_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_undersampled_predictions_dataframe)

In [89]:
dt_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [90]:
dt_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_oversampled_predictions_dataframe)

In [91]:
dt_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [92]:
dt_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_cluster_centroids_predictions_dataframe)

In [93]:
dt_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [94]:
dt_train_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_smote_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_smote_predictions_dataframe)

In [95]:
dt_test_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_smote_predictions_dataframe)

### **SMOTEEN**

In [96]:
dt_train_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_smoteen_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(dt_train_smoteen_predictions_dataframe)

In [97]:
dt_test_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (decision_tree_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(dt_test_smoteen_predictions_dataframe)

# <br> **Section 5: Random Forest Models**

## **5.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [98]:
rf_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_RF_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_model \
    = RandomForestClassifier \
        (criterion = rf_grid_search_model.best_params_['criterion'],
         max_features = rf_grid_search_model.best_params_['max_features'],
         class_weight = rf_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [99]:
rf_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_undersampled_model \
    = RandomForestClassifier \
        (criterion = rf_undersampled_grid_search_model.best_params_['criterion'],
         max_features = rf_undersampled_grid_search_model.best_params_['max_features'],
         class_weight = rf_undersampled_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [100]:
rf_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_oversampled_model \
    = RandomForestClassifier \
        (criterion = rf_oversampled_grid_search_model.best_params_['criterion'],
         max_features = rf_oversampled_grid_search_model.best_params_['max_features'],
         class_weight = rf_oversampled_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [101]:
rf_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_cluster_centroids_model \
    = RandomForestClassifier \
        (criterion = rf_cluster_centroids_grid_search_model.best_params_['criterion'],
         max_features = rf_cluster_centroids_grid_search_model.best_params_['max_features'],
         class_weight = rf_cluster_centroids_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **Smote**

In [102]:
rf_smote_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_smote_model \
    = RandomForestClassifier \
        (criterion = rf_smote_grid_search_model.best_params_['criterion'],
         max_features = rf_smote_grid_search_model.best_params_['max_features'],
         class_weight = rf_smote_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smote_dataframe, y_train_smote_series)

### **Smoteen**

In [103]:
rf_smoteen_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_RF_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

random_forest_smoteen_model \
    = RandomForestClassifier \
        (criterion = rf_smoteen_grid_search_model.best_params_['criterion'],
         max_features = rf_smoteen_grid_search_model.best_params_['max_features'],
         class_weight = rf_smoteen_grid_search_model.best_params_['class_weight'],
         n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

## **5.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [104]:
accuracy_score_train_float \
    = random_forest_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe random forest model score from scaled training data is 99.91%[0m


In [105]:
accuracy_score_test_float \
    = random_forest_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe random forest model score from scaled test data is 92.96%[0m


### **Random Undersampling**

In [106]:
accuracy_score_train_float \
    = random_forest_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe random forest model score from undersampled scaled training data is 99.07%[0m


In [107]:
accuracy_score_test_float \
    = random_forest_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe random forest model score from undersampled scaled test data is 94.18%[0m


### **Random Oversampling**

In [108]:
accuracy_score_train_float \
    = random_forest_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe random forest model score from overersampled scaled training data is 99.91%[0m


In [109]:
accuracy_score_test_float \
    = random_forest_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe random forest model score from oversampled scaled test data is 93.05%[0m


### **Cluster Centroids**

In [110]:
accuracy_score_train_float \
    = random_forest_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe random forest model score from cluster centroids scaled training data is 99.10%[0m


In [111]:
accuracy_score_test_float \
    = random_forest_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe random forest model score from cluster centroids scaled test data is 92.44%[0m


### **SMOTE**

In [112]:
accuracy_score_train_float \
    = random_forest_smote_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe random forest model score from SMOTE scaled training data is 99.91%[0m


In [113]:
accuracy_score_test_float \
    = random_forest_smote_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe random forest model score from SMOTE scaled test data is 93.74%[0m


### **SMOTEEN**

In [114]:
accuracy_score_train_float \
    = random_forest_smoteen_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from SMOTEEN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe random forest model score from SMOTEEN scaled training data is 95.80%[0m


In [115]:
accuracy_score_test_float \
    = random_forest_smoteen_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model score from SMOTEEN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe random forest model score from SMOTEEN scaled test data is 93.92%[0m


## **5.3: Calculate Training and Test Predictions.**

### **Original**

In [116]:
rf_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_predictions_dataframe)

In [117]:
rf_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_predictions_dataframe)

### **Random Undersampling**

In [118]:
rf_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_undersampled_predictions_dataframe)

In [119]:
rf_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [120]:
rf_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_oversampled_predictions_dataframe)

In [121]:
rf_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [122]:
rf_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_cluster_centroids_predictions_dataframe)

In [123]:
rf_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [124]:
rf_train_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_smote_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_smote_predictions_dataframe)

In [125]:
rf_test_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_smote_predictions_dataframe)

### **SMOTEEN**

In [126]:
rf_train_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_smoteen_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(rf_train_smoteen_predictions_dataframe)

In [127]:
rf_test_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (random_forest_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(rf_test_smoteen_predictions_dataframe)

# <br> **Section 6: Support Vector Machine (SVM) Models**

## **6.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [128]:
svm_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_SVM_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_model \
    = SVC \
        (kernel = svm_grid_search_model.best_params_['kernel'],
         gamma = svm_grid_search_model.best_params_['gamma'],
         class_weight = svm_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [129]:
svm_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_undersampled_model \
    = SVC \
        (kernel = svm_undersampled_grid_search_model.best_params_['kernel'],
         gamma = svm_undersampled_grid_search_model.best_params_['gamma'],
         class_weight = svm_undersampled_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_undersampled_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [130]:
svm_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_oversampled_model \
    = SVC \
        (kernel = svm_oversampled_grid_search_model.best_params_['kernel'],
         gamma = svm_oversampled_grid_search_model.best_params_['gamma'],
         class_weight = svm_oversampled_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_oversampled_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [131]:
svm_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_cluster_centroids_model \
    = SVC \
        (kernel = svm_cluster_centroids_grid_search_model.best_params_['kernel'],
         gamma = svm_cluster_centroids_grid_search_model.best_params_['gamma'],
         class_weight = svm_cluster_centroids_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_cluster_centroids_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **Smote**

In [132]:
svm_smote_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_smote_model \
    = SVC \
        (kernel = svm_smote_grid_search_model.best_params_['kernel'],
         gamma = svm_smote_grid_search_model.best_params_['gamma'],
         class_weight = svm_smote_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_smote_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smote_dataframe, y_train_smote_series)

### **Smoteen**

In [133]:
svm_smoteen_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_SVM_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

svm_smoteen_model \
    = SVC \
        (kernel = svm_smoteen_grid_search_model.best_params_['kernel'],
         gamma = svm_smoteen_grid_search_model.best_params_['gamma'],
         class_weight = svm_smoteen_grid_search_model.best_params_['class_weight'],
         decision_function_shape = svm_smoteen_grid_search_model.best_params_['decision_function_shape'],
         probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY,
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit(x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

## **6.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [134]:
accuracy_score_train_float \
    = svm_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe svm model score from scaled training data is 94.81%[0m


In [135]:
accuracy_score_test_float \
    = svm_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe svm model score from scaled test data is 92.53%[0m


### **Random Undersampling**

In [136]:
accuracy_score_train_float \
    = svm_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe svm model score from undersampled scaled training data is 93.68%[0m


In [137]:
accuracy_score_test_float \
    = svm_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe svm model score from undersampled scaled test data is 93.05%[0m


### **Random Oversampling**

In [138]:
accuracy_score_train_float \
    = svm_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe svm model score from overersampled scaled training data is 95.10%[0m


In [139]:
accuracy_score_test_float \
    = svm_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe svm model score from oversampled scaled test data is 92.44%[0m


### **Cluster Centroids**

In [140]:
accuracy_score_train_float \
    = svm_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe svm model score from cluster centroids scaled training data is 89.59%[0m


In [141]:
accuracy_score_test_float \
    = svm_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe svm model score from cluster centroids scaled test data is 91.49%[0m


### **SMOTE**

In [142]:
accuracy_score_train_float \
    = svm_smote_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe svm model score from SMOTE scaled training data is 95.13%[0m


In [143]:
accuracy_score_test_float \
    = svm_smote_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe svm model score from SMOTE scaled test data is 92.35%[0m


### **SMOTEEN**

In [144]:
accuracy_score_train_float \
    = svm_smoteen_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from SMOTEEN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe svm model score from SMOTEEN scaled training data is 93.57%[0m


In [145]:
accuracy_score_test_float \
    = svm_smoteen_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model score from SMOTEEN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe svm model score from SMOTEEN scaled test data is 92.44%[0m


## **6.3: Calculate Training and Test Predictions.**

### **Original**

In [146]:
svm_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_predictions_dataframe)

In [147]:
svm_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_predictions_dataframe)

### **Random Undersampling**

In [148]:
svm_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_undersampled_predictions_dataframe)

In [149]:
svm_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [150]:
svm_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_oversampled_predictions_dataframe)

In [151]:
svm_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [152]:
svm_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_cluster_centroids_predictions_dataframe)

In [153]:
svm_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [154]:
svm_train_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_smote_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_smote_predictions_dataframe)

In [155]:
svm_test_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_smote_predictions_dataframe)

### **SMOTEEN**

In [156]:
svm_train_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_smoteen_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(svm_train_smoteen_predictions_dataframe)

In [157]:
svm_test_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (svm_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(svm_test_smoteen_predictions_dataframe)

# <br> **Section 7: K-Nearest Neighbor (KNN) Models**

## **7.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [158]:
knn_grid_search_model \
    = pickle.load(open(spam_detector_constants.CONSTANT_KNN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_grid_search_model.best_params_['n_neighbors'],
         weights = knn_grid_search_model.best_params_['weights'],
         algorithm = knn_grid_search_model.best_params_['algorithm'],
         p = knn_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [159]:
knn_undersampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_undersampled_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_undersampled_grid_search_model.best_params_['n_neighbors'],
         weights = knn_undersampled_grid_search_model.best_params_['weights'],
         algorithm = knn_undersampled_grid_search_model.best_params_['algorithm'],
         p = knn_undersampled_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [160]:
knn_oversampled_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_oversampled_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_oversampled_grid_search_model.best_params_['n_neighbors'],
         weights = knn_oversampled_grid_search_model.best_params_['weights'],
         algorithm = knn_oversampled_grid_search_model.best_params_['algorithm'],
         p = knn_oversampled_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [161]:
knn_cluster_centroids_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_cluster_centroids_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_cluster_centroids_grid_search_model.best_params_['n_neighbors'],
         weights = knn_cluster_centroids_grid_search_model.best_params_['weights'],
         algorithm = knn_cluster_centroids_grid_search_model.best_params_['algorithm'],
         p = knn_cluster_centroids_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **Smote**

In [162]:
knn_smote_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_smote_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_smote_grid_search_model.best_params_['n_neighbors'],
         weights = knn_smote_grid_search_model.best_params_['weights'],
         algorithm = knn_smote_grid_search_model.best_params_['algorithm'],
         p = knn_smote_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_smote_dataframe, y_train_smote_series)

### **Smoteen**

In [163]:
knn_smoteen_grid_search_model \
    = pickle.load \
        (open(spam_detector_constants.CONSTANT_KNN_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'rb'))

knn_smoteen_model \
    = KNeighborsClassifier \
        (n_neighbors = knn_smoteen_grid_search_model.best_params_['n_neighbors'],
         weights = knn_smoteen_grid_search_model.best_params_['weights'],
         algorithm = knn_smoteen_grid_search_model.best_params_['algorithm'],
         p = knn_smoteen_grid_search_model.best_params_['p'],
         leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE) \
            .fit(x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

## **7.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [164]:
accuracy_score_train_float \
    = knn_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe knn model score from scaled training data is 99.91%[0m


In [165]:
accuracy_score_test_float \
    = knn_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe knn model score from scaled test data is 92.35%[0m


### **Random Undersampling**

In [166]:
accuracy_score_train_float \
    = knn_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe knn model score from undersampled scaled training data is 98.90%[0m


In [167]:
accuracy_score_test_float \
    = knn_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe knn model score from undersampled scaled test data is 91.57%[0m


### **Random Oversampling**

In [168]:
accuracy_score_train_float \
    = knn_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe knn model score from overersampled scaled training data is 99.91%[0m


In [169]:
accuracy_score_test_float \
    = knn_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe knn model score from oversampled scaled test data is 91.92%[0m


### **Cluster Centroids**

In [170]:
accuracy_score_train_float \
    = knn_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe knn model score from cluster centroids scaled training data is 95.04%[0m


In [171]:
accuracy_score_test_float \
    = knn_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe knn model score from cluster centroids scaled test data is 87.58%[0m


### **SMOTE**

In [172]:
accuracy_score_train_float \
    = knn_smote_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe knn model score from SMOTE scaled training data is 99.91%[0m


In [173]:
accuracy_score_test_float \
    = knn_smote_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe knn model score from SMOTE scaled test data is 92.62%[0m


### **SMOTEEN**

In [174]:
accuracy_score_train_float \
    = knn_smoteen_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from SMOTEEN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe knn model score from SMOTEEN scaled training data is 91.65%[0m


In [175]:
accuracy_score_test_float \
    = knn_smoteen_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model score from SMOTEEN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe knn model score from SMOTEEN scaled test data is 89.14%[0m


## **7.3: Calculate Training and Test Predictions.**

### **Original**

In [176]:
knn_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_predictions_dataframe)

In [177]:
knn_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_predictions_dataframe)

### **Random Undersampling**

In [178]:
knn_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_undersampled_predictions_dataframe)

In [179]:
knn_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [180]:
knn_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_oversampled_predictions_dataframe)

In [181]:
knn_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [182]:
knn_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_cluster_centroids_predictions_dataframe)

In [183]:
knn_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [184]:
knn_train_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_smote_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_smote_predictions_dataframe)

In [185]:
knn_test_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_smote_predictions_dataframe)

### **SMOTEEN**

In [186]:
knn_train_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_smoteen_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(knn_train_smoteen_predictions_dataframe)

In [187]:
knn_test_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (knn_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(knn_test_smoteen_predictions_dataframe)

# <br> **Section 8: Gaussian Naive Bayes (GNB) Models**

## **8.1: Fit Models by Using the Scaled Training Data.**

### **Original**

In [188]:
gnb_model \
    = GaussianNB().fit(x_train_scaled_dataframe, y_train_series)

### **Random Undersampling**

In [189]:
gnb_undersampled_model \
    = GaussianNB().fit(x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

### **Random Oversampling**

In [190]:
gnb_oversampled_model \
    = GaussianNB().fit(x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

### **Cluster Centroids**

In [191]:
gnb_cluster_centroids_model \
    = GaussianNB().fit(x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

### **Smote**

In [192]:
gnb_smote_model \
    = GaussianNB().fit(x_train_scaled_smote_dataframe, y_train_smote_series)

### **Smoteen**

In [193]:
gnb_smoteen_model \
    = GaussianNB().fit(x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

## **8.2: Display the Model Scores Using the Scaled Training and Testing data.**

### **Original**

In [194]:
accuracy_score_train_float \
    = gnb_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m'
     + 'The gnb model score from scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe gnb model score from scaled training data is 81.62%[0m


In [195]:
accuracy_score_test_float \
    = gnb_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe gnb model score from scaled test data is 81.15%[0m


### **Random Undersampling**

In [196]:
accuracy_score_train_float \
    = gnb_undersampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from undersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe gnb model score from undersampled scaled training data is 81.71%[0m


In [197]:
accuracy_score_test_float \
    = gnb_undersampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from undersampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe gnb model score from undersampled scaled test data is 81.15%[0m


### **Random Oversampling**

In [198]:
accuracy_score_train_float \
    = gnb_oversampled_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from overersampled scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe gnb model score from overersampled scaled training data is 81.25%[0m


In [199]:
accuracy_score_test_float \
    = gnb_oversampled_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from oversampled scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe gnb model score from oversampled scaled test data is 80.97%[0m


### **Cluster Centroids**

In [200]:
accuracy_score_train_float \
    = gnb_cluster_centroids_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from cluster centroids scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe gnb model score from cluster centroids scaled training data is 81.30%[0m


In [201]:
accuracy_score_test_float \
    = gnb_cluster_centroids_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from cluster centroids scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe gnb model score from cluster centroids scaled test data is 81.23%[0m


### **SMOTE**

In [202]:
accuracy_score_train_float \
    = gnb_smote_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from SMOTE scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe gnb model score from SMOTE scaled training data is 81.91%[0m


In [203]:
accuracy_score_test_float \
    = gnb_smote_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from SMOTE scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe gnb model score from SMOTE scaled test data is 81.49%[0m


### **SMOTEEN**

In [204]:
accuracy_score_train_float \
    = gnb_smoteen_model.score \
        (x_train_scaled_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from SMOTEEN scaled training data is {:.2f}%' \
         .format(accuracy_score_train_float) 
     + '\033[0m')

[1mThe gnb model score from SMOTEEN scaled training data is 83.25%[0m


In [205]:
accuracy_score_test_float \
    = gnb_smoteen_model.score \
        (x_test_scaled_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The gnb model score from SMOTEEN scaled test data is {:.2f}%' \
         .format(accuracy_score_test_float) 
     + '\033[0m')

[1mThe gnb model score from SMOTEEN scaled test data is 84.19%[0m


## **8.3: Calculate Training and Test Predictions.**

### **Original**

In [206]:
gnb_train_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_predictions_dataframe)

In [207]:
gnb_test_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_predictions_dataframe)

### **Random Undersampling**

In [208]:
gnb_train_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_undersampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_undersampled_predictions_dataframe)

In [209]:
gnb_test_undersampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_undersampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_undersampled_predictions_dataframe)

### **Random Oversampling**

In [210]:
gnb_train_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_oversampled_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_oversampled_predictions_dataframe)

In [211]:
gnb_test_oversampled_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_oversampled_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_oversampled_predictions_dataframe)

### **Cluster Centroids**

In [212]:
gnb_train_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_cluster_centroids_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_cluster_centroids_predictions_dataframe)

In [213]:
gnb_test_cluster_centroids_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_cluster_centroids_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_cluster_centroids_predictions_dataframe)

### **SMOTE**

In [214]:
gnb_train_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_smote_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_smote_predictions_dataframe)

In [215]:
gnb_test_smote_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_smote_predictions_dataframe)

### **SMOTEEN**

In [216]:
gnb_train_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_smoteen_model,
         x_train_scaled_dataframe,
         y_train_series)

logx.log_write_object(gnb_train_smoteen_predictions_dataframe)

In [217]:
gnb_test_smoteen_predictions_dataframe \
    = classificationsx.return_predictions_dataframe \
        (gnb_smote_model,
         x_test_scaled_dataframe,
         y_test_series)

logx.log_write_object(gnb_test_smoteen_predictions_dataframe)

# <br> **Section 9: Evaluate Model Performance**

## **9.1: Logistic Regression**

### **Original**

In [218]:
logistic_regression_model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'newton-cg',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [219]:
lr_predictions_nparray = logistic_regression_model.predict(x_test_scaled_dataframe)

lr_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The balanced accuracy score for logistic regression from actual vs. test predictions is {:.2f}%' \
         .format(lr_balanced_accuracy_score_float) 
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression from actual vs. test predictions is 90.88%[0m


In [220]:
lr_accuracy_score_float, \
lr_confusion_matrix_dataframe, \
lr_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         lr_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary \
    = {'logistic_regression': [lr_accuracy_score_float * 100]}

model_performance_ranking_dictionary \
    = {'logistic_regression': lr_accuracy_score_float * 100}

[1mLOGISTIC REGRESSION MODEL
[0m
1) [1mOverall Accuracy Score: [0m89.67%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 666                  33
Actual Not Spam              72                 380

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.95      0.93       699
    not spam       0.92      0.84      0.88       452

    accuracy                           0.91      1151
   macro avg       0.91      0.90      0.90      1151
weighted avg       0.91      0.91      0.91      1151




### **Random Undersampling**

In [221]:
logistic_regression_undersampled_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [222]:
lr_undersampled_predictions_nparray \
    = logistic_regression_undersampled_model.predict(x_test_scaled_dataframe)

lr_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression undersampled from actual vs. test predictions is {:.2f}%' \
         .format(lr_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression undersampled from actual vs. test predictions is 92.44%[0m


In [223]:
lr_undersampled_accuracy_score_float, \
lr_undersampled_confusion_matrix_dataframe, \
lr_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         lr_undersampled_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_undersampled'] \
    = lr_undersampled_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m92.02%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              45                 407

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.94      0.94       699
    not spam       0.91      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **Random Oversampling**

In [224]:
logistic_regression_oversampled_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [225]:
lr_oversampled_predictions_nparray \
    = logistic_regression_oversampled_model.predict(x_test_scaled_dataframe)

lr_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression oversampled from actual vs. test predictions is {:.2f}%' \
         .format(lr_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression oversampled from actual vs. test predictions is 91.57%[0m


In [226]:
lr_oversampled_accuracy_score_float, \
lr_oversampled_confusion_matrix_dataframe, \
lr_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         lr_oversampled_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_oversampled'] \
    = lr_oversampled_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m91.22%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 649                  50
Actual Not Spam              47                 405

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.93      0.93       699
    not spam       0.89      0.90      0.89       452

    accuracy                           0.92      1151
   macro avg       0.91      0.91      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **Cluster Centroids**

In [227]:
logistic_regression_cluster_centroids_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [228]:
lr_cluster_centroids_predictions_nparray \
    = logistic_regression_cluster_centroids_model.predict(x_test_scaled_dataframe)

lr_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(lr_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression cluster centroids from actual vs. test predictions is 91.40%[0m


In [229]:
lr_cluster_centroids_accuracy_score_float, \
lr_cluster_centroids_confusion_matrix_dataframe, \
lr_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         lr_cluster_centroids_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_cluster_centroids'] \
    = lr_cluster_centroids_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m91.12%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 646                  53
Actual Not Spam              46                 406

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.92      0.93       699
    not spam       0.88      0.90      0.89       452

    accuracy                           0.91      1151
   macro avg       0.91      0.91      0.91      1151
weighted avg       0.91      0.91      0.91      1151




### **SMOTE**

In [230]:
logistic_regression_smote_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [231]:
lr_smote_predictions_nparray \
    = logistic_regression_smote_model.predict(x_test_scaled_dataframe)

lr_smote_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_smote_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(lr_smote_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression SMOTE from actual vs. test predictions is 92.09%[0m


In [232]:
lr_smote_accuracy_score_float, \
lr_smote_confusion_matrix_dataframe, \
lr_smote_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         lr_smote_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_smote_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_smote'] \
    = lr_smote_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m91.69%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 654                  45
Actual Not Spam              46                 406

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.94      0.93       699
    not spam       0.90      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **SMOTEEN**

In [233]:
logistic_regression_smoteen_model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 21,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [234]:
lr_smoteen_predictions_nparray \
    = logistic_regression_smoteen_model.predict(x_test_scaled_dataframe)

lr_smoteen_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, lr_smoteen_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for logistic regression SMOTEEN from actual vs. test predictions is {:.2f}%' \
         .format(lr_smoteen_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for logistic regression SMOTEEN from actual vs. test predictions is 92.18%[0m


In [235]:
lr_smoteen_accuracy_score_float, \
lr_smoteen_confusion_matrix_dataframe, \
lr_smoteen_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         lr_smoteen_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL (SMOTEEN)',
         'Spam', 'Not Spam')

model_performance_dictionary['logistic_regression'] \
    .append(lr_smoteen_accuracy_score_float * 100)

model_performance_ranking_dictionary['logistic_regression_smoteen'] \
    = lr_smoteen_accuracy_score_float * 100

[1mLOGISTIC REGRESSION MODEL (SMOTEEN)
[0m
1) [1mOverall Accuracy Score: [0m92.04%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 648                  51
Actual Not Spam              39                 413

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.93      0.94       699
    not spam       0.89      0.91      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




## **9.2: Decision Tree**

### **Original**

In [236]:
decision_tree_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 21,
 'splitter': 'best'}

In [237]:
dt_predictions_nparray \
    = decision_tree_model.predict(x_test_scaled_dataframe)

dt_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The balanced accuracy score for decision tree from actual vs. test predictions is {:.2f}%' \
         .format(dt_balanced_accuracy_score_float) 
     + '\033[0m')

[1mThe balanced accuracy score for decision tree from actual vs. test predictions is 89.66%[0m


In [238]:
dt_accuracy_score_float, \
dt_confusion_matrix_dataframe, \
dt_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         dt_predictions_nparray, 
         'DECISION TREE MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    = [dt_accuracy_score_float * 100]

model_performance_ranking_dictionary['decision_tree'] \
    = dt_accuracy_score_float * 100

[1mDECISION TREE MODEL
[0m
1) [1mOverall Accuracy Score: [0m87.77%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 675                  24
Actual Not Spam              95                 357

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.97      0.92       699
    not spam       0.94      0.79      0.86       452

    accuracy                           0.90      1151
   macro avg       0.91      0.88      0.89      1151
weighted avg       0.90      0.90      0.89      1151




### **Random Undersampling**

In [239]:
decision_tree_undersampled_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 21,
 'splitter': 'best'}

In [240]:
dt_undersampled_predictions_nparray \
    = decision_tree_undersampled_model.predict(x_test_scaled_dataframe)

dt_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree undersampled from actual vs. test predictions is {:.2f}%' \
         .format(dt_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree undersampled from actual vs. test predictions is 89.66%[0m


In [241]:
dt_undersampled_accuracy_score_float, \
dt_undersampled_confusion_matrix_dataframe, \
dt_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         dt_undersampled_predictions_nparray, 
         'DECISION TREE MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_undersampling'] \
    = dt_undersampled_accuracy_score_float * 100

[1mDECISION TREE MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m88.01%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 669                  30
Actual Not Spam              89                 363

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.96      0.92       699
    not spam       0.92      0.80      0.86       452

    accuracy                           0.90      1151
   macro avg       0.90      0.88      0.89      1151
weighted avg       0.90      0.90      0.90      1151




### **Random Oversampling**

In [242]:
decision_tree_oversampled_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 21,
 'splitter': 'best'}

In [243]:
dt_oversampled_predictions_nparray \
    = decision_tree_oversampled_model.predict(x_test_scaled_dataframe)

dt_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree oversampled from actual vs. test predictions is {:.2f}%' \
         .format(dt_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree oversampled from actual vs. test predictions is 89.75%[0m


In [244]:
dt_oversampled_accuracy_score_float, \
dt_oversampled_confusion_matrix_dataframe, \
dt_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         dt_oversampled_predictions_nparray, 
         'DECISION TREE MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_oversampling'] \
    = dt_oversampled_accuracy_score_float * 100

[1mDECISION TREE MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m88.04%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 671                  28
Actual Not Spam              90                 362

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.96      0.92       699
    not spam       0.93      0.80      0.86       452

    accuracy                           0.90      1151
   macro avg       0.90      0.88      0.89      1151
weighted avg       0.90      0.90      0.90      1151




### **Cluster Centroids**

In [245]:
decision_tree_cluster_centroids_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 21,
 'splitter': 'best'}

In [246]:
dt_cluster_centroids_predictions_nparray \
    = decision_tree_cluster_centroids_model.predict(x_test_scaled_dataframe)

dt_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(dt_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree cluster centroids from actual vs. test predictions is 88.97%[0m


In [247]:
dt_cluster_centroids_accuracy_score_float, \
dt_cluster_centroids_confusion_matrix_dataframe, \
dt_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         dt_cluster_centroids_predictions_nparray, 
         'DECISION TREE MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_cluster_centroids'] \
    = dt_cluster_centroids_accuracy_score_float * 100

[1mDECISION TREE MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m87.28%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 665                  34
Actual Not Spam              93                 359

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.88      0.95      0.91       699
    not spam       0.91      0.79      0.85       452

    accuracy                           0.89      1151
   macro avg       0.90      0.87      0.88      1151
weighted avg       0.89      0.89      0.89      1151




### **SMOTE**

In [248]:
decision_tree_smote_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 21,
 'splitter': 'best'}

In [249]:
dt_smote_predictions_nparray \
    = decision_tree_smote_model.predict(x_test_scaled_dataframe)

dt_smote_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_smote_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(dt_smote_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree SMOTE from actual vs. test predictions is 90.96%[0m


In [250]:
dt_smote_accuracy_score_float, \
dt_smote_confusion_matrix_dataframe, \
dt_smote_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         dt_smote_predictions_nparray, 
         'DECISION TREE MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_smote_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_smote'] \
    = dt_smote_accuracy_score_float * 100

[1mDECISION TREE MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m89.67%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 669                  30
Actual Not Spam              74                 378

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.96      0.93       699
    not spam       0.93      0.84      0.88       452

    accuracy                           0.91      1151
   macro avg       0.91      0.90      0.90      1151
weighted avg       0.91      0.91      0.91      1151




### **SMOTEEN**

In [251]:
decision_tree_smoteen_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 21,
 'splitter': 'best'}

In [252]:
dt_smoteen_predictions_nparray \
    = decision_tree_smoteen_model.predict(x_test_scaled_dataframe)

dt_smoteen_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, dt_smoteen_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for decision tree SMOTEEN from actual vs. test predictions is {:.2f}%' \
         .format(dt_smoteen_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for decision tree SMOTEEN from actual vs. test predictions is 56.04%[0m


In [253]:
dt_smoteen_accuracy_score_float, \
dt_smoteen_confusion_matrix_dataframe, \
dt_smoteen_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         dt_smoteen_predictions_nparray, 
         'DECISION TREE MODEL (SMOTEEN)',
         'Spam', 'Not Spam')

model_performance_dictionary['decision_tree'] \
    .append(dt_smoteen_accuracy_score_float * 100)

model_performance_ranking_dictionary['decision_tree_smoteen'] \
    = dt_smoteen_accuracy_score_float * 100

[1mDECISION TREE MODEL (SMOTEEN)
[0m
1) [1mOverall Accuracy Score: [0m62.4%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 229                 470
Actual Not Spam              36                 416

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.86      0.33      0.48       699
    not spam       0.47      0.92      0.62       452

    accuracy                           0.56      1151
   macro avg       0.67      0.62      0.55      1151
weighted avg       0.71      0.56      0.53      1151




## **9.3: Random Forest**

### **Original**

In [254]:
random_forest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [255]:
rt_predictions_nparray \
    = random_forest_model.predict(x_test_scaled_dataframe)

rf_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rt_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest from actual vs. test predictions is {:.2f}%' \
         .format(rf_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest from actual vs. test predictions is 92.96%[0m


In [256]:
rf_accuracy_score_float, rf_confusion_matrix_dataframe, rf_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         rt_predictions_nparray, 
         'RANDOM FOREST MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    = [rf_accuracy_score_float * 100]

model_performance_ranking_dictionary['random_forest'] \
    = rf_accuracy_score_float * 100

[1mRANDOM FOREST MODEL
[0m
1) [1mOverall Accuracy Score: [0m91.35%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 691                   8
Actual Not Spam              73                 379

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.99      0.94       699
    not spam       0.98      0.84      0.90       452

    accuracy                           0.93      1151
   macro avg       0.94      0.91      0.92      1151
weighted avg       0.93      0.93      0.93      1151




### **Random Undersampling**

In [257]:
random_forest_undersampled_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [258]:
rt_undersampled_predictions_nparray \
    = random_forest_undersampled_model.predict(x_test_scaled_dataframe)

rf_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rt_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest undersampled from actual vs. test predictions is {:.2f}%' \
         .format(rf_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest undersampled from actual vs. test predictions is 94.18%[0m


In [259]:
rf_undersampled_accuracy_score_float, \
rf_undersampled_confusion_matrix_dataframe, \
rf_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         rt_undersampled_predictions_nparray, 
         'RANDOM FOREST MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_undersampled'] \
    = rf_undersampled_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m93.02%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 688                  11
Actual Not Spam              56                 396

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.98      0.95       699
    not spam       0.97      0.88      0.92       452

    accuracy                           0.94      1151
   macro avg       0.95      0.93      0.94      1151
weighted avg       0.94      0.94      0.94      1151




### **Random Oversampling**

In [260]:
random_forest_oversampled_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [261]:
rt_oversampled_predictions_nparray \
    = random_forest_oversampled_model.predict(x_test_scaled_dataframe)

rf_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rt_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest oversampled from actual vs. test predictions is {:.2f}%' \
         .format(rf_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest oversampled from actual vs. test predictions is 93.05%[0m


In [262]:
rf_oversampled_accuracy_score_float, \
rf_oversampled_confusion_matrix_dataframe, \
rf_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         rt_oversampled_predictions_nparray, 
         'RANDOM FOREST MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_oversampled'] \
    = rf_oversampled_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m91.62%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 687                  12
Actual Not Spam              68                 384

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.98      0.94       699
    not spam       0.97      0.85      0.91       452

    accuracy                           0.93      1151
   macro avg       0.94      0.92      0.93      1151
weighted avg       0.93      0.93      0.93      1151




### **Cluster Centroids**

In [263]:
random_forest_cluster_centroids_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [264]:
rf_cluster_centroids_predictions_nparray \
    = random_forest_cluster_centroids_model.predict(x_test_scaled_dataframe)

rf_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rf_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(rf_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest cluster centroids from actual vs. test predictions is 92.44%[0m


In [265]:
rf_cluster_centroids_accuracy_score_float, \
rf_cluster_centroids_confusion_matrix_dataframe, \
rf_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         rf_cluster_centroids_predictions_nparray, 
         'RANDOM FOREST MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_cluster_centroids'] \
    = rf_cluster_centroids_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m90.69%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 691                   8
Actual Not Spam              79                 373

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.99      0.94       699
    not spam       0.98      0.83      0.90       452

    accuracy                           0.92      1151
   macro avg       0.94      0.91      0.92      1151
weighted avg       0.93      0.92      0.92      1151




### **SMOTE**

In [266]:
random_forest_smote_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [267]:
rf_smote_predictions_nparray \
    = random_forest_smote_model.predict(x_test_scaled_dataframe)

rf_smote_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rf_smote_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(rf_smote_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest SMOTE from actual vs. test predictions is 93.74%[0m


In [268]:
rf_smote_accuracy_score_float, \
rf_smote_confusion_matrix_dataframe, \
rf_smote_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         rf_smote_predictions_nparray, 
         'RANDOM FOREST MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_smote_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_smote'] \
    = rf_smote_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m92.5%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 687                  12
Actual Not Spam              60                 392

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.98      0.95       699
    not spam       0.97      0.87      0.92       452

    accuracy                           0.94      1151
   macro avg       0.94      0.93      0.93      1151
weighted avg       0.94      0.94      0.94      1151




### **SMOTEEN**

In [269]:
random_forest_smoteen_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 21,
 'verbose': 0,
 'warm_start': False}

In [270]:
rf_smoteen_predictions_nparray \
    = random_forest_smoteen_model.predict(x_test_scaled_dataframe)

rf_smoteen_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, rf_smoteen_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for random forest SMOTEEN from actual vs. test predictions is {:.2f}%' \
         .format(rf_smoteen_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for random forest SMOTEEN from actual vs. test predictions is 93.92%[0m


In [271]:
rf_smoteen_accuracy_score_float, \
rf_smoteen_confusion_matrix_dataframe, \
rf_smoteen_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         rf_smoteen_predictions_nparray, 
         'RANDOM FOREST MODEL (SMOTEEN)',
         'Spam', 'Not Spam')

model_performance_dictionary['random_forest'] \
    .append(rf_smoteen_accuracy_score_float * 100)

model_performance_ranking_dictionary['random_forest_smoteen'] \
    = rf_smoteen_accuracy_score_float * 100

[1mRANDOM FOREST MODEL (SMOTEEN)
[0m
1) [1mOverall Accuracy Score: [0m92.92%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 682                  17
Actual Not Spam              53                 399

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.98      0.95       699
    not spam       0.96      0.88      0.92       452

    accuracy                           0.94      1151
   macro avg       0.94      0.93      0.94      1151
weighted avg       0.94      0.94      0.94      1151




## **9.4: Support Vector Machine (SVM)**

### **Original**

In [272]:
svm_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [273]:
svm_predictions_nparray \
    = svm_model.predict(x_test_scaled_dataframe)

svm_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for SVM from actual vs. test predictions is {:.2f}%' \
         .format(svm_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for SVM from actual vs. test predictions is 92.53%[0m


In [274]:
svm_accuracy_score_float, svm_confusion_matrix_dataframe, svm_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_predictions_nparray, 
         'SVM MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    = [svm_accuracy_score_float * 100]

model_performance_ranking_dictionary['svm'] \
    = svm_accuracy_score_float * 100

[1mSVM MODEL
[0m
1) [1mOverall Accuracy Score: [0m91.7%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 668                  31
Actual Not Spam              55                 397

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.96      0.94       699
    not spam       0.93      0.88      0.90       452

    accuracy                           0.93      1151
   macro avg       0.93      0.92      0.92      1151
weighted avg       0.93      0.93      0.92      1151




### **Random Undersampling**

In [275]:
svm_undersampled_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [276]:
svm_undersampled_predictions_nparray \
    = svm_undersampled_model.predict(x_test_scaled_dataframe)

svm_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The balanced accuracy score for svm undersampled from actual vs. test predictions is {:.2f}%' \
         .format(svm_undersampled_balanced_accuracy_score_float) 
     + '\033[0m')

[1mThe balanced accuracy score for svm undersampled from actual vs. test predictions is 93.05%[0m


In [277]:
svm_undersampled_accuracy_score_float, \
svm_undersampled_confusion_matrix_dataframe, \
svm_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_undersampled_predictions_nparray, 
         'SVM MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_undersampled'] \
    = svm_undersampled_accuracy_score_float * 100

[1mSVM MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m92.79%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              38                 414

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.94      0.94       699
    not spam       0.91      0.92      0.91       452

    accuracy                           0.93      1151
   macro avg       0.93      0.93      0.93      1151
weighted avg       0.93      0.93      0.93      1151




### **Random Oversampling**

In [278]:
svm_oversampled_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [279]:
svm_oversampled_predictions_nparray \
    = svm_oversampled_model.predict(x_test_scaled_dataframe)

svm_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The balanced accuracy score for svm oversampled from actual vs. test predictions is {:.2f}%' \
         .format(svm_oversampled_balanced_accuracy_score_float) 
     + '\033[0m')

[1mThe balanced accuracy score for svm oversampled from actual vs. test predictions is 92.44%[0m


In [280]:
svm_oversampled_accuracy_score_float, \
svm_oversampled_confusion_matrix_dataframe, \
svm_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_oversampled_predictions_nparray, 
         'SVM MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_oversampled'] \
    = svm_oversampled_accuracy_score_float * 100

[1mSVM MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m91.86%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 661                  38
Actual Not Spam              49                 403

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.95      0.94       699
    not spam       0.91      0.89      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **Cluster Centroids**

In [281]:
svm_cluster_centroids_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [282]:
svm_cluster_centroids_predictions_nparray \
    = svm_cluster_centroids_model.predict(x_test_scaled_dataframe)

svm_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for svm cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(svm_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for svm cluster centroids from actual vs. test predictions is 91.49%[0m


In [283]:
svm_cluster_centroids_accuracy_score_float, \
svm_cluster_centroids_confusion_matrix_dataframe, \
svm_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_cluster_centroids_predictions_nparray, 
         'SVM MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_cluster_centroids'] \
    = svm_cluster_centroids_accuracy_score_float * 100

[1mSVM MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m91.39%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 642                  57
Actual Not Spam              41                 411

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.92      0.93       699
    not spam       0.88      0.91      0.89       452

    accuracy                           0.91      1151
   macro avg       0.91      0.91      0.91      1151
weighted avg       0.92      0.91      0.92      1151




### **SMOTE**

In [284]:
svm_smote_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [285]:
svm_smote_predictions_nparray \
    = svm_smote_model.predict(x_test_scaled_dataframe)

svm_smote_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_smote_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for svm SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(svm_smote_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for svm SMOTE from actual vs. test predictions is 92.35%[0m


In [286]:
svm_smote_accuracy_score_float, \
svm_smote_confusion_matrix_dataframe, \
svm_smote_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_smote_predictions_nparray, 
         'SVM MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_smote_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_smote'] \
    = svm_smote_accuracy_score_float * 100

[1mSVM MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m91.75%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 661                  38
Actual Not Spam              50                 402

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.95      0.94       699
    not spam       0.91      0.89      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **SMOTEEN**

In [287]:
svm_smoteen_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovo',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 21,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [288]:
svm_smoteen_predictions_nparray \
    = svm_smoteen_model.predict(x_test_scaled_dataframe)

svm_smoteen_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_smoteen_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for svm SMOTEEN from actual vs. test predictions is {:.2f}%' \
         .format(svm_smoteen_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for svm SMOTEEN from actual vs. test predictions is 92.44%[0m


In [289]:
svm_smoteen_accuracy_score_float, \
svm_smoteen_confusion_matrix_dataframe, \
svm_smoteen_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_smoteen_predictions_nparray, 
         'SVM MODEL (SMOTEEN)',
         'Spam', 'Not Spam')

model_performance_dictionary['svm'] \
    .append(svm_smoteen_accuracy_score_float * 100)

model_performance_ranking_dictionary['svm_smoteen'] \
    = svm_smoteen_accuracy_score_float * 100

[1mSVM MODEL (SMOTEEN)
[0m
1) [1mOverall Accuracy Score: [0m92.1%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 655                  44
Actual Not Spam              43                 409

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.94      0.94       699
    not spam       0.90      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




## **9.5: K-Nearest Neighbor (KNN)**

### **Original**

In [290]:
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'distance'}

In [291]:
knn_predictions_nparray = knn_model.predict(x_test_scaled_dataframe)

knn_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for KNN from actual vs. test predictions is {:.2f}%' \
         .format(knn_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for KNN from actual vs. test predictions is 92.35%[0m


In [292]:
knn_accuracy_score_float, \
knn_confusion_matrix_dataframe, \
knn_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_predictions_nparray, 
         'KNN MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    = [knn_accuracy_score_float * 100]

model_performance_ranking_dictionary['knn'] \
    = knn_accuracy_score_float * 100

[1mKNN MODEL
[0m
1) [1mOverall Accuracy Score: [0m91.91%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              46                 406

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.94      0.94       699
    not spam       0.91      0.90      0.90       452

    accuracy                           0.92      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.92      0.92      0.92      1151




### **Random Undersampling**

In [293]:
knn_undersampled_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 12,
 'p': 1,
 'weights': 'distance'}

In [294]:
knn_undersampled_predictions_nparray \
    = knn_undersampled_model.predict(x_test_scaled_dataframe)

knn_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn undersampled from actual vs. test predictions is {:.2f}%' \
         .format(knn_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn undersampled from actual vs. test predictions is 91.57%[0m


In [295]:
knn_undersampled_accuracy_score_float, \
knn_undersampled_confusion_matrix_dataframe, \
knn_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_undersampled_predictions_nparray, 
         'KNN MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_undersampled'] \
    = knn_undersampled_accuracy_score_float * 100

[1mKNN MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m90.64%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 664                  35
Actual Not Spam              62                 390

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.95      0.93       699
    not spam       0.92      0.86      0.89       452

    accuracy                           0.92      1151
   macro avg       0.92      0.91      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **Random Oversampling**

In [296]:
knn_oversampled_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 19,
 'p': 1,
 'weights': 'distance'}

In [297]:
knn_oversampled_predictions_nparray \
    = knn_oversampled_model.predict(x_test_scaled_dataframe)

knn_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' 
     + 'The balanced accuracy score for knn oversampled from actual vs. test predictions is {:.2f}%' \
         .format(knn_oversampled_balanced_accuracy_score_float) 
     + '\033[0m')

[1mThe balanced accuracy score for knn oversampled from actual vs. test predictions is 91.92%[0m


In [298]:
knn_oversampled_accuracy_score_float, \
knn_oversampled_confusion_matrix_dataframe, \
knn_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_oversampled_predictions_nparray, 
         'KNN MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_oversampled'] \
    = knn_oversampled_accuracy_score_float * 100

[1mKNN MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m90.89%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 669                  30
Actual Not Spam              63                 389

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.96      0.94       699
    not spam       0.93      0.86      0.89       452

    accuracy                           0.92      1151
   macro avg       0.92      0.91      0.91      1151
weighted avg       0.92      0.92      0.92      1151




### **Cluster Centroids**

In [299]:
knn_cluster_centroids_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 6,
 'p': 2,
 'weights': 'distance'}

In [300]:
knn_cluster_centroids_predictions_nparray \
    = knn_cluster_centroids_model.predict(x_test_scaled_dataframe)

knn_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(knn_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn cluster centroids from actual vs. test predictions is 87.58%[0m


In [301]:
knn_cluster_centroids_accuracy_score_float, \
knn_cluster_centroids_confusion_matrix_dataframe, \
knn_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_cluster_centroids_predictions_nparray, 
         'KNN MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_cluster_centroids'] \
    = knn_cluster_centroids_accuracy_score_float * 100

[1mKNN MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m88.68%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 584                 115
Actual Not Spam              28                 424

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.84      0.89       699
    not spam       0.79      0.94      0.86       452

    accuracy                           0.88      1151
   macro avg       0.87      0.89      0.87      1151
weighted avg       0.89      0.88      0.88      1151




### **SMOTE**

In [302]:
knn_smote_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 12,
 'p': 1,
 'weights': 'distance'}

In [303]:
knn_smote_predictions_nparray \
    = knn_smote_model.predict(x_test_scaled_dataframe)

knn_smote_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_smote_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(knn_smote_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn SMOTE from actual vs. test predictions is 92.62%[0m


In [304]:
knn_smote_accuracy_score_float, \
knn_smote_confusion_matrix_dataframe, \
knn_smote_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_smote_predictions_nparray, 
         'KNN MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_smote_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_smote'] \
    = knn_smote_accuracy_score_float * 100

[1mKNN MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m92.0%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 663                  36
Actual Not Spam              49                 403

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.95      0.94       699
    not spam       0.92      0.89      0.90       452

    accuracy                           0.93      1151
   macro avg       0.92      0.92      0.92      1151
weighted avg       0.93      0.93      0.93      1151




### **SMOTEEN**

In [305]:
knn_smoteen_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 2,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 6,
 'p': 2,
 'weights': 'distance'}

In [306]:
knn_smoteen_predictions_nparray \
    = knn_smoteen_model.predict(x_test_scaled_dataframe)

knn_smoteen_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_smoteen_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn SMOTEEN from actual vs. test predictions is {:.2f}%' \
         .format(knn_smoteen_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn SMOTEEN from actual vs. test predictions is 89.14%[0m


In [307]:
knn_smoteen_accuracy_score_float, \
knn_smoteen_confusion_matrix_dataframe, \
knn_smoteen_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_smoteen_predictions_nparray, 
         'KNN MODEL (SMOTEEN)',
         'Spam', 'Not Spam')

model_performance_dictionary['knn'] \
    .append(knn_smoteen_accuracy_score_float * 100)

model_performance_ranking_dictionary['knn_smoteen'] \
    = knn_smoteen_accuracy_score_float * 100

[1mKNN MODEL (SMOTEEN)
[0m
1) [1mOverall Accuracy Score: [0m89.5%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 614                  85
Actual Not Spam              40                 412

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.88      0.91       699
    not spam       0.83      0.91      0.87       452

    accuracy                           0.89      1151
   macro avg       0.88      0.89      0.89      1151
weighted avg       0.90      0.89      0.89      1151




## **9.6: Gaussian Naive Bayes (GNB)**

### **Original**

In [308]:
gnb_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [309]:
gnb_predictions_nparray = gnb_model.predict(x_test_scaled_dataframe)

gnb_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for GNB from actual vs. test predictions is {:.2f}%' \
         .format(gnb_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for GNB from actual vs. test predictions is 81.15%[0m


In [310]:
gnb_accuracy_score_float, \
gnb_confusion_matrix_dataframe, \
gnb_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         gnb_predictions_nparray, 
         'GNB MODEL',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    = [gnb_accuracy_score_float * 100]

model_performance_ranking_dictionary['gnb'] \
    = gnb_accuracy_score_float * 100

[1mGNB MODEL
[0m
1) [1mOverall Accuracy Score: [0m83.66%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 503                 196
Actual Not Spam              21                 431

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.72      0.82       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **Random Undersampling**

In [311]:
gnb_undersampled_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [312]:
gnb_undersampled_predictions_nparray \
    = gnb_undersampled_model.predict(x_test_scaled_dataframe)

gnb_undersampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_undersampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb undersampled from actual vs. test predictions is {:.2f}%' \
         .format(gnb_undersampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb undersampled from actual vs. test predictions is 81.15%[0m


In [313]:
gnb_undersampled_accuracy_score_float, \
gnb_undersampled_confusion_matrix_dataframe, \
gnb_undersampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         gnb_undersampled_predictions_nparray, 
         'GNB MODEL (Undersampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_undersampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_undersampled'] \
    = gnb_undersampled_accuracy_score_float * 100

[1mGNB MODEL (Undersampled)
[0m
1) [1mOverall Accuracy Score: [0m83.62%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 504                 195
Actual Not Spam              22                 430

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.72      0.82       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **Random Oversampling**

In [314]:
gnb_oversampled_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [315]:
gnb_oversampled_predictions_nparray \
    = gnb_oversampled_model.predict(x_test_scaled_dataframe)

gnb_oversampled_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_oversampled_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb oversampled from actual vs. test predictions is {:.2f}%' \
         .format(gnb_oversampled_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb oversampled from actual vs. test predictions is 80.97%[0m


In [316]:
gnb_oversampled_accuracy_score_float, \
gnb_oversampled_confusion_matrix_dataframe, \
gnb_oversampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         gnb_oversampled_predictions_nparray, 
         'GNB MODEL (Oversampled)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_oversampled_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_oversampled'] \
    = gnb_oversampled_accuracy_score_float * 100

[1mGNB MODEL (Oversampled)
[0m
1) [1mOverall Accuracy Score: [0m83.4%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 504                 195
Actual Not Spam              24                 428

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.72      0.82       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.83      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **Cluster Centroids**

In [317]:
gnb_cluster_centroids_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [318]:
gnb_cluster_centroids_predictions_nparray \
    = gnb_cluster_centroids_model.predict(x_test_scaled_dataframe)

gnb_cluster_centroids_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_cluster_centroids_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for knn cluster centroids from actual vs. test predictions is {:.2f}%' \
         .format(gnb_cluster_centroids_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for knn cluster centroids from actual vs. test predictions is 81.23%[0m


In [319]:
gnb_cluster_centroids_accuracy_score_float, \
gnb_cluster_centroids_confusion_matrix_dataframe, \
gnb_cluster_centroids_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         gnb_cluster_centroids_predictions_nparray, 
         'GNB MODEL (Cluster Centroids)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_cluster_centroids_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_cluster_centroids'] \
    = gnb_cluster_centroids_accuracy_score_float * 100

[1mGNB MODEL (Cluster Centroids)
[0m
1) [1mOverall Accuracy Score: [0m83.77%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 503                 196
Actual Not Spam              20                 432

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.72      0.82       699
    not spam       0.69      0.96      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.81      1151




### **SMOTE**

In [320]:
gnb_smote_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [321]:
gnb_smote_predictions_nparray \
    = gnb_smote_model.predict(x_test_scaled_dataframe)

gnb_smote_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_smote_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb SMOTE from actual vs. test predictions is {:.2f}%' \
         .format(gnb_smote_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb SMOTE from actual vs. test predictions is 81.49%[0m


In [322]:
gnb_smote_accuracy_score_float, \
gnb_smote_confusion_matrix_dataframe, \
gnb_smote_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         gnb_smote_predictions_nparray, 
         'GNB MODEL (SMOTE)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_smote_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_smote'] \
    = gnb_smote_accuracy_score_float * 100

[1mGNB MODEL (SMOTE)
[0m
1) [1mOverall Accuracy Score: [0m83.86%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 509                 190
Actual Not Spam              23                 429

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.96      0.73      0.83       699
    not spam       0.69      0.95      0.80       452

    accuracy                           0.81      1151
   macro avg       0.82      0.84      0.81      1151
weighted avg       0.85      0.81      0.82      1151




### **SMOTEEN**

In [323]:
gnb_smoteen_model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [324]:
gnb_smoteen_predictions_nparray \
    = gnb_smoteen_model.predict(x_test_scaled_dataframe)

gnb_smoteen_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, gnb_smoteen_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' \
     + 'The balanced accuracy score for gnb SMOTEEN from actual vs. test predictions is {:.2f}%' \
         .format(gnb_smoteen_balanced_accuracy_score_float) \
     + '\033[0m')

[1mThe balanced accuracy score for gnb SMOTEEN from actual vs. test predictions is 84.19%[0m


In [325]:
gnb_smoteen_accuracy_score_float, \
gnb_smoteen_confusion_matrix_dataframe, \
gnb_smoteen_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         gnb_smoteen_predictions_nparray, 
         'GNB MODEL (SMOTEEN)',
         'Spam', 'Not Spam')

model_performance_dictionary['gnb'] \
    .append(gnb_smoteen_accuracy_score_float * 100)

model_performance_ranking_dictionary['gnb_smoteen'] \
    = gnb_smoteen_accuracy_score_float * 100

[1mGNB MODEL (SMOTEEN)
[0m
1) [1mOverall Accuracy Score: [0m85.81%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 547                 152
Actual Not Spam              30                 422

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.95      0.78      0.86       699
    not spam       0.74      0.93      0.82       452

    accuracy                           0.84      1151
   macro avg       0.84      0.86      0.84      1151
weighted avg       0.86      0.84      0.84      1151




## **9.7: Model Performance Results**

### **Performance Matrix**

In [326]:
index_string_list \
    = ['original', 'undersampled', 'oversampled', 'cluster centroids', 'smote', 'smoteen']

model_performace_dataframe \
    = pd.DataFrame.from_dict \
        (model_performance_dictionary, orient = 'index').transpose()

model_performace_dataframe['resampling_method'] = index_string_list

model_performace_dataframe.set_index('resampling_method', drop = True, inplace = True)

logx.log_write_object(model_performace_dataframe)

In [327]:
pandas_processx.return_formatted_table \
    (model_performace_dataframe, 
     'Table 9.7.1: Model Performance Matrix',
     line_count_integer = 36,
     hide_index_boolean = False) \
        .format('{:,.1f}%')

Unnamed: 0_level_0,logistic_regression,decision_tree,random_forest,svm,knn,gnb
resampling_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
original,89.7%,87.8%,91.4%,91.7%,91.9%,83.7%
undersampled,92.0%,88.0%,93.0%,92.8%,90.6%,83.6%
oversampled,91.2%,88.0%,91.6%,91.9%,90.9%,83.4%
cluster centroids,91.1%,87.3%,90.7%,91.4%,88.7%,83.8%
smote,91.7%,89.7%,92.5%,91.8%,92.0%,83.9%
smoteen,92.0%,62.4%,92.9%,92.1%,89.5%,85.8%


### **Performance Ranking**

In [328]:
temp_dictionary \
    = dict(sorted(model_performance_ranking_dictionary.items(), key = lambda x: x[1], reverse = True))

model_performace_rankings_dataframe \
    = pd.DataFrame.from_dict \
        (temp_dictionary, orient = 'index', columns = ['accuracy'])

model_performace_rankings_dataframe.index.name = 'model'

logx.log_write_object(model_performace_rankings_dataframe)

In [329]:
pandas_processx.return_formatted_table \
    (model_performace_rankings_dataframe, 
     'Table 9.7.2: Model Performance Rankings',
     line_count_integer = 36,
     hide_index_boolean = False) \
        .format({'accuracy': '{:,.1f}%'})

Unnamed: 0_level_0,accuracy
model,Unnamed: 1_level_1
random_forest_undersampled,93.0%
random_forest_smoteen,92.9%
svm_undersampled,92.8%
random_forest_smote,92.5%
svm_smoteen,92.1%
logistic_regression_smoteen,92.0%
logistic_regression_undersampled,92.0%
knn_smote,92.0%
knn,91.9%
svm_oversampled,91.9%


# <br> **Section 10: Save Models To Files**

## **10.1: Logistic Regression**

### **Original**

In [330]:
pickle.dump \
    (logistic_regression_model, 
     open(spam_detector_constants.CONSTANT_LR_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [331]:
pickle.dump \
    (logistic_regression_undersampled_model, 
     open(spam_detector_constants.CONSTANT_LR_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [332]:
pickle.dump \
    (logistic_regression_oversampled_model, 
     open(spam_detector_constants.CONSTANT_LR_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [333]:
pickle.dump \
    (logistic_regression_cluster_centroids_model, 
     open(spam_detector_constants.CONSTANT_LR_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [334]:
pickle.dump \
    (logistic_regression_smote_model, 
     open(spam_detector_constants.CONSTANT_LR_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [335]:
pickle.dump \
    (logistic_regression_smoteen_model, 
     open(spam_detector_constants.CONSTANT_LR_SMOTEEN_MODEL_FILE_PATH, 'wb'))

## **10.2: Decision Tree**

### **Original**

In [336]:
pickle.dump \
    (decision_tree_model, 
     open(spam_detector_constants.CONSTANT_DT_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [337]:
pickle.dump \
    (decision_tree_undersampled_model, 
     open(spam_detector_constants.CONSTANT_DT_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [338]:
pickle.dump \
    (decision_tree_oversampled_model, 
     open(spam_detector_constants.CONSTANT_DT_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [339]:
pickle.dump \
    (decision_tree_cluster_centroids_model, 
     open(spam_detector_constants.CONSTANT_DT_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [340]:
pickle.dump \
    (decision_tree_smote_model, 
     open(spam_detector_constants.CONSTANT_DT_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [341]:
pickle.dump \
    (decision_tree_smoteen_model, 
     open(spam_detector_constants.CONSTANT_DT_SMOTEEN_MODEL_FILE_PATH, 'wb'))

## **10.3: Random Forest**

### **Original**

In [342]:
pickle.dump \
    (random_forest_model, 
     open(spam_detector_constants.CONSTANT_RF_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [343]:
pickle.dump \
    (random_forest_undersampled_model, 
     open(spam_detector_constants.CONSTANT_RF_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [344]:
pickle.dump \
    (random_forest_oversampled_model, 
     open(spam_detector_constants.CONSTANT_RF_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [345]:
pickle.dump \
    (random_forest_cluster_centroids_model, 
     open(spam_detector_constants.CONSTANT_RF_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [346]:
pickle.dump \
    (random_forest_smote_model, 
     open(spam_detector_constants.CONSTANT_RF_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [347]:
pickle.dump \
    (random_forest_smoteen_model, 
     open(spam_detector_constants.CONSTANT_RF_SMOTEEN_MODEL_FILE_PATH, 'wb'))

## **10.4: Support Vector Machine (SVM)**

### **Original**

In [348]:
pickle.dump \
    (svm_model, 
     open(spam_detector_constants.CONSTANT_SVM_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [349]:
pickle.dump \
    (svm_undersampled_model, 
     open(spam_detector_constants.CONSTANT_SVM_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [350]:
pickle.dump \
    (svm_oversampled_model, 
     open(spam_detector_constants.CONSTANT_SVM_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [351]:
pickle.dump \
    (svm_cluster_centroids_model, 
     open(spam_detector_constants.CONSTANT_SVM_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [352]:
pickle.dump \
    (svm_smote_model, 
     open(spam_detector_constants.CONSTANT_SVM_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [353]:
pickle.dump \
    (svm_smoteen_model, 
     open(spam_detector_constants.CONSTANT_SVM_SMOTEEN_MODEL_FILE_PATH, 'wb'))

## **10.5: K-Nearest Neighbor (KNN)**

### **Original**

In [354]:
pickle.dump \
    (knn_model, 
     open(spam_detector_constants.CONSTANT_KNN_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [355]:
pickle.dump \
    (knn_undersampled_model, 
     open(spam_detector_constants.CONSTANT_KNN_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [356]:
pickle.dump \
    (knn_oversampled_model, 
     open(spam_detector_constants.CONSTANT_KNN_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [357]:
pickle.dump \
    (knn_cluster_centroids_model, 
     open(spam_detector_constants.CONSTANT_KNN_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [358]:
pickle.dump \
    (knn_smote_model, 
     open(spam_detector_constants.CONSTANT_KNN_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [359]:
pickle.dump \
    (knn_smoteen_model, 
     open(spam_detector_constants.CONSTANT_KNN_SMOTEEN_MODEL_FILE_PATH, 'wb'))

## **10.6: Gaussian Naive Bayes (GNB)**

### **Original**

In [360]:
pickle.dump \
    (gnb_model, 
     open(spam_detector_constants.CONSTANT_GNB_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [361]:
pickle.dump \
    (gnb_undersampled_model, 
     open(spam_detector_constants.CONSTANT_GNB_UNDERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [362]:
pickle.dump \
    (gnb_oversampled_model, 
     open(spam_detector_constants.CONSTANT_GNB_OVERSAMPLED_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [363]:
pickle.dump \
    (gnb_cluster_centroids_model, 
     open(spam_detector_constants.CONSTANT_GNB_CENTROIDS_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [364]:
pickle.dump \
    (gnb_smote_model, 
     open(spam_detector_constants.CONSTANT_GNB_SMOTE_MODEL_FILE_PATH, 'wb'))

### **SMOTEEN**

In [365]:
pickle.dump \
    (gnb_smoteen_model, 
     open(spam_detector_constants.CONSTANT_GNB_SMOTEEN_MODEL_FILE_PATH, 'wb'))

In [366]:
# logx.end_program()