In [1]:
#*******************************************************************************************
 #
 #  File Name:  spam_detector_optimization.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, spam_detector_optimization.ipynb, reads 
 #      a csv file, spam-data.csv, and uses Python and scikit-learn module to find 
 #      the best hyperparameters for supervised learning models (binary classification) 
 #      that detects spam in e-mails. Here is a list of the models:
 #
 #      logistic regression
 #      decision tree
 #      random forest
 #      support vector machine
 #      k-nearest neighbor
 #      gaussian naive bayes
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/22/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

import classificationsx
import logx
import pandas_processx
import spam_detector_constants

import copy
import pickle

import numpy as np
import pandas as pd

from IPython.display import clear_output

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
CONSTANT_LOCAL_FILE_NAME = 'spam_detector_optimization.ipynb'

CONSTANT_SPAM_DATA_CSV_FILE_PATH = 'https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv'


logx.set_log_mode(False)

logx.set_image_mode(False)


logx.begin_program('spam_detector_optimization')

Program execution begins...



# <br> **Section 1: Split Data into Training and Testing Sets**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [3]:
data_type_dictionary \
    = {'word_freq_make': float,
       'word_freq_address': float,
       'word_freq_all': float,
       'word_freq_3d': float,
       'word_freq_our': float,
       'word_freq_over': float,
       'word_freq_remove': float,
       'word_freq_internet': float,
       'word_freq_order': float,
       'word_freq_mail': float,
       'word_freq_receive': float,
       'word_freq_will': float,
       'word_freq_people': float,
       'word_freq_report': float,
       'word_freq_addresses': float,
       'word_freq_free': float,
       'word_freq_business': float,
       'word_freq_email': float,
       'word_freq_you': float,
       'word_freq_credit': float,
       'word_freq_your': float,
       'word_freq_font': float,
       'word_freq_000': float,
       'word_freq_money': float,
       'word_freq_hp': float,
       'word_freq_hpl': float,
       'word_freq_george': float,
       'word_freq_650': float,
       'word_freq_lab': float,
       'word_freq_labs': float,
       'word_freq_telnet': float,
       'word_freq_857': float,
       'word_freq_data': float,
       'word_freq_415': float,
       'word_freq_85': float,
       'word_freq_technology': float,
       'word_freq_1999': float,
       'word_freq_parts': float,
       'word_freq_pm': float,
       'word_freq_direct': float,
       'word_freq_cs': float,
       'word_freq_meeting': float,
       'word_freq_original': float,
       'word_freq_project': float,
       'word_freq_re': float,
       'word_freq_edu': float,
       'word_freq_table': float,
       'word_freq_conference': float,
       'char_freq_;': float,
       'char_freq_(': float,
       'char_freq_[': float,
       'char_freq_!': float,
       'char_freq_$': float,
       'char_freq_#': float,
       'capital_run_length_average': float,
       'capital_run_length_longest': int,
       'capital_run_length_total': int,
       'spam': int}

spam_dataframe \
    = pd.read_csv(spam_detector_constants.CONSTANT_INPUT_FILE_PATH, dtype = data_type_dictionary)

logx.log_write_object(spam_dataframe)

## **1.2: Display Spam DataFrame**

In [4]:
pandas_processx.return_formatted_table(spam_dataframe, 'Table 1.2: Spam Data Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278,1
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028,1
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54,1
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112,1
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49,1
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257,1
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749,1


## **1.3: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, The Labels**

In [5]:
y_series = spam_dataframe['spam']

logx.log_write_object(y_series)

### **Review the Y Series**

In [6]:
pandas_processx.return_formatted_table(y_series.to_frame(), 'Table 1.3.1: Spam Y Series')

spam
1
1
1
1
1
1
1
1
1
1


### **Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [7]:
y_series.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [8]:
x_dataframe = spam_dataframe.drop(columns = 'spam', axis = 1)

logx.log_write_object(x_dataframe)

### **Review the X DataFrame**

In [9]:
pandas_processx.return_formatted_table(x_dataframe, 'Table 1.3.2: Spam X DataFrame')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749


## **1.4: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [10]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split \
        (x_dataframe, y_series, 
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1)

In [11]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **1.5: Use the StandardScaler to Scale the X Variables**

### **Scale Training and Test Data as Numpy Arrays**

In [12]:
x_train_scaled_nparray = StandardScaler().fit_transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [13]:
x_test_scaled_nparray = StandardScaler().fit_transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

### **Create Scaled X Variable DataFrames**

In [14]:
x_train_scaled_dataframe \
    = pd.DataFrame \
        (x_train_scaled_nparray, 
         columns = x_train_dataframe.columns, 
         index = x_train_dataframe.index)
    
logx.log_write_object(x_train_scaled_dataframe)

In [15]:
x_test_scaled_dataframe \
    = pd.DataFrame \
        (x_test_scaled_nparray, 
         columns = x_test_dataframe.columns, 
         index = x_test_dataframe.index)
    
logx.log_write_object(x_test_scaled_dataframe)

### **Display Scaled Training and Testing Data**

In [16]:
pandas_processx.return_formatted_table \
    (x_train_scaled_dataframe, 
     'Table 1.5.1: Spam Scaled X Variable Training Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [17]:
pandas_processx.return_formatted_table \
    (x_test_scaled_dataframe, 
     'Table 1.5.2: Spam Scaled X Variable Test Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.02,-0.17,-0.0,-0.05,0.06,-0.02,-0.3,0.18,-0.01,-0.38,-0.29,0.23,-0.35,0.89,-0.2,-0.27,-0.36,-0.36,0.24,-0.18,2.26,-0.12,0.78,0.02,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.05,-0.19,-0.08,-0.13,-0.1,-0.44,0.05,-0.04,-0.19,0.0,-0.02,0.2,0.48
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.51,-0.18,-0.69,-0.12,-0.3,-0.26,-0.13,0.1,-0.22,1.23,0.37,0.62,-0.18,-0.15,-0.18,-0.15,0.68,-0.25,-0.33,-0.07,1.29,-0.21,-0.1,-0.16,-0.2,0.55,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.19,-0.3,-0.3
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,2.42,-0.29,0.85,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,0.99,1.11,0.14,3.68,0.77,1.27,1.72,2.2,-0.18,2.21,1.38,2.58,-0.33,-0.07,1.12,1.97,-0.1,-0.16,2.68,-0.14,0.26,-0.19,-0.08,-0.13,0.63,2.93,-0.13,-0.43,-0.32,-0.16,-0.11,-0.26,-0.2
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,0.31,-0.29,1.56,-0.35,-0.18,-0.2,-0.27,-0.36,0.46,-0.12,-0.18,-0.28,-0.12,-0.3,-0.26,0.66,1.26,0.05,1.7,0.54,0.88,2.64,5.09,-0.18,5.1,2.16,0.8,-0.33,-0.07,-0.18,3.03,-0.1,1.73,-0.2,-0.14,0.12,-0.19,-0.08,4.55,-0.17,1.06,-0.13,-0.43,-0.32,-0.16,0.0,-0.1,-0.21
-0.31,-0.17,0.04,-0.05,0.83,0.15,-0.3,-0.28,2.78,-0.38,-0.29,0.24,0.15,-0.18,-0.2,-0.27,0.43,-0.36,0.35,-0.18,0.1,-0.12,0.12,0.66,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,1.28,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.2,-0.13,0.2,-0.11,-0.03,-0.13,0.03,0.27
-0.31,-0.17,0.75,-0.05,-0.46,1.88,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,0.37,-0.36,-0.36,0.58,-0.18,-0.69,-0.12,1.61,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,1.3,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,0.23,0.24,1.58,0.64,-0.32,-0.16,0.42,0.74,0.02
0.76,-0.17,2.82,-0.05,0.99,-0.34,0.76,-0.28,2.01,1.61,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.73,2.13,-0.1,-0.12,-0.3,-0.26,-0.14,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,0.96,0.12,-0.16,23.74,9.53,1.94
-0.31,-0.17,-0.56,-0.05,3.85,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,2.64,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,1.33,1.02,-0.13,0.72,-0.32,-0.16,-0.01,-0.3,-0.46
0.76,-0.17,-0.56,-0.05,-0.46,0.81,-0.3,-0.28,-0.33,-0.38,-0.29,0.58,2.07,-0.18,-0.2,-0.27,1.45,-0.36,0.84,-0.18,1.14,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.17,-0.22,-0.38
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,0.68,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,2.74,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.22,-0.39,-0.49


# <br> **Section 2: Undersampled and OverSampled Spam Data**

## **2.1: Instantiate the Random Undersampler Instance**

In [18]:
x_train_scaled_undersampled_dataframe, y_train_undersampled_series \
    = RandomUnderSampler(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [19]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.2: Instantiate the Random Oversampler Instance**

In [20]:
x_train_scaled_oversampled_dataframe, y_train_oversampled_series \
    = RandomOverSampler(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [21]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.3: Instantiate the Cluster Centroids Instance**

In [22]:
x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series \
    = ClusterCentroids \
        (estimator \
             = KMeans(n_init = 'auto', random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_2), 
         random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit_resample(x_train_scaled_dataframe, y_train_series)

In [23]:
logx.log_write_object(x_train_scaled_cluster_centroids_dataframe)

logx.log_write_object(y_train_cluster_centroids_series)

## **2.4: Instantiate the SMOTE Instance**

In [24]:
x_train_scaled_smote_dataframe, y_train_smote_series \
    = SMOTE(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1, sampling_strategy = 'auto') \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [25]:
logx.log_write_object(x_train_scaled_smote_dataframe)

logx.log_write_object(y_train_smote_series)

## **2.5: Instantiate the SMOTEEN Instance**

In [26]:
x_train_scaled_smoteen_dataframe, y_train_smoteen_series \
    = SMOTEENN(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [27]:
logx.log_write_object(x_train_scaled_smoteen_dataframe)

logx.log_write_object(y_train_smoteen_series)

## **2.6: Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [28]:
y_train_undersampled_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [29]:
y_train_oversampled_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [30]:
y_train_cluster_centroids_series.value_counts()

spam
0    1361
1    1361
Name: count, dtype: int64

In [31]:
y_train_smote_series.value_counts()

spam
0    2089
1    2089
Name: count, dtype: int64

In [32]:
y_train_smoteen_series.value_counts()

spam
1    1785
0    1651
Name: count, dtype: int64

## **2.7: Display Normalized Resampled Training and Testing Data**

In [33]:
pandas_processx.return_formatted_table \
    (x_train_scaled_undersampled_dataframe, 
     'Table 2.7.1: X Training Scaled Undersampled Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,1.27,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,7.49,4.85,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,1.88,-0.29,-0.26,-0.32,0.57,-0.3,0.06,1.64,-0.17,-0.19,-0.32,-0.31,-0.35,0.76,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,1.06,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.32,3.18,-0.07,-0.11,0.23,-0.48,-0.19,-0.2,0.45,-0.1,-0.1,-0.18,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,3.86,-0.37,-0.3,0.71,3.46,-0.17,-0.19,-0.32,-0.31,-0.35,-0.29,-0.16,0.27,-0.12,-0.29,-0.21,-0.34,-0.3,0.12,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,6.21,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,3.12,-0.16,0.8,-0.12,-0.29,-0.21,-0.34,-0.3,0.31,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,0.62,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
-0.35,1.34,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.09,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,1.43,1.8,0.33,1.44,1.47,1.75,2.03,2.53,-0.17,2.52,1.43,2.14,1.82,-0.06,2.03,2.34,-0.13,1.02,3.9,-0.12,0.66,-0.2,-0.07,-0.11,-0.16,1.11,1.67,-0.31,-0.3,-0.1,-0.1,-0.19,-0.32
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,8.64,-0.18,-0.13,4.57,-0.21,-0.12,3.51,-0.2,-0.07,-0.11,-0.16,-0.48,8.23,-0.31,-0.3,-0.1,-0.1,-0.23,-0.43
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,4.65,-0.3,-0.17,-0.19,-0.32,1.64,-0.35,-0.94,-0.16,0.07,-0.12,-0.29,-0.21,0.82,-0.3,-0.23,-0.23,-0.18,1.71,-0.16,-0.14,-0.17,-0.15,-0.19,2.09,-0.32,-0.06,1.99,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.05,-0.19,-0.02,-0.3,-0.1,-0.08,-0.19,-0.35
-0.35,0.47,2.51,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.27,-0.3,-0.17,-0.19,-0.32,-0.31,2.65,2.11,-0.16,2.5,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.01,-0.19,0.01,-0.3,-0.1,-0.11,-0.21,-0.39
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,0.09,-0.05,0.52,-0.36,-0.29,0.56,-0.32,1.7,-0.3,-0.63,-0.3,0.81,-0.19,-0.32,0.39,-0.35,-0.2,-0.16,-0.4,-0.12,-0.29,-0.21,0.91,0.44,-0.03,-0.23,-0.18,-0.22,-0.16,-0.14,0.41,-0.15,-0.19,-0.24,1.96,-0.06,-0.18,-0.18,-0.13,-0.18,1.25,0.38,0.04,-0.2,-0.07,-0.11,0.53,0.88,-0.19,-0.05,1.03,-0.1,-0.04,-0.06,0.06


In [34]:
pandas_processx.return_formatted_table \
    (x_train_scaled_oversampled_dataframe, 
     'Table 2.7.2: X Training Scaled Oversampled Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [35]:
pandas_processx.return_formatted_table \
    (x_train_scaled_cluster_centroids_dataframe, 
     'Table 2.7.3: X Training Scaled Cluster Centroids Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.55,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.62,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.2,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.11,-0.3,-0.2,-0.07,-0.11,-0.14,-0.48,-0.19,-0.29,-0.3,-0.1,-0.11,-0.24,-0.44
-0.35,0.18,-0.42,-0.05,-0.25,-0.36,-0.29,-0.26,-0.32,0.3,-0.3,-0.3,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.55,-0.16,-0.61,7.64,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,0.1,-0.12,-0.3,-0.04,-0.07,-0.11,17.3,-0.48,-0.19,-0.23,-0.3,1.19,0.05,-0.05,0.24
-0.35,11.39,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.23,-0.44
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,4.9,4.39,1.02,7.22,7.21,8.61,9.61,11.83,-0.17,11.78,7.07,10.43,-0.32,-0.06,-0.18,11.09,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,3.47,-0.19,-0.31,-0.3,-0.1,-0.06,-0.2,-0.41
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,0.22,-0.3,-0.17,-0.19,-0.32,-0.31,1.08,-0.11,-0.16,-0.67,-0.12,-0.29,-0.21,2.48,2.21,-0.23,1.1,-0.18,1.35,1.58,-0.14,-0.17,-0.15,1.1,1.66,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.13,2.3,-0.19,-0.3,0.32,-0.1,-0.21,-0.32
-0.35,0.73,-0.56,-0.05,0.36,-0.36,-0.29,-0.26,-0.32,1.36,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,0.71,-0.94,-0.16,0.22,-0.12,-0.29,-0.21,0.01,0.32,0.43,-0.23,-0.18,-0.22,-0.16,1.44,-0.17,1.43,-0.19,-0.24,3.49,-0.06,1.13,-0.18,-0.13,-0.18,4.65,-0.12,0.27,-0.2,-0.07,-0.11,0.2,0.75,0.88,-0.11,-0.3,-0.1,-0.1,-0.15,-0.25
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,6.76,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.24,-0.44
2.14,-0.16,0.94,-0.05,-0.46,1.06,-0.29,-0.26,-0.32,-0.37,-0.3,0.69,0.93,-0.17,-0.19,-0.32,-0.31,-0.35,0.34,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,0.72,-0.18,7.3,-0.18,-0.21,-0.12,0.09,0.22,-0.07,-0.11,-0.16,0.14,-0.19,0.31,-0.3,-0.1,-0.09,-0.18,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,3.21,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,10.64,-0.16,-0.48,-0.19,0.42,-0.3,-0.1,-0.1,-0.21,-0.43


In [36]:
pandas_processx.return_formatted_table \
    (x_train_scaled_smote_dataframe, 
     'Table 2.7.4: X Training Scaled SMOTE Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [37]:
pandas_processx.return_formatted_table \
    (x_train_scaled_smoteen_dataframe, 
     'Table 2.7.5: X Training Scaled SMOTEEN Data')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,1.79,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.38,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,1.34,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.11,-0.23,-0.42


# <br> **Section 3: Model Optimization**

## **3.1: Logistic Regression**

### **Original**

In [38]:
parameters_grid_dictionary \
    = {'class_weight': ['balanced', None],
       'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
       'multi_class': ['auto', 'ovr', 'multinomial']}

lr_grid_search_model \
    = GridSearchCV \
        (LogisticRegression \
             (random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1, 
              max_iter = spam_detector_constants.CONSTANT_ML_LR_MAX_ITERATIONS), 
         parameters_grid_dictionary)

lr_undersampled_grid_search_model = copy.copy(lr_grid_search_model)

lr_oversampled_grid_search_model = copy.copy(lr_grid_search_model)

lr_cluster_centroids_grid_search_model = copy.copy(lr_grid_search_model)

lr_smote_grid_search_model = copy.copy(lr_grid_search_model)

lr_smoteen_grid_search_model = copy.copy(lr_grid_search_model)

In [39]:
lr_grid_search_model \
    .fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [40]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model best accuracy score is {:.2f}%' \
         .format(lr_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model best accuracy score is 92.52%

The optimal model hyperparameters are:
{'class_weight': None, 'multi_class': 'multinomial', 'solver': 'newton-cg'}[0m


### **Random Undersampling**

In [41]:
lr_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [42]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model with random undersampling best accuracy score is {:.2f}%' \
         .format(lr_undersampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with random undersampling best accuracy score is 92.21%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'liblinear'}[0m


### **Random Oversampling**

In [43]:
lr_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [44]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model with random oversampling best accuracy score is {:.2f}%' \
         .format(lr_oversampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with random oversampling best accuracy score is 92.41%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'saga'}[0m


### **Cluster Centroids**

In [45]:
lr_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [46]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model with cluster centroid best accuracy score is {:.2f}%' \
         .format(lr_cluster_centroids_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with cluster centroid best accuracy score is 92.47%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'multinomial', 'solver': 'lbfgs'}[0m


### **Smote**

In [47]:
lr_smote_grid_search_model.fit \
    (x_train_scaled_smote_dataframe, y_train_smote_series)

clear_output()

In [48]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model with smote best accuracy score is {:.2f}%' \
         .format(lr_smote_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_smote_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with smote best accuracy score is 92.56%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'saga'}[0m


### **Smoteen**

In [49]:
lr_smoteen_grid_search_model.fit \
    (x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

clear_output()

In [50]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The logistic regression model with smoteen best accuracy score is {:.2f}%' \
         .format(lr_smoteen_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_smoteen_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with smoteen best accuracy score is 97.79%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'lbfgs'}[0m


## **3.2: Decision Tree**

### **Original**

In [51]:
parameters_grid_dictionary \
    = {'criterion': ['gini', 'entropy', 'log_loss'],
       'splitter': ['best', 'random'],
       'class_weight': ['balanced', None]}

dt_grid_search_model \
    = GridSearchCV \
        (DecisionTreeClassifier(random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1), 
         parameters_grid_dictionary)

dt_undersampled_grid_search_model = copy.copy(dt_grid_search_model)

dt_oversampled_grid_search_model = copy.copy(dt_grid_search_model)

dt_cluster_centroids_grid_search_model = copy.copy(dt_grid_search_model)

dt_smote_grid_search_model = copy.copy(dt_grid_search_model)

dt_smoteen_grid_search_model = copy.copy(dt_grid_search_model)

In [52]:
dt_grid_search_model \
    .fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [53]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model best accuracy score is {:.2f}%' \
         .format(dt_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model best accuracy score is 91.51%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Random Undersampling**

In [54]:
dt_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [55]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model with random undersampling best accuracy score is {:.2f}%' \
         .format(dt_undersampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with random undersampling best accuracy score is 90.70%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Random Oversampling**

In [56]:
dt_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [57]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model with random oversampling best accuracy score is {:.2f}%' \
         .format(dt_oversampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with random oversampling best accuracy score is 93.92%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Cluster Centroids**

In [58]:
dt_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [59]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model with cluster centroid best accuracy score is {:.2f}%' \
         .format(dt_cluster_centroids_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with cluster centroid best accuracy score is 90.59%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Smote**

In [60]:
dt_smote_grid_search_model.fit \
    (x_train_scaled_smote_dataframe, y_train_smote_series)

clear_output()

In [61]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model with smote best accuracy score is {:.2f}%' \
         .format(dt_smote_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_smote_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with smote best accuracy score is 91.74%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Smoteen**

In [62]:
dt_smoteen_grid_search_model.fit \
    (x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

clear_output()

In [63]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The decision tree model with smoteen best accuracy score is {:.2f}%' \
         .format(dt_smoteen_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_smoteen_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with smoteen best accuracy score is 96.25%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'entropy', 'splitter': 'best'}[0m


## **3.3: Random Forest**

### **Original**

In [64]:
parameters_grid_dictionary \
    = {'criterion': ['gini', 'entropy', 'log_loss'],
       'max_features': ['sqrt', 'log2', None],
       'class_weight': ['balanced', 'balanced_subsample', None]}

rf_grid_search_model \
    = GridSearchCV \
        (RandomForestClassifier \
             (n_estimators = spam_detector_constants.CONSTANT_ML_RF_N_ESTIMATORS, 
              random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1), 
         parameters_grid_dictionary)

rf_undersampled_grid_search_model = copy.copy(rf_grid_search_model)

rf_oversampled_grid_search_model = copy.copy(rf_grid_search_model)

rf_cluster_centroids_grid_search_model = copy.copy(rf_grid_search_model)

rf_smote_grid_search_model = copy.copy(rf_grid_search_model)

rf_smoteen_grid_search_model = copy.copy(rf_grid_search_model)

In [65]:
rf_grid_search_model \
    .fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [66]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model best accuracy score is {:.2f}%' \
         .format(rf_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model best accuracy score is 95.07%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2'}[0m


### **Random Undersampling**

In [67]:
rf_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [68]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model with random undersampling best accuracy score is {:.2f}%' \
         .format(rf_undersampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with random undersampling best accuracy score is 94.71%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'gini', 'max_features': 'log2'}[0m


### **Random Oversampling**

In [69]:
rf_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [70]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model with random oversampling best accuracy score is {:.2f}%' \
         .format(rf_oversampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with random oversampling best accuracy score is 96.84%

The optimal model hyperparameters are:
{'class_weight': None, 'criterion': 'gini', 'max_features': 'log2'}[0m


### **Cluster Centroids**

In [71]:
rf_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [72]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model with cluster centroid best accuracy score is {:.2f}%' \
         .format(rf_cluster_centroids_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with cluster centroid best accuracy score is 94.75%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_features': 'log2'}[0m


### **Smote**

In [73]:
rf_smote_grid_search_model.fit \
    (x_train_scaled_smote_dataframe, y_train_smote_series)

clear_output()

In [74]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model with smote best accuracy score is {:.2f}%' \
         .format(rf_smote_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_smote_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with smote best accuracy score is 95.98%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2'}[0m


### **Smoteen**

In [75]:
rf_smoteen_grid_search_model.fit \
    (x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

clear_output()

In [76]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The random forest model with smoteen best accuracy score is {:.2f}%' \
         .format(rf_smoteen_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_smoteen_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with smoteen best accuracy score is 98.72%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_features': 'log2'}[0m


## **3.4: Support Vector Machine (SVM)**

### **Original**

In [77]:
parameters_grid_dictionary \
    = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
       'gamma': ['scale', 'auto'],
       'class_weight': ['balanced', None],
       'decision_function_shape': ['ovo', 'ovr']}

svm_grid_search_model \
    = GridSearchCV \
        (SVC \
             (probability = spam_detector_constants.CONSTANT_ML_SVM_PROBABILITY, 
              random_state = spam_detector_constants.CONSTANT_ML_RANDOM_STATE_1), 
         parameters_grid_dictionary)

svm_undersampled_grid_search_model = copy.copy(svm_grid_search_model)

svm_oversampled_grid_search_model = copy.copy(svm_grid_search_model)

svm_cluster_centroids_grid_search_model = copy.copy(svm_grid_search_model)

svm_smote_grid_search_model = copy.copy(svm_grid_search_model)

svm_smoteen_grid_search_model = copy.copy(svm_grid_search_model)

In [78]:
svm_grid_search_model \
    .fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [79]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The svm model best accuracy score is {:.2f}%' \
         .format(svm_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(svm_grid_search_model.best_params_)
     + '\033[0m')

[1mThe svm model best accuracy score is 92.99%

The optimal model hyperparameters are:
{'class_weight': None, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


### **Random Undersampling**

In [80]:
svm_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [81]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model with random undersampling best accuracy score is {:.2f}%' \
         .format(svm_undersampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(svm_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe svm model with random undersampling best accuracy score is 92.73%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}[0m


### **Random Oversampling**

In [82]:
svm_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [83]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model with random oversampling best accuracy score is {:.2f}%' \
         .format(svm_oversampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(svm_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe svm model with random oversampling best accuracy score is 93.56%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


### **Cluster Centroids**

In [84]:
svm_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [85]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model with cluster centroid best accuracy score is {:.2f}%' \
         .format(svm_cluster_centroids_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(svm_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe svm model with cluster centroid best accuracy score is 92.06%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}[0m


### **Smote**

In [86]:
svm_smote_grid_search_model.fit \
    (x_train_scaled_smote_dataframe, y_train_smote_series)

clear_output()

In [87]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model with smote best accuracy score is {:.2f}%' \
         .format(svm_smote_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(svm_smote_grid_search_model.best_params_)
     + '\033[0m')

[1mThe svm model with smote best accuracy score is 93.54%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


### **Smoteen**

In [88]:
svm_smoteen_grid_search_model.fit \
    (x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

clear_output()

In [89]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The svm model with smoteen best accuracy score is {:.2f}%' \
         .format(svm_smoteen_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(svm_smoteen_grid_search_model.best_params_)
     + '\033[0m')

[1mThe svm model with smoteen best accuracy score is 98.25%

The optimal model hyperparameters are:
{'class_weight': None, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf'}[0m


## **3.5: K-Nearest Neighbor (KNN)**

### **Original**

In [90]:
parameters_grid_dictionary \
    = {'n_neighbors': np.arange(5, 21, 1),
       'weights': ['uniform', 'distance', None],
       'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
       'p': [1, 2]}

knn_grid_search_model \
    = GridSearchCV \
        (KNeighborsClassifier(leaf_size = spam_detector_constants.CONSTANT_ML_KNN_LEAF_SIZE), 
         parameters_grid_dictionary)

knn_undersampled_grid_search_model = copy.copy(knn_grid_search_model)

knn_oversampled_grid_search_model = copy.copy(knn_grid_search_model)

knn_cluster_centroids_grid_search_model = copy.copy(knn_grid_search_model)

knn_smote_grid_search_model = copy.copy(knn_grid_search_model)

knn_smoteen_grid_search_model = copy.copy(knn_grid_search_model)

In [91]:
knn_grid_search_model \
    .fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [92]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model best accuracy score is {:.2f}%' \
         .format(knn_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model best accuracy score is 92.00%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}[0m


### **Random Undersampling**

In [93]:
knn_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [94]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model with random undersampling best accuracy score is {:.2f}%' \
         .format(knn_undersampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with random undersampling best accuracy score is 91.70%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 12, 'p': 1, 'weights': 'distance'}[0m


### **Random Oversampling**

In [95]:
knn_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [96]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model with random oversampling best accuracy score is {:.2f}%' \
         .format(knn_oversampled_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with random oversampling best accuracy score is 95.64%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 19, 'p': 1, 'weights': 'distance'}[0m


### **Cluster Centroids**

In [97]:
knn_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [98]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model with cluster centroid best accuracy score is {:.2f}%' \
         .format(knn_cluster_centroids_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with cluster centroid best accuracy score is 90.08%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 6, 'p': 2, 'weights': 'distance'}[0m


### **Smote**

In [99]:
knn_smote_grid_search_model.fit \
    (x_train_scaled_smote_dataframe, y_train_smote_series)

clear_output()

In [100]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model with smote best accuracy score is {:.2f}%' \
         .format(knn_smote_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_smote_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with smote best accuracy score is 94.52%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 12, 'p': 1, 'weights': 'distance'}[0m


### **Smoteen**

In [101]:
knn_smoteen_grid_search_model.fit \
    (x_train_scaled_smoteen_dataframe, y_train_smoteen_series)

clear_output()

In [102]:
logx.print_and_log_text \
    ('\033[1m' 
     + 'The knn model with smoteen best accuracy score is {:.2f}%' \
         .format(knn_smoteen_grid_search_model.best_score_ * 100) 
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_smoteen_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with smoteen best accuracy score is 98.98%

The optimal model hyperparameters are:
{'algorithm': 'auto', 'n_neighbors': 6, 'p': 2, 'weights': 'distance'}[0m


# <br> **Section 4: Save Models To Files**

## **4.1: Logistic Regression**

### **Original**

In [103]:
pickle.dump \
    (lr_grid_search_model, 
     open(spam_detector_constants.CONSTANT_LR_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [104]:
pickle.dump \
    (lr_undersampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_LR_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [105]:
pickle.dump \
    (lr_oversampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_LR_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [106]:
pickle.dump \
    (lr_cluster_centroids_grid_search_model, 
     open(spam_detector_constants.CONSTANT_LR_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smote**

In [107]:
pickle.dump \
    (lr_smote_grid_search_model, 
     open(spam_detector_constants.CONSTANT_LR_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smoteen**

In [108]:
pickle.dump \
    (lr_smoteen_grid_search_model, 
     open(spam_detector_constants.CONSTANT_LR_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.2: Decision Tree**

### **Original**

In [109]:
pickle.dump \
    (dt_grid_search_model, 
     open(spam_detector_constants.CONSTANT_DT_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [110]:
pickle.dump \
    (dt_undersampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_DT_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [111]:
pickle.dump \
    (dt_oversampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_DT_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [112]:
pickle.dump \
    (dt_cluster_centroids_grid_search_model, 
     open(spam_detector_constants.CONSTANT_DT_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smote**

In [113]:
pickle.dump \
    (dt_smote_grid_search_model, 
     open(spam_detector_constants.CONSTANT_DT_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smoteen**

In [114]:
pickle.dump \
    (dt_smoteen_grid_search_model, 
     open(spam_detector_constants.CONSTANT_DT_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.3: Random Forest**

### **Original**

In [115]:
pickle.dump \
    (rf_grid_search_model, 
     open(spam_detector_constants.CONSTANT_RF_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [116]:
pickle.dump \
    (rf_undersampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_RF_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [117]:
pickle.dump \
    (rf_oversampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_RF_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [118]:
pickle.dump \
    (rf_cluster_centroids_grid_search_model, 
     open(spam_detector_constants.CONSTANT_RF_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smote**

In [119]:
pickle.dump \
    (rf_smote_grid_search_model, 
     open(spam_detector_constants.CONSTANT_RF_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smoteen**

In [120]:
pickle.dump \
    (rf_smoteen_grid_search_model, 
     open(spam_detector_constants.CONSTANT_RF_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.4: Support Vector Machine (SVM)**

### **Original**

In [121]:
pickle.dump \
    (svm_grid_search_model, 
     open(spam_detector_constants.CONSTANT_SVM_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [122]:
pickle.dump \
    (svm_undersampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_SVM_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [123]:
pickle.dump \
    (svm_oversampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_SVM_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [124]:
pickle.dump \
    (svm_cluster_centroids_grid_search_model, 
     open(spam_detector_constants.CONSTANT_SVM_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smote**

In [125]:
pickle.dump \
    (svm_smote_grid_search_model, 
     open(spam_detector_constants.CONSTANT_SVM_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smoteen**

In [126]:
pickle.dump \
    (svm_smoteen_grid_search_model, 
     open(spam_detector_constants.CONSTANT_SVM_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.5: K-Nearest Neighbor (KNN)**

### **Original**

In [127]:
pickle.dump \
    (knn_grid_search_model, 
     open(spam_detector_constants.CONSTANT_KNN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [128]:
pickle.dump \
    (knn_undersampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_KNN_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [129]:
pickle.dump \
    (knn_oversampled_grid_search_model, 
     open(spam_detector_constants.CONSTANT_KNN_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [130]:
pickle.dump \
    (knn_cluster_centroids_grid_search_model, 
     open(spam_detector_constants.CONSTANT_KNN_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smote**

In [131]:
pickle.dump \
    (knn_smote_grid_search_model, 
     open(spam_detector_constants.CONSTANT_KNN_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Smoteen**

In [132]:
pickle.dump \
    (knn_smoteen_grid_search_model, 
     open(spam_detector_constants.CONSTANT_KNN_SMOTEEN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

In [133]:
# logx.end_program()