In [1]:
#*******************************************************************************************
 #
 #  File Name:  spam_detector.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, spam_detector.ipynb, reads a csv file, 
 #      spam-data.csv and uses Python and scikit-learn module to find the best 
 #      supervised learning model (binary classification) for detecting spam in 
 #      e-mails. Here is a list of the models:
 #
 #      logistic regression
 #      decision tree
 #      random forest
 #      support vector machine
 #      k-nearest neighbor
 #
 #      I reinstalled the scikit-learn module using the following commands to allow the
 #      RandomOverSampler function to work:
 #
 #      pip3 uninstall scikit-learn
 #      pip3 install scikit-learn==1.2.2
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/22/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

import classificationsx
import logx
import pandas_processx

import pickle

import pandas as pd

from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
CONSTANT_LOCAL_FILE_NAME = 'spam_detector.ipynb'

CONSTANT_SPAM_DATA_CSV_FILE_PATH = 'https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv'


CONSTANT_LR_MODEL_FILE_PATH = './resources/lr_model.sav'

CONSTANT_DT_MODEL_FILE_PATH = './resources/dt_model.sav'

CONSTANT_RF_MODEL_FILE_PATH = './resources/rf_model.sav'

CONSTANT_SVM_MODEL_FILE_PATH = './resources/svm_model.sav'

CONSTANT_KNN_MODEL_FILE_PATH = './resources/knn_model.sav'


CONSTANT_LR_RESAMPLED_MODEL_FILE_PATH = './resources/lr_resampled_model.sav'

CONSTANT_DT_RESAMPLED_MODEL_FILE_PATH = './resources/dt_resampled_model.sav'

CONSTANT_RF_RESAMPLED_MODEL_FILE_PATH = './resources/rf_resampled_model.sav'

CONSTANT_SVM_RESAMPLED_MODEL_FILE_PATH = './resources/svm_resampled_model.sav'

CONSTANT_KNN_RESAMPLED_MODEL_FILE_PATH = './resources/knn_resampled_model.sav'


logx.set_log_mode(False)

logx.set_image_mode(False)

logx.begin_program('spam_detector')

# <br> **Section 1: Split Data into Training and Testing Sets**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [3]:
data_type_dictionary \
    = {'word_freq_make': float,
       'word_freq_address': float,
       'word_freq_all': float,
       'word_freq_3d': float,
       'word_freq_our': float,
       'word_freq_over': float,
       'word_freq_remove': float,
       'word_freq_internet': float,
       'word_freq_order': float,
       'word_freq_mail': float,
       'word_freq_receive': float,
       'word_freq_will': float,
       'word_freq_people': float,
       'word_freq_report': float,
       'word_freq_addresses': float,
       'word_freq_free': float,
       'word_freq_business': float,
       'word_freq_email': float,
       'word_freq_you': float,
       'word_freq_credit': float,
       'word_freq_your': float,
       'word_freq_font': float,
       'word_freq_000': float,
       'word_freq_money': float,
       'word_freq_hp': float,
       'word_freq_hpl': float,
       'word_freq_george': float,
       'word_freq_650': float,
       'word_freq_lab': float,
       'word_freq_labs': float,
       'word_freq_telnet': float,
       'word_freq_857': float,
       'word_freq_data': float,
       'word_freq_415': float,
       'word_freq_85': float,
       'word_freq_technology': float,
       'word_freq_1999': float,
       'word_freq_parts': float,
       'word_freq_pm': float,
       'word_freq_direct': float,
       'word_freq_cs': float,
       'word_freq_meeting': float,
       'word_freq_original': float,
       'word_freq_project': float,
       'word_freq_re': float,
       'word_freq_edu': float,
       'word_freq_table': float,
       'word_freq_conference': float,
       'char_freq_;': float,
       'char_freq_(': float,
       'char_freq_[': float,
       'char_freq_!': float,
       'char_freq_$': float,
       'char_freq_#': float,
       'capital_run_length_average': float,
       'capital_run_length_longest': int,
       'capital_run_length_total': int,
       'spam': int}

spam_dataframe \
    = pd.read_csv(CONSTANT_SPAM_DATA_CSV_FILE_PATH, dtype = data_type_dictionary)

logx.log_write_object(spam_dataframe)

## **1.2: Display Spam DataFrame**

In [4]:
pandas_processx.return_formatted_table(spam_dataframe, 'Table 1.2: Spam Data Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278,1
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028,1
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191,1
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54,1
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112,1
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49,1
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257,1
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749,1


## **1.3: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, The Labels**

In [5]:
y_series = spam_dataframe['spam']

logx.log_write_object(y_series)

### **Review the Y Series**

In [6]:
pandas_processx.return_formatted_table(y_series.to_frame(), 'Table 1.3.1: Spam Y Series')

spam
1
1
1
1
1
1
1
1
1
1


### **Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [7]:
y_series.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [8]:
x_dataframe = spam_dataframe.drop(columns = 'spam', axis = 1)

logx.log_write_object(x_dataframe)

### **Review the X DataFrame**

In [9]:
pandas_processx.return_formatted_table(x_dataframe, 'Table 1.3.2: Spam X DataFrame')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.32,0.0,1.29,1.93,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,3.76,61,278
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,0.79,0.65,0.21,0.14,0.14,0.07,0.28,3.47,0.0,1.59,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.37,0.18,0.05,5.11,101,1028
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,0.45,0.12,0.0,1.75,0.06,0.06,1.03,1.36,0.32,0.51,0.0,1.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.12,0.0,0.06,0.06,0.0,0.0,0.01,0.14,0.0,0.28,0.18,0.01,9.82,485,2259
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,0.31,0.31,0.0,0.0,0.31,0.0,0.0,3.18,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.0,0.0,3.54,40,191
0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,3.0,15,54
0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,0.96,1.28,0.0,0.0,0.0,0.96,0.0,0.32,3.85,0.0,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,1.67,4,112
0.0,0.0,0.0,0.0,1.88,0.0,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,2.45,11,49
0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,0.76,0.92,0.0,0.0,0.0,0.0,0.0,0.15,1.23,3.53,2.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.18,0.2,0.02,9.74,445,1257
0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,0.0,0.64,0.25,0.0,0.12,0.0,0.0,0.12,1.67,0.06,0.71,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.24,0.08,0.0,1.73,43,749


## **1.4: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [10]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split(x_dataframe, y_series, random_state = 21)

In [11]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **1.5: Use the StandardScaler to Scale the X Variables**

### **Normalize Training and Test Data as Numpy Arrays**

In [12]:
normalized_x_train_nparray = StandardScaler().fit_transform(x_train_dataframe)

logx.log_write_object(normalized_x_train_nparray)

In [13]:
normalized_x_test_nparray = StandardScaler().fit_transform(x_test_dataframe)

logx.log_write_object(normalized_x_test_nparray)

### **Create Normalized X Variable DataFrames**

In [14]:
normalized_x_train_dataframe \
    = pd.DataFrame(normalized_x_train_nparray, columns = x_train_dataframe.columns, index = x_train_dataframe.index)
    
logx.log_write_object(normalized_x_train_dataframe)

In [15]:
normalized_x_test_dataframe \
    = pd.DataFrame(normalized_x_test_nparray, columns = x_test_dataframe.columns, index = x_test_dataframe.index)
    
logx.log_write_object(normalized_x_test_dataframe)

### **Display Normalized Training and Testing Data**

In [16]:
pandas_processx.return_formatted_table \
    (normalized_x_train_dataframe, 
     'Table 1.5.1: Normalized Spam X Training Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
2.83,-0.16,1.35,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.68,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,0.06,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.12,-0.3,-0.1,-0.09,-0.2,-0.36
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.35,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,3.86,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,2.8,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,0.05,-0.3,-0.1,-0.09,-0.21,-0.38
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,-0.94,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,5.77,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.13,-0.25,-0.45
0.93,0.15,-0.56,-0.05,0.12,-0.36,-0.29,-0.26,-0.32,0.24,-0.3,0.74,-0.3,0.99,-0.19,-0.32,-0.31,-0.35,-0.28,-0.16,-0.67,-0.12,-0.29,-0.21,1.41,1.93,0.13,1.9,0.52,0.61,0.76,0.98,-0.17,0.97,1.88,0.76,0.58,-0.06,-0.18,0.88,-0.13,0.32,1.51,-0.12,0.1,-0.2,-0.07,-0.11,0.35,0.83,5.17,-0.31,-0.3,-0.1,-0.09,-0.19,-0.23
-0.35,-0.16,-0.56,-0.05,-0.46,-0.36,-0.29,-0.26,2.35,-0.37,-0.3,-0.63,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,0.3,-0.16,-0.67,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,1.58,-0.18,-0.13,-0.18,-0.21,-0.12,0.46,-0.2,-0.07,-0.11,-0.16,0.48,-0.19,0.32,-0.3,-0.1,-0.08,-0.18,-0.34
1.42,-0.16,-0.56,-0.05,-0.46,0.65,-0.29,-0.26,-0.32,-0.37,-0.3,-0.63,1.45,-0.17,-0.19,-0.32,-0.31,-0.35,1.18,-0.16,-0.23,-0.12,-0.29,-0.21,-0.17,-0.3,-0.23,-0.23,-0.18,-0.22,1.11,-0.14,-0.17,-0.15,0.28,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,1.02,-0.19,-0.31,0.01,-0.1,-0.1,-0.17,-0.29
-0.35,-0.02,-0.56,-0.05,-0.2,-0.36,-0.29,0.66,-0.32,-0.37,-0.3,0.45,-0.3,-0.17,-0.19,-0.32,3.74,-0.35,-0.84,-0.16,-0.67,-0.12,-0.29,-0.21,-0.1,-0.1,-0.23,-0.23,-0.18,0.16,-0.16,-0.14,-0.17,-0.15,-0.19,0.71,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,0.28,-0.2,-0.07,-0.11,-0.16,0.36,-0.19,-0.31,-0.3,-0.1,-0.11,-0.21,-0.19
-0.35,-0.16,-0.56,-0.05,0.88,-0.36,1.88,-0.26,-0.32,1.04,4.1,-0.63,2.62,-0.17,-0.19,-0.32,-0.31,-0.35,0.58,-0.16,0.06,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,-0.18,-0.18,-0.13,-0.18,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.1,-0.22,-0.42
-0.35,-0.16,3.33,-0.05,1.01,-0.36,-0.29,-0.26,-0.32,-0.37,-0.3,1.65,-0.3,-0.17,-0.19,-0.32,-0.31,-0.35,1.83,-0.16,0.13,-0.12,-0.29,-0.21,-0.34,-0.3,-0.23,-0.23,-0.18,-0.22,-0.16,-0.14,-0.17,-0.15,-0.19,-0.24,-0.32,-0.06,2.18,-0.18,-0.13,2.37,-0.21,-0.12,-0.3,-0.2,-0.07,-0.11,-0.16,-0.48,-0.19,-0.31,-0.3,-0.1,-0.12,-0.23,-0.42


In [17]:
pandas_processx.return_formatted_table \
    (normalized_x_test_dataframe, 
     'Table 1.5.2: Normalized Spam X Test Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.02,-0.17,-0.0,-0.05,0.06,-0.02,-0.3,0.18,-0.01,-0.38,-0.29,0.23,-0.35,0.89,-0.2,-0.27,-0.36,-0.36,0.24,-0.18,2.26,-0.12,0.78,0.02,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.05,-0.19,-0.08,-0.13,-0.1,-0.44,0.05,-0.04,-0.19,0.0,-0.02,0.2,0.48
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.51,-0.18,-0.69,-0.12,-0.3,-0.26,-0.13,0.1,-0.22,1.23,0.37,0.62,-0.18,-0.15,-0.18,-0.15,0.68,-0.25,-0.33,-0.07,1.29,-0.21,-0.1,-0.16,-0.2,0.55,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.19,-0.3,-0.3
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,2.42,-0.29,0.85,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,0.99,1.11,0.14,3.68,0.77,1.27,1.72,2.2,-0.18,2.21,1.38,2.58,-0.33,-0.07,1.12,1.97,-0.1,-0.16,2.68,-0.14,0.26,-0.19,-0.08,-0.13,0.63,2.93,-0.13,-0.43,-0.32,-0.16,-0.11,-0.26,-0.2
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,0.31,-0.29,1.56,-0.35,-0.18,-0.2,-0.27,-0.36,0.46,-0.12,-0.18,-0.28,-0.12,-0.3,-0.26,0.66,1.26,0.05,1.7,0.54,0.88,2.64,5.09,-0.18,5.1,2.16,0.8,-0.33,-0.07,-0.18,3.03,-0.1,1.73,-0.2,-0.14,0.12,-0.19,-0.08,4.55,-0.17,1.06,-0.13,-0.43,-0.32,-0.16,0.0,-0.1,-0.21
-0.31,-0.17,0.04,-0.05,0.83,0.15,-0.3,-0.28,2.78,-0.38,-0.29,0.24,0.15,-0.18,-0.2,-0.27,0.43,-0.36,0.35,-0.18,0.1,-0.12,0.12,0.66,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,1.28,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.2,-0.13,0.2,-0.11,-0.03,-0.13,0.03,0.27
-0.31,-0.17,0.75,-0.05,-0.46,1.88,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,0.37,-0.36,-0.36,0.58,-0.18,-0.69,-0.12,1.61,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,1.3,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,0.23,0.24,1.58,0.64,-0.32,-0.16,0.42,0.74,0.02
0.76,-0.17,2.82,-0.05,0.99,-0.34,0.76,-0.28,2.01,1.61,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,-0.73,2.13,-0.1,-0.12,-0.3,-0.26,-0.14,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,0.96,0.12,-0.16,23.74,9.53,1.94
-0.31,-0.17,-0.56,-0.05,3.85,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,2.64,-0.36,-0.36,-0.92,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,1.33,1.02,-0.13,0.72,-0.32,-0.16,-0.01,-0.3,-0.46
0.76,-0.17,-0.56,-0.05,-0.46,0.81,-0.3,-0.28,-0.33,-0.38,-0.29,0.58,2.07,-0.18,-0.2,-0.27,1.45,-0.36,0.84,-0.18,1.14,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,-0.19,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.17,-0.22,-0.38
-0.31,-0.17,-0.56,-0.05,-0.46,-0.34,-0.3,-0.28,-0.33,-0.38,-0.29,-0.62,-0.35,-0.18,-0.2,-0.27,-0.36,-0.36,0.68,-0.18,-0.69,-0.12,-0.3,-0.26,-0.32,-0.29,-0.22,-0.24,-0.14,-0.23,-0.18,-0.15,-0.18,-0.15,-0.23,-0.25,-0.33,-0.07,-0.18,-0.21,-0.1,-0.16,-0.2,-0.14,-0.29,2.74,-0.08,-0.13,-0.17,-0.66,-0.13,-0.43,-0.32,-0.16,-0.22,-0.39,-0.49


# <br> **Section 2: Logistic Regression Model**

## **2.1: Fit a Logistic Regression Model by Using the Normalized Training Data.**

In [18]:
logistic_regression_model \
    = LogisticRegression(random_state = 21).fit(normalized_x_train_dataframe, y_train_series)

## **2.2: Display the Model Scores Using the Normalized Training and Testing data.**

In [19]:
accuracy_score_float = logistic_regression_model.score(normalized_x_train_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The logistic regression model score from normalized training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe logistic regression model score from normalized training data is 92.93%[0m


In [20]:
accuracy_score_float = logistic_regression_model.score(normalized_x_test_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The logistic regression model score from normalized test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe logistic regression model score from normalized test data is 91.05%[0m


## **2.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [21]:
logistic_regression_train_predictions_nparray = logistic_regression_model.predict(normalized_x_train_dataframe)

logistic_regression_train_predictions_dictionary \
    = {'prediction': logistic_regression_train_predictions_nparray, 'actual': y_train_series}

logistic_regression_train_predictions_dataframe = pd.DataFrame(logistic_regression_train_predictions_dictionary)

logx.log_write_object(logistic_regression_train_predictions_dataframe)

In [22]:
pandas_processx.return_formatted_table \
    (logistic_regression_train_predictions_dataframe, 
     'Table 2.3.1: Logistic Regression Training Predictions')

prediction,actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
1,1
0,0


### **Testing Predictions**

In [23]:
logistic_regression_test_predictions_nparray = logistic_regression_model.predict(normalized_x_test_dataframe)

logistic_regression_test_predictions_dictionary \
    = {'prediction': logistic_regression_test_predictions_nparray, 'actual': y_test_series}

logistic_regression_test_predictions_dataframe = pd.DataFrame(logistic_regression_test_predictions_dictionary)

logx.log_write_object(logistic_regression_test_predictions_dataframe)

In [24]:
pandas_processx.return_formatted_table \
    (logistic_regression_test_predictions_dataframe, 
     'Table 2.3.2: Logistic Regression Test Predictions')

prediction,actual
1,1
0,0
0,0
0,0
0,1
1,1
1,1
1,1
0,1
0,0


## **2.4: Evaluate the Model’s Performance**

In [25]:
lr_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, logistic_regression_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for logistic regression from actual vs. test predictions is {:.2f}%' \
         .format(lr_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for logistic regression from actual vs. test predictions is 91.05%[0m


In [26]:
lr_accuracy_score_float, lr_confusion_matrix_dataframe, lr_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         logistic_regression_test_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL',
         'Spam', 'Not Spam')

[1mLOGISTIC REGRESSION MODEL
[0m
1) [1mOverall Accuracy Score: [0m89.94%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 665                  34
Actual Not Spam              69                 383

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.95      0.93       699
    not spam       0.92      0.85      0.88       452

    accuracy                           0.91      1151
   macro avg       0.91      0.90      0.90      1151
weighted avg       0.91      0.91      0.91      1151




## **2.5: Save the Logistic Regression Model**

In [27]:
pickle.dump(logistic_regression_model, open(CONSTANT_LR_MODEL_FILE_PATH, 'wb'))

# <br> **Section 3: Decision Tree Model**

## **3.1: Fit a Decision Tree Model by Using the Normalized Training Data.**

In [28]:
decision_tree_model \
    = DecisionTreeClassifier(random_state = 21).fit(normalized_x_train_dataframe, y_train_series)

## **3.2: Display the Model Scores Using the Normalized Training and Testing data.**

In [29]:
accuracy_score_float = decision_tree_model.score(normalized_x_train_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The decision tree model score from normalized training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe decision tree model score from normalized training data is 99.91%[0m


In [30]:
accuracy_score_float = decision_tree_model.score(normalized_x_test_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The decision tree model score from normalized test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe decision tree model score from normalized test data is 89.49%[0m


## **3.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [31]:
decision_tree_train_predictions_nparray = decision_tree_model.predict(normalized_x_train_dataframe)

decision_tree_train_predictions_dictionary \
    = {'prediction': decision_tree_train_predictions_nparray, 'actual': y_train_series}

decision_tree_train_predictions_dataframe = pd.DataFrame(decision_tree_train_predictions_dictionary)

logx.log_write_object(decision_tree_train_predictions_dataframe)

In [32]:
pandas_processx.return_formatted_table \
    (decision_tree_train_predictions_dataframe, 
     'Table 3.3.1: Decision Tree Training Predictions')

prediction,actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
1,1
0,0


### **Testing Predictions**

In [33]:
decision_tree_test_predictions_nparray = decision_tree_model.predict(normalized_x_test_dataframe)

decision_tree_test_predictions_dictionary \
    = {'prediction': decision_tree_test_predictions_nparray, 'actual': y_test_series}

decision_tree_test_predictions_dataframe = pd.DataFrame(decision_tree_test_predictions_dictionary)

logx.log_write_object(decision_tree_test_predictions_dataframe)

In [34]:
pandas_processx.return_formatted_table \
    (decision_tree_test_predictions_dataframe, 
     'Table 3.3.2: Decision Tree Test Predictions')

prediction,actual
1,1
0,0
0,0
0,0
1,1
1,1
1,1
1,1
0,1
0,0


## **3.4: Evaluate the Model’s Performance**

In [35]:
dt_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, decision_tree_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for decision tree from actual vs. test predictions is {:.2f}%' \
         .format(dt_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for decision tree from actual vs. test predictions is 89.49%[0m


In [36]:
dt_accuracy_score_float, dt_confusion_matrix_dataframe, dt_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         decision_tree_test_predictions_nparray, 
         'DECISION TREE',
         'Spam', 'Not Spam')

[1mDECISION TREE
[0m
1) [1mOverall Accuracy Score: [0m88.26%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 657                  42
Actual Not Spam              79                 373

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.89      0.94      0.92       699
    not spam       0.90      0.83      0.86       452

    accuracy                           0.89      1151
   macro avg       0.90      0.88      0.89      1151
weighted avg       0.90      0.89      0.89      1151




## **3.5: Save the Decision Tree Model**

In [37]:
pickle.dump(decision_tree_model, open(CONSTANT_DT_MODEL_FILE_PATH, 'wb'))

# <br> **Section 4: Random Forest Classifier Model**

## **4.1: Fit a Random Forest Model by Using the Normalized Training Data.**

In [38]:
random_forest_model \
    = RandomForestClassifier(random_state = 21).fit(normalized_x_train_dataframe, y_train_series)

## **4.2: Display the Model Scores Using the Normalized Training and Testing data.**

In [39]:
accuracy_score_float = random_forest_model.score(normalized_x_train_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The random forest model score from normalized training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe random forest model score from normalized training data is 99.91%[0m


In [40]:
accuracy_score_float = random_forest_model.score(normalized_x_test_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The random forest model score from normalized test data is {:.2f}%'.format(accuracy_score_float) + '\033[0m')

[1mThe random forest model score from normalized test data is 92.70%[0m


## **4.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [41]:
random_forest_train_predictions_nparray = random_forest_model.predict(normalized_x_train_dataframe)

random_forest_train_predictions_dictionary \
    = {'prediction': random_forest_train_predictions_nparray, 'actual': y_train_series}

random_forest_train_predictions_dataframe = pd.DataFrame(random_forest_train_predictions_dictionary)

logx.log_write_object(random_forest_train_predictions_dataframe)

In [42]:
pandas_processx.return_formatted_table \
    (random_forest_train_predictions_dataframe, 
     'Table 4.3.1: Random Forest Training Predictions')

prediction,actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
1,1
0,0


### **Testing Predictions**

In [43]:
random_forest_test_predictions_nparray = random_forest_model.predict(normalized_x_test_dataframe)

random_forest_test_predictions_dictionary \
    = {'prediction': random_forest_test_predictions_nparray, 'actual': y_test_series}

random_forest_test_predictions_dataframe = pd.DataFrame(random_forest_test_predictions_dictionary)

logx.log_write_object(random_forest_test_predictions_dataframe)

In [44]:
pandas_processx.return_formatted_table \
    (random_forest_test_predictions_dataframe, 
     'Table 4.3.2: Random Forest Test Predictions')

prediction,actual
1,1
0,0
0,0
0,0
1,1
1,1
1,1
1,1
0,1
0,0


## **4.4: Evaluate the Model’s Performance**

In [45]:
rf_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, random_forest_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The random forest balanced accuracy score from actual vs. test predictions is {:.2f}%' \
         .format(rf_balanced_accuracy_score_float) + '\033[0m')

[1mThe random forest balanced accuracy score from actual vs. test predictions is 92.70%[0m


In [46]:
rf_accuracy_score_float, rf_confusion_matrix_dataframe, rf_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         random_forest_test_predictions_nparray, 
         'RANDOM FOREST',
         'Spam', 'Not Spam')

[1mRANDOM FOREST
[0m
1) [1mOverall Accuracy Score: [0m91.22%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 686                  13
Actual Not Spam              71                 381

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.98      0.94       699
    not spam       0.97      0.84      0.90       452

    accuracy                           0.93      1151
   macro avg       0.94      0.91      0.92      1151
weighted avg       0.93      0.93      0.93      1151




## **4.5: Save the Random Forest Model**

In [47]:
pickle.dump(random_forest_model, open(CONSTANT_RF_MODEL_FILE_PATH, 'wb'))

# <br> **Section 5: Support Vector Machine (SVM) Model**

## **5.1: Fit a SVM Model by Using the Normalized Training Data.**

In [48]:
svm_model \
    = SVC(random_state = 21).fit(normalized_x_train_dataframe, y_train_series)

## **5.2: Display the Model Scores Using the Normalized Training and Testing data.**

In [49]:
accuracy_score_float = svm_model.score(normalized_x_train_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The support vector machine model score from normalized training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe support vector machine model score from normalized training data is 94.81%[0m


In [50]:
accuracy_score_float = svm_model.score(normalized_x_test_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The support vector machine model score from normalized test data is {:.2f}%'.format(accuracy_score_float) 
     + '\033[0m')

[1mThe support vector machine model score from normalized test data is 92.53%[0m


## **5.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [51]:
svm_train_predictions_nparray = svm_model.predict(normalized_x_train_dataframe)

svm_train_predictions_dictionary \
    = {'prediction': svm_train_predictions_nparray, 'actual': y_train_series}

svm_train_predictions_dataframe = pd.DataFrame(svm_train_predictions_dictionary)

logx.log_write_object(svm_train_predictions_dataframe)

In [52]:
pandas_processx.return_formatted_table \
    (svm_train_predictions_dataframe, 
     'Table 5.3.1: SVC Training Predictions')

prediction,actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
1,1
0,0


### **Testing Predictions**

In [53]:
svm_test_predictions_nparray = svm_model.predict(normalized_x_test_dataframe)

svm_test_predictions_dictionary \
    = {'prediction': svm_test_predictions_nparray, 'actual': y_test_series}

svm_test_predictions_dataframe = pd.DataFrame(svm_test_predictions_dictionary)

logx.log_write_object(svm_test_predictions_dataframe)

In [54]:
pandas_processx.return_formatted_table \
    (svm_test_predictions_dataframe, 
     'Table 5.3.2: SVC Test Predictions')

prediction,actual
1,1
0,0
0,0
0,0
1,1
1,1
1,1
1,1
0,1
0,0


## **5.4: Evaluate the Model’s Performance**

In [55]:
svm_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, svm_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The svm balanced accuracy score from actual vs. test predictions is {:.2f}%' \
         .format(svm_balanced_accuracy_score_float) + '\033[0m')

[1mThe svm balanced accuracy score from actual vs. test predictions is 92.53%[0m


In [56]:
svm_accuracy_score_float, svm_confusion_matrix_dataframe, svm_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         svm_test_predictions_nparray, 
         'SUPPORT VECTOR MACHINE',
         'Spam', 'Not Spam')

[1mSUPPORT VECTOR MACHINE
[0m
1) [1mOverall Accuracy Score: [0m91.7%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 668                  31
Actual Not Spam              55                 397

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.92      0.96      0.94       699
    not spam       0.93      0.88      0.90       452

    accuracy                           0.93      1151
   macro avg       0.93      0.92      0.92      1151
weighted avg       0.93      0.93      0.92      1151




## **5.5: Save the Support Vector Machine Model**

In [57]:
pickle.dump(svm_model, open(CONSTANT_SVM_MODEL_FILE_PATH, 'wb'))

# <br> **Section 6: K-Nearest Neighbor (KNN) Model**

## **6.1: Fit a KNN Model by Using the Normalized Training Data.**

In [58]:
knn_model \
    = KNeighborsClassifier().fit(normalized_x_train_dataframe, y_train_series)

## **6.2: Display the Model Scores Using the Normalized Training and Testing data.**

In [59]:
accuracy_score_float = knn_model.score(normalized_x_train_dataframe, y_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The KNN model score from normalized training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe KNN model score from normalized training data is 93.86%[0m


In [60]:
accuracy_score_float = knn_model.score(normalized_x_test_dataframe, y_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The KNN model score from normalized test data is {:.2f}%'.format(accuracy_score_float) 
     + '\033[0m')

[1mThe KNN model score from normalized test data is 91.05%[0m


## **6.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [61]:
knn_train_predictions_nparray = knn_model.predict(normalized_x_train_dataframe)

knn_train_predictions_dictionary \
    = {'prediction': knn_train_predictions_nparray, 'actual': y_train_series}

knn_train_predictions_dataframe = pd.DataFrame(knn_train_predictions_dictionary)

logx.log_write_object(knn_train_predictions_dataframe)

In [62]:
pandas_processx.return_formatted_table \
    (knn_train_predictions_dataframe, 
     'Table 6.3.1: KNN Training Predictions')

prediction,actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
1,1
0,0


### **Testing Predictions**

In [63]:
knn_test_predictions_nparray = knn_model.predict(normalized_x_test_dataframe)

knn_test_predictions_dictionary \
    = {'prediction': knn_test_predictions_nparray, 'actual': y_test_series}

knn_test_predictions_dataframe = pd.DataFrame(knn_test_predictions_dictionary)

logx.log_write_object(knn_test_predictions_dataframe)

In [64]:
pandas_processx.return_formatted_table \
    (knn_test_predictions_dataframe, 
     'Table 6.3.2: KNN Test Predictions')

prediction,actual
1,1
1,0
0,0
0,0
1,1
1,1
1,1
1,1
0,1
0,0


## **6.4: Evaluate the Model’s Performance**

In [65]:
knn_balanced_accuracy_score_float \
    = accuracy_score(y_test_series, knn_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The knn balanced accuracy score from actual vs. test predictions is {:.2f}%' \
         .format(knn_balanced_accuracy_score_float) + '\033[0m')

[1mThe knn balanced accuracy score from actual vs. test predictions is 91.05%[0m


In [66]:
knn_accuracy_score_float, knn_confusion_matrix_dataframe, knn_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_test_series, 
         knn_test_predictions_nparray, 
         'K-NEAREST NEIGHBOR',
         'Spam', 'Not Spam')

[1mK-NEAREST NEIGHBOR
[0m
1) [1mOverall Accuracy Score: [0m90.6%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 648                  51
Actual Not Spam              52                 400

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.93      0.93       699
    not spam       0.89      0.88      0.89       452

    accuracy                           0.91      1151
   macro avg       0.91      0.91      0.91      1151
weighted avg       0.91      0.91      0.91      1151




## **6.5: Save the K-Nearest Neighbor Model**

In [67]:
pickle.dump(knn_model, open(CONSTANT_KNN_MODEL_FILE_PATH, 'wb'))

# <br> **Section 7: Resampled Spam Data**

## **7.1: Resample Data to Improve Models**

### **Instantiate the Random Oversampler Model**

In [68]:
x_resampled_dataframe, y_resampled_series \
    = RandomOverSampler(random_state = 21).fit_resample(x_dataframe, y_series)

In [69]:
logx.log_write_object(x_resampled_dataframe)

logx.log_write_object(y_resampled_series)

### **Check the Balance of the Resampled Labels Variable (`y`) by Using the `value_counts` Function.**

In [70]:
y_resampled_series.value_counts()

spam
1    2788
0    2788
Name: count, dtype: int64

## **7.2: Split the Resampled Data into Training and Testing Datasets by Using `train_test_split`.**

In [71]:
x_resampled_train_dataframe, x_resampled_test_dataframe, \
y_resampled_train_series, y_resampled_test_series \
    = train_test_split(x_resampled_dataframe, y_resampled_series, random_state = 21)

In [72]:
logx.log_write_object(x_resampled_train_dataframe)

logx.log_write_object(x_resampled_test_dataframe)

logx.log_write_object(y_resampled_train_series)

logx.log_write_object(y_resampled_test_series)

## **7.3: Use the StandardScaler to Scale the Resampled X Variables**

### **Normalize Resampled Training and Test Data as Numpy Arrays**

In [73]:
normalized_x_resampled_train_nparray = StandardScaler().fit_transform(x_resampled_train_dataframe)

logx.log_write_object(normalized_x_resampled_train_nparray)

In [74]:
normalized_x_resampled_test_nparray = StandardScaler().fit_transform(x_resampled_test_dataframe)

logx.log_write_object(normalized_x_resampled_test_nparray)

### **Create Resampled Normalized X Variable DataFrames**

In [75]:
normalized_x_resampled_train_dataframe \
    = pd.DataFrame \
        (normalized_x_resampled_train_nparray, 
         columns = x_resampled_train_dataframe.columns, 
         index = x_resampled_train_dataframe.index)
    
logx.log_write_object(normalized_x_resampled_train_dataframe)

In [76]:
normalized_x_resampled_test_dataframe \
    = pd.DataFrame \
        (normalized_x_resampled_test_nparray, 
         columns = x_resampled_test_dataframe.columns, 
         index = x_resampled_test_dataframe.index)
    
logx.log_write_object(normalized_x_resampled_test_dataframe)

### **Display Normalized Resampled Training and Testing Data**

In [77]:
pandas_processx.return_formatted_table \
    (normalized_x_resampled_train_dataframe, 
     'Table 7.3.1: Normalized Resampled X Training Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.38,1.39,0.93,-0.06,0.62,-0.38,-0.33,0.71,-0.36,-0.41,1.45,-0.66,4.51,-0.21,-0.2,0.12,0.43,1.54,1.39,-0.2,4.26,-0.13,-0.33,-0.24,-0.3,-0.27,-0.09,-0.2,-0.14,-0.2,-0.14,-0.12,-0.16,-0.13,-0.2,-0.22,-0.29,-0.06,-0.16,-0.17,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,-0.43,-0.15,-0.28,-0.34,-0.12,-0.13,-0.23,-0.32
6.42,-0.17,-0.6,-0.06,0.27,1.44,-0.33,-0.32,-0.36,2.16,-0.34,1.27,-0.33,-0.21,-0.2,1.68,0.74,0.52,2.91,-0.2,0.54,-0.13,0.99,0.81,-0.3,-0.27,-0.21,-0.2,-0.14,-0.2,-0.14,-0.12,-0.16,-0.13,-0.2,-0.22,-0.29,-0.06,-0.16,-0.17,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,1.03,-0.15,-0.05,0.31,-0.12,-0.09,0.08,-0.12
1.8,-0.17,-0.6,-0.06,-0.5,-0.38,-0.33,-0.32,-0.36,-0.41,-0.34,0.17,-0.33,-0.21,-0.2,0.51,-0.35,-0.35,2.09,-0.2,-0.75,-0.13,-0.33,-0.24,0.15,-0.27,0.01,-0.2,-0.14,-0.2,-0.14,-0.12,-0.16,-0.13,-0.2,1.68,-0.29,-0.06,-0.16,-0.17,-0.1,-0.16,-0.18,-0.11,0.42,-0.18,-0.07,-0.1,-0.16,0.0,-0.15,-0.43,-0.34,-0.12,-0.12,-0.23,-0.41
0.36,-0.17,0.31,-0.06,-0.5,-0.38,-0.33,0.31,-0.36,-0.03,-0.34,-0.66,-0.33,-0.21,-0.2,-0.37,-0.35,-0.35,1.08,-0.2,-0.19,-0.13,-0.33,-0.24,0.91,0.01,-0.21,-0.2,-0.14,0.35,-0.14,-0.12,-0.16,-0.13,-0.2,-0.22,0.28,-0.06,-0.16,-0.17,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,0.36,-0.15,-0.43,-0.23,-0.12,-0.11,-0.17,-0.11
-0.38,-0.17,-0.6,-0.06,6.43,-0.38,-0.33,-0.32,-0.36,-0.41,-0.34,5.13,-0.33,-0.21,-0.2,-0.37,-0.35,-0.35,-1.01,-0.2,-0.75,-0.13,-0.33,-0.24,-0.3,-0.27,-0.21,-0.2,8.34,-0.2,-0.14,-0.12,-0.16,-0.13,-0.2,-0.22,-0.29,-0.06,-0.16,-0.17,-0.1,10.25,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,-0.43,-0.15,-0.43,-0.34,-0.12,-0.12,-0.23,-0.42
-0.38,-0.17,-0.6,-0.06,-0.5,-0.38,-0.33,-0.32,-0.36,0.09,-0.34,-0.28,-0.33,-0.21,-0.2,-0.37,-0.35,-0.35,-0.47,-0.2,0.02,-0.13,-0.33,-0.24,0.54,0.51,0.4,1.04,0.98,1.3,1.52,2.03,-0.16,2.01,1.31,1.54,2.08,-0.06,1.55,1.82,6.86,-0.16,3.06,-0.11,0.37,2.22,-0.07,-0.1,0.06,0.52,0.74,-0.43,-0.34,-0.12,-0.1,-0.19,-0.22
-0.38,0.04,1.02,-0.06,1.49,-0.38,1.6,-0.32,-0.36,0.92,-0.34,0.34,-0.33,-0.21,-0.2,0.69,0.21,-0.35,0.55,-0.2,0.14,-0.13,1.71,0.3,-0.3,-0.27,-0.21,-0.2,-0.14,-0.2,-0.14,-0.12,0.36,-0.13,-0.2,-0.22,-0.29,-0.06,-0.16,-0.17,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,0.31,-0.15,-0.11,0.31,-0.12,-0.03,0.22,-0.04
-0.38,-0.17,2.47,-0.06,-0.5,2.27,-0.33,0.71,-0.36,-0.41,1.45,0.75,-0.33,-0.21,-0.2,0.12,-0.35,1.55,0.08,-0.2,-0.44,-0.13,2.56,0.51,-0.3,-0.27,-0.21,-0.2,-0.14,-0.2,-0.14,-0.12,-0.16,-0.13,-0.2,-0.22,-0.29,-0.06,-0.16,-0.17,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,-0.24,-0.15,0.4,0.08,-0.12,-0.09,-0.16,-0.23
-0.38,0.13,0.91,-0.06,0.05,-0.38,0.12,-0.32,-0.36,0.51,0.56,-0.66,0.26,-0.21,0.52,-0.12,-0.35,1.21,-0.9,0.14,0.02,-0.13,-0.33,-0.24,-0.3,-0.27,-0.21,-0.2,-0.14,-0.2,-0.14,-0.12,-0.16,-0.13,-0.2,-0.22,-0.29,-0.06,-0.16,0.43,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,-0.43,-0.15,0.44,-0.23,0.09,-0.0,0.25,0.73
-0.38,-0.17,-0.05,-0.06,-0.27,0.24,-0.33,-0.32,-0.36,-0.41,-0.24,-0.55,0.01,-0.21,-0.2,-0.37,-0.35,-0.35,-0.93,-0.2,-0.73,-0.13,-0.33,-0.16,-0.3,-0.27,-0.21,-0.2,-0.14,-0.2,-0.14,-0.12,-0.16,0.11,-0.2,-0.22,-0.29,-0.06,-0.11,-0.17,-0.1,-0.16,-0.18,-0.11,-0.27,-0.18,-0.07,-0.1,-0.16,-0.07,0.04,-0.39,-0.34,-0.12,0.2,0.34,11.85


In [78]:
pandas_processx.return_formatted_table \
    (normalized_x_resampled_test_dataframe, 
     'Table 7.3.2: Normalized Resampled X Test Table')

word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_will,word_freq_people,word_freq_report,word_freq_addresses,word_freq_free,word_freq_business,word_freq_email,word_freq_you,word_freq_credit,word_freq_your,word_freq_font,word_freq_000,word_freq_money,word_freq_hp,word_freq_hpl,word_freq_george,word_freq_650,word_freq_lab,word_freq_labs,word_freq_telnet,word_freq_857,word_freq_data,word_freq_415,word_freq_85,word_freq_technology,word_freq_1999,word_freq_parts,word_freq_pm,word_freq_direct,word_freq_cs,word_freq_meeting,word_freq_original,word_freq_project,word_freq_re,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
-0.33,-0.18,-0.62,-0.05,-0.53,-0.4,-0.32,-0.24,-0.35,-0.38,-0.33,1.25,-0.36,-0.17,-0.22,-0.29,-0.37,-0.37,2.6,-0.21,-0.74,-0.11,-0.32,-0.2,-0.31,-0.29,-0.2,-0.25,-0.18,3.61,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,1.54,-0.07,-0.13,-0.17,2.58,-0.25,-0.3,-0.4,-0.09,-0.11,-0.38,-0.53
-0.33,-0.18,-0.62,-0.05,-0.53,-0.4,-0.32,-0.24,-0.35,-0.38,-0.33,-0.64,-0.36,-0.17,-0.22,-0.29,-0.37,-0.37,-1.0,-0.21,-0.74,-0.11,-0.32,-0.2,0.68,7.35,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,3.64,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,-0.17,-0.67,-0.25,-0.3,-0.4,-0.09,-0.11,-0.37,-0.49
2.76,-0.02,0.21,-0.05,-0.27,0.47,-0.32,0.09,-0.35,0.33,0.09,0.03,3.04,-0.17,0.28,-0.29,-0.37,0.2,0.32,-0.21,0.28,-0.11,0.99,1.57,-0.31,-0.29,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,-0.17,-0.18,-0.25,0.41,0.88,0.08,-0.03,0.21,0.78
-0.33,-0.18,-0.62,-0.05,-0.53,-0.4,-0.32,-0.24,-0.35,-0.38,-0.33,-0.64,-0.36,1.69,-0.22,-0.29,-0.37,-0.37,2.12,-0.21,2.57,-0.11,-0.32,2.28,-0.31,-0.29,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,0.41,0.59,-0.25,-0.3,-0.4,-0.09,-0.1,-0.37,-0.52
-0.33,-0.18,-0.62,-0.05,-0.53,-0.4,-0.32,-0.24,-0.35,1.15,-0.33,-0.64,-0.36,-0.17,-0.22,-0.29,-0.37,-0.37,-1.0,-0.21,-0.74,-0.11,-0.32,-0.2,-0.31,-0.29,0.16,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,-0.17,-0.67,6.11,-0.3,-0.4,-0.09,-0.03,-0.22,-0.33
-0.33,-0.18,-0.07,-0.05,0.34,-0.4,0.3,-0.24,-0.35,0.01,-0.33,-0.0,-0.36,-0.17,-0.22,1.0,-0.37,-0.37,-0.39,-0.21,1.28,10.9,-0.32,0.64,-0.31,-0.29,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,-0.17,-0.67,-0.25,0.02,0.31,2.35,1.23,1.47,1.57
-0.33,-0.18,-0.62,-0.05,-0.53,-0.4,-0.32,-0.24,-0.35,5.11,-0.33,-0.64,-0.36,-0.17,-0.22,-0.29,-0.37,-0.37,0.05,-0.21,-0.74,-0.11,-0.32,2.72,-0.31,-0.29,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,1.07,-0.67,-0.25,-0.3,-0.4,-0.09,-0.03,-0.32,-0.41
-0.33,-0.18,-0.62,-0.05,0.92,-0.4,-0.32,-0.24,-0.35,-0.38,-0.33,-0.64,-0.36,-0.17,-0.22,-0.29,-0.37,-0.37,0.51,-0.21,0.01,-0.11,-0.32,-0.2,-0.31,-0.29,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,4.34,-0.08,-0.18,-0.2,-0.14,-0.15,7.48,-0.14,0.84,-0.2,-0.07,-0.13,-0.17,-0.67,-0.25,-0.03,-0.4,-0.09,-0.11,-0.32,-0.42
1.47,-0.18,2.87,-0.05,-0.53,-0.4,-0.32,0.92,-0.35,-0.38,-0.33,0.03,6.13,-0.17,-0.22,-0.29,0.96,-0.37,1.56,-0.21,-0.74,-0.11,-0.32,0.68,-0.31,-0.29,-0.2,-0.25,-0.18,-0.23,-0.18,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,-0.14,-0.36,-0.2,-0.07,-0.13,-0.17,-0.16,-0.25,0.08,0.1,-0.09,-0.09,-0.33,-0.43
-0.33,-0.18,-0.62,-0.05,-0.53,-0.4,-0.32,-0.24,-0.35,-0.38,-0.33,3.56,-0.36,-0.17,-0.22,-0.29,-0.37,-0.37,-1.0,-0.21,-0.74,-0.11,-0.32,-0.2,0.84,1.92,-0.2,-0.25,-0.18,-0.23,5.25,-0.15,-0.18,-0.15,-0.15,-0.24,-0.33,-0.08,-0.18,-0.2,-0.14,-0.15,-0.22,7.42,-0.36,-0.2,-0.07,-0.13,-0.17,-0.67,-0.25,-0.3,-0.4,0.44,-0.09,-0.31,-0.47


# <br> **Section 8: Logistic Regression Model with Resampled Data**

## **8.1: Fit a Logistic Regression Model by Using the Normalized Resampled Training Data.**

In [79]:
logistic_regression_resampled_model \
    = LogisticRegression(random_state = 21).fit(normalized_x_resampled_train_dataframe, y_resampled_train_series)

## **8.2: Display the Model Scores Using the Normalized Resampled Training and Testing Data.**

In [80]:
accuracy_score_float \
    = logistic_regression_resampled_model.score(normalized_x_resampled_train_dataframe, y_resampled_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The logistic regression model score from normalized resampled training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe logistic regression model score from normalized resampled training data is 92.90%[0m


In [81]:
accuracy_score_float \
    = logistic_regression_resampled_model.score(normalized_x_resampled_test_dataframe, y_resampled_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The logistic regression model score from normalized resampled test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe logistic regression model score from normalized resampled test data is 92.54%[0m


## **8.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [82]:
logistic_regression_resampled_train_predictions_nparray \
    = logistic_regression_resampled_model.predict(normalized_x_resampled_train_dataframe)

logistic_regression_resampled_train_predictions_dictionary \
    = {'prediction': logistic_regression_resampled_train_predictions_nparray, 'actual': y_resampled_train_series}

logistic_regression_resampled_train_predictions_dataframe \
    = pd.DataFrame(logistic_regression_resampled_train_predictions_dictionary)

logx.log_write_object(logistic_regression_resampled_train_predictions_dataframe)

In [83]:
pandas_processx.return_formatted_table \
    (logistic_regression_resampled_train_predictions_dataframe, 
     'Table 8.3.1: Logistic Regression Resampled Training Predictions')

prediction,actual
1,0
1,1
0,0
0,0
0,0
0,0
1,1
1,1
1,1
1,1


### **Testing Predictions**

In [84]:
logistic_regression_resampled_test_predictions_nparray \
    = logistic_regression_resampled_model.predict(normalized_x_resampled_test_dataframe)

logistic_regression_resampled_test_predictions_dictionary \
    = {'prediction': logistic_regression_resampled_test_predictions_nparray, 'actual': y_resampled_test_series}

logistic_regression_resampled_test_predictions_dataframe \
    = pd.DataFrame(logistic_regression_resampled_test_predictions_dictionary)

logx.log_write_object(logistic_regression_resampled_test_predictions_dataframe)

In [85]:
pandas_processx.return_formatted_table \
    (logistic_regression_resampled_test_predictions_dataframe, 
     'Table 8.3.2: Logistic Regression Resampled Test Predictions')

prediction,actual
0,0
0,0
1,1
1,0
0,0
1,1
1,1
0,0
1,1
0,0


## **8.4: Evaluate the Resampled Model’s Performance**

In [86]:
lr_resampled_balanced_accuracy_score_float \
    = accuracy_score(y_resampled_test_series, logistic_regression_resampled_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for logistic regression resampled from actual vs. test predictions is {:.2f}%' \
         .format(lr_resampled_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for logistic regression resampled from actual vs. test predictions is 92.54%[0m


In [87]:
lr_resampled_accuracy_score_float, lr_resampled_confusion_matrix_dataframe, lr_resampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_resampled_test_series, 
         logistic_regression_resampled_test_predictions_nparray, 
         'LOGISTIC REGRESSION MODEL (Resampled)',
         'Spam', 'Not Spam')

[1mLOGISTIC REGRESSION MODEL (Resampled)
[0m
1) [1mOverall Accuracy Score: [0m92.59%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 662                  63
Actual Not Spam              41                 628

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.91      0.93       725
    not spam       0.91      0.94      0.92       669

    accuracy                           0.93      1394
   macro avg       0.93      0.93      0.93      1394
weighted avg       0.93      0.93      0.93      1394




## **8.5: Save the Resampled Logistic Regression Model**

In [88]:
pickle.dump(logistic_regression_resampled_model, open(CONSTANT_LR_RESAMPLED_MODEL_FILE_PATH, 'wb'))

# <br> **Section 9: Decision Tree Model with Resampled Data**

## **9.1: Fit a Decision Tree Model by Using the Normalized Resampled Training Data.**

In [89]:
decision_tree_resampled_model \
    = DecisionTreeClassifier(random_state = 21).fit(normalized_x_resampled_train_dataframe, y_resampled_train_series)

## **9.2: Display the Model Scores Using the Normalized Resampled Training and Testing Data.**

In [90]:
accuracy_score_float \
    = decision_tree_resampled_model.score(normalized_x_resampled_train_dataframe, y_resampled_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The decision tree model score from normalized resampled training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe decision tree model score from normalized resampled training data is 99.93%[0m


In [91]:
accuracy_score_float \
    = decision_tree_resampled_model.score(normalized_x_resampled_test_dataframe, y_resampled_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The decision tree model score from normalized resampled test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe decision tree model score from normalized resampled test data is 87.95%[0m


## **9.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [92]:
decision_tree_resampled_train_predictions_nparray \
    = decision_tree_resampled_model.predict(normalized_x_resampled_train_dataframe)

decision_tree_resampled_train_predictions_dictionary \
    = {'prediction': decision_tree_resampled_train_predictions_nparray, 'actual': y_resampled_train_series}

decision_tree_resampled_train_predictions_dataframe \
    = pd.DataFrame(decision_tree_resampled_train_predictions_dictionary)

logx.log_write_object(decision_tree_resampled_train_predictions_dataframe)

In [93]:
pandas_processx.return_formatted_table \
    (decision_tree_resampled_train_predictions_dataframe, 
     'Table 9.3.1: Decision Tree Resampled Training Predictions')

prediction,actual
0,0
1,1
0,0
0,0
0,0
0,0
1,1
1,1
1,1
1,1


### **Testing Predictions**

In [94]:
decision_tree_resampled_test_predictions_nparray \
    = decision_tree_resampled_model.predict(normalized_x_resampled_test_dataframe)

decision_tree_resampled_test_predictions_dictionary \
    = {'prediction': decision_tree_resampled_test_predictions_nparray, 'actual': y_resampled_test_series}

decision_tree_resampled_test_predictions_dataframe \
    = pd.DataFrame(decision_tree_resampled_test_predictions_dictionary)

logx.log_write_object(decision_tree_resampled_test_predictions_dataframe)

In [95]:
pandas_processx.return_formatted_table \
    (decision_tree_resampled_test_predictions_dataframe, 
     'Table 9.3.2: Decision Tree Resampled Test Predictions')

prediction,actual
0,0
0,0
1,1
0,0
0,0
1,1
0,1
0,0
1,1
0,0


## **9.4: Evaluate the Resampled Model’s Performance**

In [96]:
dt_resampled_balanced_accuracy_score_float \
    = accuracy_score(y_resampled_test_series, decision_tree_resampled_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for decision tree resampled from actual vs. test predictions is {:.2f}%' \
         .format(dt_resampled_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for decision tree resampled from actual vs. test predictions is 87.95%[0m


In [97]:
dt_resampled_accuracy_score_float, dt_resampled_confusion_matrix_dataframe, dt_resampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_resampled_test_series, 
         decision_tree_resampled_test_predictions_nparray, 
         'DECISION TREE (Resampled)',
         'Spam', 'Not Spam')

[1mDECISION TREE (Resampled)
[0m
1) [1mOverall Accuracy Score: [0m87.99%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 630                  95
Actual Not Spam              73                 596

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.90      0.87      0.88       725
    not spam       0.86      0.89      0.88       669

    accuracy                           0.88      1394
   macro avg       0.88      0.88      0.88      1394
weighted avg       0.88      0.88      0.88      1394




## **9.5: Save the Resampled Decision Tree Model**

In [98]:
pickle.dump(decision_tree_resampled_model, open(CONSTANT_DT_RESAMPLED_MODEL_FILE_PATH, 'wb'))

# <br> **Section 10: Random Forest Model with Resampled Data**

## **10.1: Fit a Random Forest Model by Using the Normalized Resampled Training Data.**

In [99]:
random_forest_resampled_model \
    = RandomForestClassifier(random_state = 21).fit(normalized_x_resampled_train_dataframe, y_resampled_train_series)

## **10.2: Display the Model Scores Using the Normalized Resampled Training and Testing Data.**

In [100]:
accuracy_score_float \
    = random_forest_resampled_model.score(normalized_x_resampled_train_dataframe, y_resampled_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The random forest model score from normalized resampled training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe random forest model score from normalized resampled training data is 99.93%[0m


In [101]:
accuracy_score_float \
    = random_forest_resampled_model.score(normalized_x_resampled_test_dataframe, y_resampled_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The random forest model score from normalized resampled test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe random forest model score from normalized resampled test data is 93.47%[0m


## **10.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [102]:
random_forest_resampled_train_predictions_nparray \
    = random_forest_resampled_model.predict(normalized_x_resampled_train_dataframe)

random_forest_resampled_train_predictions_dictionary \
    = {'prediction': random_forest_resampled_train_predictions_nparray, 'actual': y_resampled_train_series}

random_forest_resampled_train_predictions_dataframe \
    = pd.DataFrame(random_forest_resampled_train_predictions_dictionary)

logx.log_write_object(random_forest_resampled_train_predictions_dataframe)

In [103]:
pandas_processx.return_formatted_table \
    (random_forest_resampled_train_predictions_dataframe, 
     'Table 10.3.1: Random Forest Resampled Training Predictions')

prediction,actual
0,0
1,1
0,0
0,0
0,0
0,0
1,1
1,1
1,1
1,1


### **Testing Predictions**

In [104]:
random_forest_resampled_test_predictions_nparray \
    = random_forest_resampled_model.predict(normalized_x_resampled_test_dataframe)

random_forest_resampled_test_predictions_dictionary \
    = {'prediction': random_forest_resampled_test_predictions_nparray, 'actual': y_resampled_test_series}

random_forest_resampled_test_predictions_dataframe \
    = pd.DataFrame(random_forest_resampled_test_predictions_dictionary)

logx.log_write_object(random_forest_resampled_test_predictions_dataframe)

In [105]:
pandas_processx.return_formatted_table \
    (random_forest_resampled_test_predictions_dataframe, 
     'Table 10.3.2: Random Forest Resampled Test Predictions')

prediction,actual
0,0
0,0
1,1
0,0
0,0
1,1
1,1
0,0
1,1
0,0


## **10.4: Evaluate the Resampled Model’s Performance**

In [106]:
rf_resampled_balanced_accuracy_score_float \
    = accuracy_score(y_resampled_test_series, random_forest_resampled_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for random forest resampled from actual vs. test predictions is {:.2f}%' \
         .format(rf_resampled_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for random forest resampled from actual vs. test predictions is 93.47%[0m


In [107]:
rf_resampled_accuracy_score_float, rf_resampled_confusion_matrix_dataframe, rf_resampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_resampled_test_series, 
         random_forest_resampled_test_predictions_nparray, 
         'RANDOM FOREST (Resampled)',
         'Spam', 'Not Spam')

[1mRANDOM FOREST (Resampled)
[0m
1) [1mOverall Accuracy Score: [0m93.32%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 704                  21
Actual Not Spam              70                 599

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.91      0.97      0.94       725
    not spam       0.97      0.90      0.93       669

    accuracy                           0.93      1394
   macro avg       0.94      0.93      0.93      1394
weighted avg       0.94      0.93      0.93      1394




## **10.5: Save the Resampled Decision Tree Model**

In [108]:
pickle.dump(random_forest_resampled_model, open(CONSTANT_RF_RESAMPLED_MODEL_FILE_PATH, 'wb'))

# <br> **Section 11: Support Vector Machine (SVM) Model with Resampled Data**

## **11.1: Fit a SVM Model by Using the Normalized Resampled Training Data.**

In [109]:
svm_resampled_model \
    = SVC(random_state = 21).fit(normalized_x_resampled_train_dataframe, y_resampled_train_series)

## **11.2: Display the Model Scores Using the Normalized Resampled Training and Testing Data.**

In [110]:
accuracy_score_float \
    = svm_resampled_model.score(normalized_x_resampled_train_dataframe, y_resampled_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The svm model score from normalized resampled training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe svm model score from normalized resampled training data is 94.64%[0m


In [111]:
accuracy_score_float \
    = svm_resampled_model.score(normalized_x_resampled_test_dataframe, y_resampled_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The svm model score from normalized resampled test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe svm model score from normalized resampled test data is 93.26%[0m


## **11.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [112]:
svm_resampled_train_predictions_nparray \
    = svm_resampled_model.predict(normalized_x_resampled_train_dataframe)

svm_resampled_train_predictions_dictionary \
    = {'prediction': svm_resampled_train_predictions_nparray, 'actual': y_resampled_train_series}

svm_resampled_train_predictions_dataframe \
    = pd.DataFrame(svm_resampled_train_predictions_dictionary)

logx.log_write_object(svm_resampled_train_predictions_dataframe)

In [113]:
pandas_processx.return_formatted_table \
    (svm_resampled_train_predictions_dataframe, 
     'Table 11.3.1: SVM Resampled Training Predictions')

prediction,actual
1,0
1,1
0,0
0,0
0,0
0,0
1,1
1,1
1,1
1,1


### **Testing Predictions**

In [114]:
svm_resampled_test_predictions_nparray \
    = svm_resampled_model.predict(normalized_x_resampled_test_dataframe)

svm_resampled_test_predictions_dictionary \
    = {'prediction': svm_resampled_test_predictions_nparray, 'actual': y_resampled_test_series}

svm_resampled_test_predictions_dataframe \
    = pd.DataFrame(svm_resampled_test_predictions_dictionary)

logx.log_write_object(svm_resampled_test_predictions_dataframe)

In [115]:
pandas_processx.return_formatted_table \
    (svm_resampled_test_predictions_dataframe, 
     'Table 11.3.2: SVM Resampled Test Predictions')

prediction,actual
0,0
0,0
1,1
1,0
0,0
1,1
1,1
0,0
0,1
0,0


## **11.4: Evaluate the Resampled Model’s Performance**

In [116]:
svm_resampled_balanced_accuracy_score_float \
    = accuracy_score(y_resampled_test_series, svm_resampled_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for svm resampled from actual vs. test predictions is {:.2f}%' \
         .format(svm_resampled_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for svm resampled from actual vs. test predictions is 93.26%[0m


In [117]:
svm_resampled_accuracy_score_float, svm_resampled_confusion_matrix_dataframe, svm_resampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_resampled_test_series, 
         svm_resampled_test_predictions_nparray, 
         'SUPPORT VECTOR MACHINE (Resampled)',
         'Spam', 'Not Spam')

[1mSUPPORT VECTOR MACHINE (Resampled)
[0m
1) [1mOverall Accuracy Score: [0m93.26%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 675                  50
Actual Not Spam              44                 625

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.94      0.93      0.93       725
    not spam       0.93      0.93      0.93       669

    accuracy                           0.93      1394
   macro avg       0.93      0.93      0.93      1394
weighted avg       0.93      0.93      0.93      1394




## **11.5: Save the Resampled SVM Model**

In [118]:
pickle.dump(svm_resampled_model, open(CONSTANT_SVM_RESAMPLED_MODEL_FILE_PATH, 'wb'))

# <br> **Section 12: K-Nearest Neighbor (KNN) Model with Resampled Data**

## **12.1: Fit a KNN Model by Using the Normalized Resampled Training Data.**

In [119]:
knn_resampled_model \
    = KNeighborsClassifier().fit(normalized_x_resampled_train_dataframe, y_resampled_train_series)

## **12.2: Display the Model Scores Using the Normalized Resampled Training and Testing Data.**

In [120]:
accuracy_score_float \
    = knn_resampled_model.score(normalized_x_resampled_train_dataframe, y_resampled_train_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The knn model score from normalized resampled training data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe knn model score from normalized resampled training data is 93.85%[0m


In [121]:
accuracy_score_float \
    = knn_resampled_model.score(normalized_x_resampled_test_dataframe, y_resampled_test_series) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The knn model score from normalized resampled test data is {:.2f}%' \
         .format(accuracy_score_float) + '\033[0m')

[1mThe knn model score from normalized resampled test data is 91.32%[0m


## **12.3: Calculate and Display Training and Test Predictions.**

### **Training Predictions**

In [122]:
knn_resampled_train_predictions_nparray \
    = knn_resampled_model.predict(normalized_x_resampled_train_dataframe)

knn_resampled_train_predictions_dictionary \
    = {'prediction': knn_resampled_train_predictions_nparray, 'actual': y_resampled_train_series}

knn_resampled_train_predictions_dataframe \
    = pd.DataFrame(knn_resampled_train_predictions_dictionary)

logx.log_write_object(knn_resampled_train_predictions_dataframe)

In [123]:
pandas_processx.return_formatted_table \
    (knn_resampled_train_predictions_dataframe, 
     'Table 12.3.1: KNN Resampled Training Predictions')

prediction,actual
0,0
1,1
0,0
0,0
0,0
0,0
1,1
1,1
1,1
1,1


### **Testing Predictions**

In [124]:
knn_resampled_test_predictions_nparray \
    = knn_resampled_model.predict(normalized_x_resampled_test_dataframe)

knn_resampled_test_predictions_dictionary \
    = {'prediction': knn_resampled_test_predictions_nparray, 'actual': y_resampled_test_series}

knn_resampled_test_predictions_dataframe \
    = pd.DataFrame(knn_resampled_test_predictions_dictionary)

logx.log_write_object(knn_resampled_test_predictions_dataframe)

In [125]:
pandas_processx.return_formatted_table \
    (knn_resampled_test_predictions_dataframe, 
     'Table 12.3.2: KNN Resampled Test Predictions')

prediction,actual
0,0
0,0
1,1
1,0
0,0
1,1
1,1
0,0
0,1
0,0


## **12.4: Evaluate the Resampled Model’s Performance**

In [126]:
knn_resampled_balanced_accuracy_score_float \
    = accuracy_score(y_resampled_test_series, knn_resampled_test_predictions_nparray) * 100

logx.print_and_log_text \
    ('\033[1m' + 'The balanced accuracy score for svm resampled from actual vs. test predictions is {:.2f}%' \
         .format(knn_resampled_balanced_accuracy_score_float) + '\033[0m')

[1mThe balanced accuracy score for svm resampled from actual vs. test predictions is 91.32%[0m


In [127]:
knn_resampled_accuracy_score_float, knn_resampled_confusion_matrix_dataframe, knn_resampled_classification_report_string \
    = classificationsx.return_binary_classification_confusion_matrix \
        (y_resampled_test_series, 
         knn_resampled_test_predictions_nparray, 
         'K-NEAREST NEIGHBOR (Resampled)',
         'Spam', 'Not Spam')

[1mK-NEAREST NEIGHBOR (Resampled)
[0m
1) [1mOverall Accuracy Score: [0m91.37%

2) [1mConfusion Matrix:
[0m
                 Predicted Spam  Predicted Not Spam
Actual Spam                 653                  72
Actual Not Spam              49                 620

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

        spam       0.93      0.90      0.92       725
    not spam       0.90      0.93      0.91       669

    accuracy                           0.91      1394
   macro avg       0.91      0.91      0.91      1394
weighted avg       0.91      0.91      0.91      1394




## **12.5: Save the Resampled KNN Model**

In [128]:
pickle.dump(knn_resampled_model, open(CONSTANT_KNN_RESAMPLED_MODEL_FILE_PATH, 'wb'))

In [129]:
# logx.end_program()