In [1]:
# Importing modules
import math

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw Spambase CSV. header=None keeps the first line as data and
# leaves pandas to assign integer column labels.
df = pd.read_csv('spambase.csv', header=None)
# Number of rows in the loaded frame
df.shape[0]

4601

In [3]:
# Preview the first five rows of the raw, still unlabelled frame
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
# Attach the 58 Spambase column names: 48 word frequencies, 6 character
# frequencies, 3 capital-run-length statistics and the 'spam' target.
# The names are assembled from prefix + token so each group is visible at a
# glance; the resulting list is identical to spelling every name out by hand.
df.columns = (
    [
        f'word_freq_{token}'
        for token in (
            'make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
            'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses',
            'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000',
            'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
            'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
            'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
            'conference',
        )
    ]
    + [f'char_freq_{symbol}' for symbol in (';', '(', '[', '!', '$', '#')]
    + [f'capital_run_length_{stat}' for stat in ('average', 'longest', 'total')]
    + ['spam']
)

In [5]:
# Draw 100 random rows to eyeball the frame now that the columns are named
df.sample(100)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
4404,0.00,3.36,1.92,0.0,0.00,0.00,0.0,0.00,0.00,4.32,...,0.000,0.695,0.000,0.347,0.000,0.000,6.137,107,178,0
2492,0.00,0.00,0.00,0.0,4.54,0.00,0.0,0.00,0.00,0.00,...,0.000,0.000,0.000,0.000,0.000,0.000,2.000,5,16,0
2823,0.00,0.00,0.00,0.0,1.05,0.00,0.0,0.00,0.00,0.00,...,0.000,0.072,0.000,0.000,0.000,0.000,1.486,10,55,0
1723,0.00,0.00,0.00,0.0,0.00,0.27,0.0,0.00,0.82,0.00,...,0.037,0.226,0.000,0.037,0.000,0.000,2.666,33,208,1
1255,0.65,0.00,0.00,0.0,1.30,0.00,0.0,0.00,0.00,0.00,...,0.000,0.000,0.000,0.515,0.103,0.000,2.040,12,51,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1737,0.10,0.10,0.71,0.0,0.61,0.30,0.4,0.10,1.42,0.81,...,0.000,0.000,0.264,1.010,0.397,0.033,3.199,56,1043,1
3562,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,...,0.000,1.142,0.000,0.000,0.000,0.000,3.333,11,30,0
359,0.00,0.02,0.05,0.0,0.02,0.00,0.0,0.05,0.00,0.35,...,0.000,0.004,0.000,0.107,0.017,0.017,3.922,489,3271,1
788,0.09,0.00,0.27,0.0,0.36,0.09,0.0,0.18,0.09,0.00,...,0.015,0.047,0.031,0.252,0.031,0.031,3.816,69,542,1


In [6]:
# Count fully identical rows.

# True for every row that repeats an earlier row
duplicated_rows = df.duplicated()

# Vectorised count of the True entries, cast to a plain Python int
num_duplicate_rows = int(duplicated_rows.sum())

print(f"Number of duplicate rows: {num_duplicate_rows}")

Number of duplicate rows: 391


In [7]:
# Removing duplicate rows
# drop_duplicates() returns a new frame that keeps the first occurrence of each
# fully identical row. The original index labels are preserved, so the index
# is no longer contiguous after this cell.
df = df.drop_duplicates()

In [8]:
# Setting the spam value to 0 if the "word_freq_george" or the "word_freq_650" columns are greater than 0.0
# Reason: In the documentation(spambase.DOCUMENTATION) it has clearly mentioned it
# NOTE(review): this overwrites the ground-truth target for every matching row,
# so the models below are trained and scored on partly hand-assigned labels.
# The dataset documentation presumably only describes "george" and "650" as
# indicators of non-spam, not as a relabelling rule -- TODO confirm against
# spambase.DOCUMENTATION before keeping this step.
df.loc[(df['word_freq_george'] > 0) | (df['word_freq_650'] > 0), 'spam'] = 0

In [9]:
# Show the two trigger columns next to the target to check the relabelling
columns_to_display = ['word_freq_george', 'word_freq_650', 'spam']
df_subset = df[columns_to_display]
df_subset

Unnamed: 0,word_freq_george,word_freq_650,spam
0,0.0,0.0,1
1,0.0,0.0,1
2,0.0,0.0,1
3,0.0,0.0,1
4,0.0,0.0,1
...,...,...,...
4596,0.0,0.0,0
4597,0.0,0.0,0
4598,0.0,0.0,0
4599,0.0,0.0,0


In [10]:
# Save the modified dataframe to a new csv file
# index=False leaves the (now non-contiguous) row index out of the file, so
# only the 58 named columns are written.
df.to_csv('modified_dataset.csv', index=False)

In [11]:
# Reload the cleaned data from disk into a separate frame. The first line of
# the file is the header row written by to_csv, which read_csv picks up by
# default.
df_new = pd.read_csv('modified_dataset.csv')
df_new.head(n=5)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [12]:
# Separate the 57 predictor columns from the target; 'spam' is the last column
X = df_new.drop(columns='spam')
y = df_new['spam']

In [13]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made further down, so test-set statistics leak into the features. Consider
# fitting on the training rows only.
sc_X = StandardScaler().fit(X)
X_scaled = sc_X.transform(X)
X_scaled

array([[-0.34792164,  1.16102457,  0.67588944, ..., -0.04911671,
         0.04439849, -0.02130997],
       [ 0.3521501 ,  0.3684328 ,  0.40439129, ..., -0.00814327,
         0.24484101,  1.1911417 ],
       [-0.14790114, -0.24802746,  0.81163851, ...,  0.13387587,
         2.16908914,  3.18117903],
       ...,
       [ 0.65218084, -0.24802746,  0.01653679, ..., -0.12008102,
        -0.23120996, -0.27996632],
       [ 2.85240628, -0.24802746, -0.56524496, ..., -0.12783519,
        -0.23622103, -0.34463041],
       [-0.34792164, -0.24802746,  0.69528216, ..., -0.12472749,
        -0.23622103, -0.40606129]])

##### Reducing the dimensions (no of columns in the dataset) using PCA

In [15]:
# n_components accepts either an integer component count or a float in (0, 1).
# A float asks PCA to keep the smallest number of principal components whose
# cumulative explained variance reaches that fraction -- here 95% of the
# variance, not 95% of the original features.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
(X.shape, X_pca.shape)

((4210, 57), (4210, 49))

##### In the original dataframe, there are 57 columns, and it got reduced to 49 columns after doing the PCA

In [16]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): X_pca comes from a scaler and PCA fitted on every row, so the
# held-out rows have already influenced the features they are scored on.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=0
)

In [17]:
# Rule-of-thumb starting point for n_neighbors: the square root of the number
# of samples (here the size of the test split). `math` is imported with the
# other modules in the first cell.
# NOTE(review): this heuristic is more commonly applied to the training-set
# size -- confirm that using len(y_test) is intentional.
math.sqrt(len(y_test))

29.017236257093817

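For binary classification an odd n_neighbors value is preferred so that a majority vote cannot end in a tie. The square root above truncates to 29, which is already odd, therefore the n_neighbors value is 29.
n_neighbor = int(math.sqrt(len(y_test)))  # 29; subtract 1 only if this value were even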

In [18]:
# Define the KNN model
# n_neighbors=29 is the odd value chosen from the square-root heuristic above.
# p is the power parameter of the Minkowski metric (p=2 corresponds to
# Euclidean distance); it is unrelated to the number of classes in "spam".
# NOTE(review): metric='euclidean' is already given explicitly, so p=2 looks
# redundant -- confirm against the scikit-learn documentation.
classifier = KNeighborsClassifier(n_neighbors=29, p=2, metric='euclidean')
classifier.fit(X_train, y_train)

In [19]:
# Predict the test set result
# y_pred holds one predicted class label (0 or 1) per row of X_test.
y_pred = classifier.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,

In [19]:
# Confusion matrix for the KNN predictions: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[472,  31],
       [ 70, 269]], dtype=int64)

In the above matrix,
The true negative (TN) value is 472, which means that the model correctly identified 472 non-spam emails.
The false positive (FP) value is 31, which means that the model incorrectly identified 31 non-spam emails as spam.
The true positive (TP) value is 269, which means that the model correctly identified 269 spam emails.
The false negative (FN) value is 70, which means that the model incorrectly identified 70 spam emails as non-spam.

In [20]:
# Per-class precision, recall and F1 for the KNN model
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       503
           1       0.90      0.79      0.84       339

    accuracy                           0.88       842
   macro avg       0.88      0.87      0.87       842
weighted avg       0.88      0.88      0.88       842



## Using Decision Trees

In [21]:
# Fit a decision tree on the same PCA-transformed training split.
# random_state is pinned because the tree breaks ties between equally good
# splits at random; without a fixed seed every run can grow a different tree,
# so the metrics quoted in this notebook would not be reproducible.
dTree = DecisionTreeClassifier(random_state=0)
dTree.fit(X_train, y_train)

In [22]:
# Predict the test set result
# predictions holds one predicted class label (0 or 1) per row of X_test,
# this time from the decision tree.
predictions = dTree.predict(X_test)
predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,

In [23]:
# Confusion matrix for the decision-tree predictions: rows are the true
# classes, columns are the predicted classes. Reuses the name `cm`, replacing
# the KNN matrix computed earlier.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[442,  61],
       [ 54, 285]], dtype=int64)

In the above matrix,
The true negative (TN) value is 442, which means that the model correctly identified 442 non-spam emails.
The false positive (FP) value is 61, which means that the model incorrectly identified 61 non-spam emails as spam.
The false negative (FN) value is 54, which means that the model incorrectly identified 54 spam emails as non-spam.
The true positive (TP) value is 285, which means that the model correctly identified 285 spam emails.

In [24]:
# Per-class precision, recall and F1 for the decision-tree model
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       503
           1       0.82      0.84      0.83       339

    accuracy                           0.86       842
   macro avg       0.86      0.86      0.86       842
weighted avg       0.86      0.86      0.86       842

