## Scenario
Hopkins et al. (1999) created the Spambase data set donated to the UCI Machine Learning Repository. The data set contains 4,601 emails marked as spam or non-spam by a postmaster or individuals. Fifty-seven features aid in classifying emails as spam (e.g. word frequencies and email characteristics). The Spambase data set is used for developing and benchmarking spam detection models, providing a base for analysing the effectiveness of various machine learning techniques in distinguishing between spam and legitimate emails.

As a data professional, you have been tasked by your company with developing a neural network in TensorFlow that classifies emails as spam or non-spam. The model is to be built on the Spambase data set.

In [None]:
# Raw GitHub location of the Spambase CSV that the notebook loads.
url = (
    "https://raw.githubusercontent.com/"
    "fourthrevlxd/cam_dsb/main/spamdata.csv"
)

In [None]:
#import relevant libraries
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
# Load the Spambase data into a dataframe.
# The CSV has no header row, so header=None is required: without it pandas
# uses the first email's values (0, 0.64, 0.64, ...) as column names and that
# record is silently dropped from the data.
df = pd.read_csv(url, header=None)
df.head(5)

Unnamed: 0,0,0.64,0.64.1,0.1,0.32,0.2,0.3,0.4,0.5,0.6,...,0.41,0.42,0.43,0.778,0.44,0.45,3.756,61,278,1
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
1,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
2,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15,54,1


In [None]:
# Split the frame column-wise: everything except the final column is a
# feature, and the final column is the target variable.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Fixed seed so the splits (and every metric computed from them) are the same
# on each Restart & Run All.
SEED = 42

# Split the data into train and test sets, using a test percentage of 20%.
# stratify keeps the spam / non-spam ratio identical in both parts.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Create a validation set from 10% of the remaining training data, again
# stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.1,
    random_state=SEED,
    stratify=y_train_full,
)

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training split alone; the validation and test splits are only transformed
# with those statistics (never fitted on) so that they remain unseen.
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_valid, X_test = [
    scaler.transform(split) for split in (X_train, X_valid, X_test)
]

In [None]:
#define the sequential model
def create_model(learning_rate=0.001, input_dim=None):
    """Build and compile the feed-forward spam classifier.

    Parameters
    ----------
    learning_rate : float, default 0.001
        Learning rate passed to the Adam optimizer.
    input_dim : int, optional
        Number of input features. When omitted it falls back to
        ``X_train.shape[1]`` from the notebook namespace, which is the
        original behaviour; pass it explicitly to avoid depending on that
        global.

    Returns
    -------
    A compiled ``Sequential`` model with three ReLU hidden layers
    (64, 32, 16 units) and a single sigmoid output unit, using
    binary cross-entropy loss and the accuracy metric.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(units=64, activation='relu'),
        Dense(units=32, activation='relu'),
        Dense(units=16, activation='relu'),
        #output layer with 1 neuron to determine whether an email is spam or not
        Dense(units=1, activation='sigmoid')
    ])
    #the learning rate is a parameter so different values can be tested
    optimizer = Adam(learning_rate=learning_rate)
    #binary_crossentropy because this is a two-class classification model
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model


In [None]:
#train and evaluate a model for a given number of epochs and batch size
def train_evaluate_model(model, X_train, y_train,
                         X_test, y_test, epochs=10,
                         batch_size=32, validation_data=None):
    """Fit ``model`` and return its accuracy on an evaluation set.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place by ``model.fit``.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Features and labels the returned accuracy is computed on.
    epochs : int, default 10
        Number of training epochs.
    batch_size : int, default 32
        Mini-batch size used during training.
    validation_data : tuple ``(X, y)``, optional
        Data monitored during training. When omitted it falls back to the
        notebook-level ``(X_valid, y_valid)``, which is the original
        behaviour.

    Returns
    -------
    (accuracy, history) : tuple
        Accuracy at a 0.5 decision threshold, and the ``History`` object
        returned by ``model.fit``.
    """
    if validation_data is None:
        validation_data = (X_valid, y_valid)

    history = model.fit(X_train, y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=validation_data,
                        verbose=0)

    #threshold the sigmoid output at 0.5; verbose=0 keeps predict from printing
    #a progress bar on every call, and ravel() turns the (n, 1) predictions
    #into a flat label vector
    y_pred = (model.predict(X_test, verbose=0) > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy, history

In [None]:
#hyperparameter tuning: grid search over learning rate, batch size and epochs
learning_rates = [0.001, 0.01, 0.1]
batch_sizes = [16, 32, 64]
epochs = [10, 20, 50]

results = []

#try every combination of the three hyperparameter lists
for lr in learning_rates:
    for bs in batch_sizes:
        for ep in epochs:
            #reset the seed so every configuration starts from the same
            #initial weights and the comparison is reproducible
            keras.utils.set_random_seed(42)
            model = create_model(learning_rate=lr)
            #score each configuration on the VALIDATION split: choosing
            #hyperparameters by test-set accuracy would leak the test set
            #into model selection and inflate the final reported metrics
            accuracy, history = train_evaluate_model(model, X_train,
                                                     y_train, X_valid,
                                                     y_valid, batch_size=bs,
                                                     epochs=ep)
            results.append({'learning_rate': lr,
                            'batch_size': bs,
                            'epochs': ep,
                            'accuracy': accuracy})

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [None]:
#display the results
results_df = pd.DataFrame(results)

#pivot the table for better visibility
pivoted_df = results_df.pivot_table(index=['learning_rate',
                                           'batch_size'],
                                    columns='epochs',
                                    values='accuracy')

#view the df
pivoted_df

Unnamed: 0_level_0,epochs,10,20,50
learning_rate,batch_size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.001,16,0.936957,0.943478,0.936957
0.001,32,0.936957,0.932609,0.936957
0.001,64,0.943478,0.932609,0.944565
0.01,16,0.95,0.938043,0.945652
0.01,32,0.93587,0.93587,0.933696
0.01,64,0.943478,0.942391,0.944565
0.1,16,0.586957,0.586957,0.586957
0.1,32,0.85,0.708696,0.586957
0.1,64,0.915217,0.586957,0.931522


In [None]:
# Flatten the pivot back to one row per configuration so the best one can be
# looked up by position.
unpivoted_df = (
    pivoted_df
    .stack()
    .reset_index()
    .set_axis(['learning_rate', 'batch_size', 'epochs', 'accuracy'], axis=1)
)

# Select the configuration with the highest accuracy
best_row_label = unpivoted_df['accuracy'].idxmax()
best_config = unpivoted_df.loc[best_row_label]
print(best_config)

learning_rate     0.01
batch_size       16.00
epochs           10.00
accuracy          0.95
Name: 9, dtype: float64


In [None]:
#retrain with the best hyperparameters found by the search above.
#The values are read from best_config instead of being typed in by hand, so
#this cell stays correct when a re-run of the search picks a different winner.
#best_config stores every value as a float, hence the int() casts.
best_lr = float(best_config['learning_rate'])
best_batch_size = int(best_config['batch_size'])
best_epochs = int(best_config['epochs'])

#seed before building the model so the retrained network is reproducible
keras.utils.set_random_seed(42)

#reuse create_model so the architecture is defined in exactly one place
best_model = create_model(learning_rate=best_lr)

best_model.fit(X_train, y_train,
               epochs=best_epochs,
               batch_size=best_batch_size,
               validation_data=(X_valid, y_valid),
               verbose=0)

<keras.src.callbacks.history.History at 0x7db0b7a6e150>

In [None]:
#get evaluation metrics on the held-out test set
y_pred = (best_model.predict(X_test)>0.5).astype(int)

#store the results under their own names: assigning to accuracy_score,
#precision_score, recall_score or f1_score would overwrite the imported
#sklearn functions, so re-running this cell (or the tuning cell) would fail
#with "'numpy.float64' object is not callable"
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)


#print metrics
print("\nEvaluation Metrics:")
print(f"Accuracy : {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall   : {test_recall:.4f}")
print(f"F1 Score : {test_f1:.4f}\n")

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Evaluation Metrics:
Accuracy : 0.9370
Precision: 0.9423
Recall   : 0.9026
F1 Score : 0.9220



# References

Hopkins, M., Reeber, E., Forman, G., Suermondt, J., 1999. Spambase. [online]. Available at: https://archive.ics.uci.edu/dataset/94. [Accessed 5 March 2024].