Training and test splitting

In [6]:
import pandas as pd
import numpy as np

# read the raw email dataset from disk
file_path = 'data/emails.csv'
emails_data = pd.read_csv(file_path)

# 'Email No.' is dropped because it is not useful for the model
emails_data_cleaned = emails_data.drop('Email No.', axis=1)

# labels (y) come from the 'Prediction' column; every other column is a feature (X)
y = emails_data_cleaned['Prediction'].values
X = emails_data_cleaned.drop('Prediction', axis=1).values

# fix the global NumPy seed so the shuffle below is reproducible
np.random.seed(42)

# build a shuffled array of row positions
indices = np.arange(X.shape[0])
np.random.shuffle(indices)

# the first 80% of the shuffled positions go to training, the rest to testing
split_index = int(0.8 * len(indices))
train_indices, test_indices = indices[:split_index], indices[split_index:]

X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# show the resulting shapes as a quick sanity check on the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(4137, 3000) (1035, 3000) (4137,) (1035,)


Implement the model

In [9]:
import numpy as np

def calculate_prior(y):
    """Estimate the class prior probabilities from a label array.

    Parameters
    ----------
    y : array-like
        Labels, one per email. The spam count is taken as ``np.sum(y)``, so
        the labels are assumed to be 0 (not spam) / 1 (spam).

    Returns
    -------
    tuple
        ``(spam_prior, not_spam_prior)``: the fraction of emails labeled
        spam and the fraction labeled not spam. The two values sum to 1.

    Raises
    ------
    ValueError
        If ``y`` is empty. Priors are undefined without any labels, and the
        division would otherwise silently produce NaN.
    """
    total_emails = len(y)
    if total_emails == 0:
        raise ValueError("cannot estimate class priors from an empty label array")

    # summing 0/1 labels counts the spam emails; everything else is not spam
    spam_count = np.sum(y)
    not_spam_count = total_emails - spam_count

    spam_prior = spam_count / total_emails
    not_spam_prior = not_spam_count / total_emails

    return spam_prior, not_spam_prior

def calculate_likelihood(X, y, alpha=1.0):
    """Estimate smoothed per-word likelihoods for each class.

    For each class, the likelihood of word ``j`` is

        (count of word j in the class + alpha)
        / (total word count in the class + alpha * vocab_size)

    Parameters
    ----------
    X : array-like, 2-D
        Word-count matrix with one row per email and one column per word.
    y : array-like, 1-D
        Labels aligned with the rows of ``X``; rows with label 1 are treated
        as spam and rows with label 0 as not spam.
    alpha : float, default 1.0
        Additive smoothing strength. The default of 1 is Laplace smoothing,
        which keeps words unseen in a class from getting zero probability.
        With ``alpha=0`` no smoothing is applied, so zero (or, for a class
        with no words at all, NaN) likelihoods are possible.

    Returns
    -------
    tuple of numpy.ndarray
        ``(spam_likelihood, not_spam_likelihood)``, each of length
        ``X.shape[1]``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # coerce to arrays: with a plain list, `y == 1` would be a single bool
    # rather than a row mask, and the indexing below would silently misbehave
    X = np.asarray(X)
    y = np.asarray(y)

    # per-word counts inside each class
    spam_word_counts = np.sum(X[y == 1], axis=0)
    not_spam_word_counts = np.sum(X[y == 0], axis=0)

    # total number of word occurrences inside each class
    total_spam_words = np.sum(spam_word_counts)
    total_not_spam_words = np.sum(not_spam_word_counts)

    # additive smoothing over the whole vocabulary
    vocab_size = X.shape[1]
    spam_likelihood = (spam_word_counts + alpha) / (total_spam_words + alpha * vocab_size)
    not_spam_likelihood = (not_spam_word_counts + alpha) / (total_not_spam_words + alpha * vocab_size)

    return spam_likelihood, not_spam_likelihood

def predict_email(email, spam_prior, not_spam_prior, spam_likelihood, not_spam_likelihood):
    """Classify one email as spam (1) or not spam (0).

    Each class is scored in log space as the log of its prior plus the sum,
    over all words, of the word count times the log of that word's
    likelihood. The email is labeled spam only when the spam score is
    strictly greater; a tie therefore yields 0.
    """
    log_score_spam = np.log(spam_prior) + np.sum(email * np.log(spam_likelihood))
    log_score_not_spam = np.log(not_spam_prior) + np.sum(email * np.log(not_spam_likelihood))

    # 1 = spam, 0 = not spam
    return 1 if log_score_spam > log_score_not_spam else 0

def predict(X, spam_prior, not_spam_prior, spam_likelihood, not_spam_likelihood):
    """Classify every email in ``X`` as spam (1) or not spam (0).

    All rows are scored at once: the logs of the priors and likelihoods are
    computed a single time and each class score is one matrix-vector
    product, instead of re-deriving the logs for every email in a Python
    loop. The decision rule is: label 1 only when the spam log-score is
    strictly greater than the not-spam log-score (ties yield 0).

    Parameters
    ----------
    X : array-like, 2-D
        Word-count matrix with one row per email and one column per word.
    spam_prior, not_spam_prior : float
        Class prior probabilities.
    spam_likelihood, not_spam_likelihood : array-like, 1-D
        Per-word likelihoods for each class, of length ``X.shape[1]``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of 0/1 labels, one per row of ``X``.

    Notes
    -----
    The matrix product may sum terms in a different order than a per-row
    ``np.sum``, so the label could differ only for emails whose two scores
    are equal to within floating-point rounding.
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
        # nothing to classify
        return np.array([], dtype=int)

    # loop-invariant terms, computed once for the whole batch
    log_spam_likelihood = np.log(spam_likelihood)
    log_not_spam_likelihood = np.log(not_spam_likelihood)

    spam_scores = np.log(spam_prior) + X @ log_spam_likelihood
    not_spam_scores = np.log(not_spam_prior) + X @ log_not_spam_likelihood

    return (spam_scores > not_spam_scores).astype(int)

# fit: estimate the class priors and per-word likelihoods from the training split
spam_prior, not_spam_prior = calculate_prior(y_train)
spam_likelihood, not_spam_likelihood = calculate_likelihood(X_train, y_train)

# inference: label every email in the test split
y_pred = predict(
    X_test,
    spam_prior,
    not_spam_prior,
    spam_likelihood,
    not_spam_likelihood,
)

# score: fraction of test emails whose predicted label matches the true label
accuracy = np.mean(y_test == y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.94
