Training and test splitting

In [6]:
import pandas as pd
import numpy as np

# Read the raw table from disk.
file_path = 'data/emails.csv'
emails_data = pd.read_csv(file_path)

# 'Email No.' is not used as a model input, so remove it up front.
emails_data_cleaned = emails_data.drop(columns='Email No.')

# Labels come from the 'Prediction' column; every other column is a feature.
y = emails_data_cleaned['Prediction'].values
X = emails_data_cleaned.drop(columns='Prediction').values

# Fix the global NumPy seed so the shuffle below (and therefore the split)
# is the same on every run.
np.random.seed(42)

# Build a shuffled ordering of the row positions 0 .. n_rows-1.
indices = np.arange(X.shape[0])
np.random.shuffle(indices)

# The first 80% of the shuffled positions form the training set,
# the remaining 20% the test set.
split_index = int(0.8 * len(indices))
train_indices, test_indices = indices[:split_index], indices[split_index:]

X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# Sanity check: the row counts of the two splits should add up to the
# number of rows in X, and both X splits should keep every feature column.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(4137, 3000) (1035, 3000) (4137,) (1035,)


Next step: train a Naive Bayes spam classifier and measure its test accuracy

In [None]:
import numpy as np

# calculate prior probabilities
def calculate_prior(y):
    """Estimate the prior probability of each class from a label vector.

    The spam prior is ``sum(y) / len(y)``, i.e. the fraction of spam
    e-mails when the labels are encoded as 0/1 with 1 meaning spam
    (assumed encoding -- the function itself does not check it).

    Parameters
    ----------
    y : array-like
        Class labels, one per e-mail.

    Returns
    -------
    tuple
        ``(spam_prior, not_spam_prior)`` where the second value is
        ``1 - spam_prior``.
    """
    n_emails = len(y)
    label_total = np.sum(y)
    p_spam = label_total / n_emails
    return p_spam, 1 - p_spam

# calculate likelihoods
def calculate_likelihood(X, y, alpha=1):
    """Estimate per-feature likelihoods for each class with additive smoothing.

    For every class the likelihood of feature ``j`` is::

        (count of feature j in the class + alpha)
        / (total count of all features in the class + alpha * n_features)

    With the default ``alpha=1`` this is Laplace (add-one) smoothing, so a
    feature that never occurs in a class still gets a non-zero likelihood
    and its logarithm stays finite.

    Parameters
    ----------
    X : array-like, shape (n_emails, n_features)
        Feature counts, one row per e-mail.
    y : array-like, shape (n_emails,)
        Class labels. Rows where ``y == 1`` are treated as spam and rows
        where ``y == 0`` as not spam.
    alpha : int or float, default 1
        Additive smoothing strength. ``alpha=0`` disables smoothing, in
        which case unseen features get likelihood 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(spam_likelihood, not_spam_likelihood)``, each of shape
        ``(n_features,)``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Coerce to arrays so that boolean masking works even when plain Python
    # lists are passed (``[1, 0] == 1`` would otherwise be a scalar False).
    X = np.asarray(X)
    y = np.asarray(y)
    n_features = X.shape[1]

    def _smoothed_likelihood(label):
        # Select the rows of this class once and reuse them for both the
        # per-feature counts and the overall total.
        class_rows = X[y == label]
        feature_counts = class_rows.sum(axis=0)
        return (feature_counts + alpha) / (class_rows.sum() + alpha * n_features)

    return _smoothed_likelihood(1), _smoothed_likelihood(0)

# predict class for a single email
def predict_email(email, spam_prior, not_spam_prior, spam_likelihood, not_spam_likelihood):
    """Classify a single e-mail by comparing the two class log-scores.

    Each class score is the log prior plus the sum over features of
    ``email[j] * log(likelihood[j])``. Working in log space turns the
    product of many small probabilities into a sum.

    Parameters
    ----------
    email : array-like, shape (n_features,)
        Feature values of one e-mail.
    spam_prior, not_spam_prior : float
        Prior probability of each class.
    spam_likelihood, not_spam_likelihood : array-like, shape (n_features,)
        Per-feature likelihoods of each class.

    Returns
    -------
    int
        ``1`` when the spam score is strictly greater than the not-spam
        score, otherwise ``0`` (so ties are labelled not spam).
    """
    log_score_spam = np.log(spam_prior) + np.sum(email * np.log(spam_likelihood))
    log_score_not_spam = np.log(not_spam_prior) + np.sum(email * np.log(not_spam_likelihood))

    if log_score_spam > log_score_not_spam:
        return 1
    return 0

# predict class for the entire dataset
def predict(X, spam_prior, not_spam_prior, spam_likelihood, not_spam_likelihood):
    """Classify every e-mail in ``X`` as spam (1) or not spam (0).

    Applies the same decision rule as ``predict_email`` to each row: the
    class score is the log prior plus ``sum(email * log(likelihood))``, and
    an e-mail is labelled spam only when its spam score is strictly greater
    than its not-spam score.

    The logarithms of the priors and likelihoods do not depend on the
    e-mail, so they are computed once here instead of once per row.

    Parameters
    ----------
    X : iterable of array-like, shape (n_emails, n_features)
        Feature values, one row per e-mail.
    spam_prior, not_spam_prior : float
        Prior probability of each class.
    spam_likelihood, not_spam_likelihood : array-like, shape (n_features,)
        Per-feature likelihoods of each class.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 predictions, one per row of ``X``. The dtype
        is integer even when ``X`` is empty.
    """
    # Loop-invariant terms: hoisted so np.log runs four times in total
    # rather than four times per e-mail.
    log_spam_prior = np.log(spam_prior)
    log_not_spam_prior = np.log(not_spam_prior)
    log_spam_likelihood = np.log(spam_likelihood)
    log_not_spam_likelihood = np.log(not_spam_likelihood)

    predictions = []
    for email in X:
        spam_score = log_spam_prior + np.sum(email * log_spam_likelihood)
        not_spam_score = log_not_spam_prior + np.sum(email * log_not_spam_likelihood)
        predictions.append(1 if spam_score > not_spam_score else 0)

    # Explicit dtype keeps the result integer for an empty input, where
    # np.array([]) would otherwise default to float64.
    return np.array(predictions, dtype=int)

# Fit: estimate the class priors and the per-feature likelihoods
# from the training split only.
spam_prior, not_spam_prior = calculate_prior(y_train)
spam_likelihood, not_spam_likelihood = calculate_likelihood(X_train, y_train)

# Predict: label every e-mail in the held-out test split.
y_pred = predict(
    X_test,
    spam_prior,
    not_spam_prior,
    spam_likelihood,
    not_spam_likelihood,
)

# Evaluate: accuracy is the fraction of test e-mails whose predicted
# label equals the true label.
accuracy = (y_pred == y_test).mean()
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.94
