In [1]:
import pandas as pd
import numpy as np



In [2]:
# Load the e-mail dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='emails.csv')


In [3]:
# Peek at the first five rows (the default for head) to see the column layout.
data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Print a summary of the frame: index range (row count), number of columns,
# dtype breakdown and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [5]:
# Show the last two rows of the frame.
data.tail(n=2)

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1
5171,Email 5172,22,24,5,1,6,5,148,8,2,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Features are every column between the first one and the last one;
# the last column is used as the label.
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [7]:
# Positional (unshuffled) hold-out split: the first TRAIN_SIZE rows are used
# for training and every remaining row for testing.
TRAIN_SIZE = 4500
X_train, X_test = X.iloc[:TRAIN_SIZE], X.iloc[TRAIN_SIZE:]
y_train, y_test = y.iloc[:TRAIN_SIZE], y.iloc[TRAIN_SIZE:]

In [8]:
class NaiveBayesClassifier:
    """Naive Bayes classifier over per-feature count columns.

    Each class is scored as ``sum(count * log_likelihood) + log(prior)`` and
    the highest-scoring class is predicted.

    Parameters
    ----------
    alpha : float, default 1
        Additive smoothing constant added to every per-class feature total
        before normalising. Must be non-negative.

    Attributes
    ----------
    priors : dict
        Maps each class label to its relative frequency in the training
        labels. Empty until ``fit`` is called.
    likelihoods : pandas.DataFrame
        Log of the smoothed, per-class normalised feature totals (one row per
        class, one column per feature). An empty dict until ``fit`` is called.
    """

    def __init__(self, alpha=1):
        # A negative constant can make smoothed totals negative, which turns
        # the log-likelihoods into NaN and silently corrupts predictions.
        if alpha < 0:
            raise ValueError("alpha must be non-negative, got {!r}".format(alpha))
        self.alpha = alpha
        self.priors = {}
        self.likelihoods = {}

    def fit(self, X, y):
        """Estimate class priors and smoothed log-likelihoods.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature counts, one row per sample.
        y : pandas.Series
            Class label for each row of ``X`` (aligned with ``X`` by index).
        """
        self.priors = y.value_counts(normalize=True).to_dict()

        # Per-class feature totals, smoothed so that a feature never seen in
        # a class does not get a zero probability (for alpha > 0).
        smoothed_counts = X.groupby(y).sum() + self.alpha
        class_totals = smoothed_counts.sum(axis=1)
        self.likelihoods = np.log(smoothed_counts.div(class_totals, axis=0))

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature counts with the same column labels used in ``fit``.
            Columns are matched by label; columns unknown to the model are
            ignored and missing ones contribute nothing to the score.

        Returns
        -------
        list
            One predicted class label per row, in row order. Ties go to the
            class that comes first in ``priors``.

        Raises
        ------
        ValueError
            If the classifier has not been fitted and ``X`` is non-empty.
        """
        if len(X) == 0:
            return []
        if not self.priors:
            raise ValueError(
                "NaiveBayesClassifier must be fitted before calling predict()"
            )

        class_labels = list(self.priors)

        # Align the input to the training feature order by label. Features
        # absent from X become NaN and are skipped by nansum below.
        features = X.reindex(columns=self.likelihoods.columns).to_numpy(dtype=float)

        # 0 * -inf (possible when alpha == 0) yields NaN; it is skipped by
        # nansum, so only silence the warning it triggers.
        with np.errstate(invalid="ignore"):
            scores = np.column_stack([
                np.nansum(
                    features * self.likelihoods.loc[label].to_numpy(dtype=float),
                    axis=1,
                ) + np.log(self.priors[label])
                for label in class_labels
            ])

        # argmax returns the first maximum, so ties follow the priors order.
        best_positions = scores.argmax(axis=1)
        return [class_labels[pos] for pos in best_positions]

In [9]:
# Candidate smoothing constants to try, plus running trackers for the best
# accuracy seen so far and the alpha that produced it.
alphas = [0.1, 0.5, 1, 2, 5]
best_accuracy, best_alpha = 0, 0

In [10]:

# Train one classifier per candidate alpha, score it on the held-out rows and
# remember the best-scoring alpha.
# NOTE(review): alpha is selected on the same held-out rows that are used to
# report accuracy, so the reported best accuracy is likely optimistic.
for alpha in alphas:
    classifier = NaiveBayesClassifier(alpha=alpha)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Element-wise comparison against the label Series gives booleans; their
    # mean is the fraction of correct predictions.
    accuracy = (y_test == predictions).mean()
    print("Accuracy with alpha={}: {:.2f}%".format(alpha, 100 * accuracy))

    # Strict '>' keeps the earliest alpha when two candidates tie.
    if accuracy > best_accuracy:
        best_accuracy, best_alpha = accuracy, alpha

Accuracy with alpha=0.1: 89.14%
Accuracy with alpha=0.5: 88.39%
Accuracy with alpha=1: 87.65%
Accuracy with alpha=2: 87.35%
Accuracy with alpha=5: 86.61%


In [11]:
# Report the best accuracy found by the sweep and the alpha that achieved it.
print("\nThe best model Accuracy is: {:.2f}% with alpha = {}".format(100 * best_accuracy, best_alpha))



The best model Accuracy is: 89.14% with alpha = 0.1
