### 1.

In [1]:
# Load the training.txt validation.txt test1.txt and test.txt dataset files into Python data frames and the two list files into lists.
# (Please do NOT shuffle the data frames)

import pandas as pd

training_df = pd.read_csv('training.txt')
validation_df = pd.read_csv('validation.txt')
test1_df = pd.read_csv('test1.txt')
test2_df = pd.read_csv('test2.txt')


# Load censored lists
def load_list(file_name):
    with open(file_name, "r") as f:
        return [line.strip() for line in f]

censored_list_test1 = load_list("censored_list_test1.txt")
censored_list_test2 = load_list("censored_list_test2.txt")

### 2.

In [2]:
# Step 2: Pre-process the SMS messages
import string

def preprocess_message(message):
    # Remove punctuation and numbers
    message = message.translate(str.maketrans("", "", string.punctuation + string.digits))
    # Convert to lowercase
    return message.lower()

# Apply the preprocessing function to all datasets
training_df["sms"] = training_df["sms"].apply(preprocess_message)
validation_df["sms"] = validation_df["sms"].apply(preprocess_message)
test1_df["sms"] = test1_df["sms"].apply(preprocess_message)
test2_df["sms"] = test2_df["sms"].apply(preprocess_message)

# Display the first few rows of each dataset to ensure preprocessing is correct
print(training_df.head())



  label                                                sms
0   ham  hi darlin i cantdo anythingtomorrow as myparen...
1   ham                  kkhow about your training process
2   ham  k actually can you guys meet me at the sunoco ...
3   ham                             ok lor msg me b u call
4  spam                    freemsgfav xmas tonesreply real


### 4

In [60]:
import numpy as np
import pandas as pd
class NaiveBayesForSpam:
    def train (self, hamMessages, spamMessages):
        self.words = set(' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
        self.likelihoods = np.array (self.likelihoods).T
        
    def train2 (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        spamkeywords = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            if prob1 * 20 < prob2:
                self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
                spamkeywords.append(w)
        self.words = spamkeywords
        self.likelihoods = np.array (self.likelihoods).T

    def predict (self, message):
        posteriors = np.copy (self.priors)
        for i, w in enumerate (self.words):
            if w in message.lower():  # convert to lower-case
                posteriors *= self.likelihoods[:,i]
            else:                                   
                posteriors *= np.ones (2) - self.likelihoods[:,i]
            posteriors = posteriors / np.linalg.norm (posteriors)  # normalise
        if posteriors[1] > 0.5:
            return ['spam', posteriors[1]]
        return ['ham', posteriors[0]]    

    def score(self, messages, labels):
        confusion = np.zeros(4).reshape(2, 2)
        for m, l in zip(messages, labels):
            prediction = self.predict(m)[0]  # Call self.predict only once
            if prediction == 'ham' and l == 'ham':
                confusion[0, 0] += 1
            elif prediction == 'ham' and l == 'spam':
                confusion[0, 1] += 1
            elif prediction == 'spam' and l == 'ham':
                confusion[1, 0] += 1
            elif prediction == 'spam' and l == 'spam':
                confusion[1, 1] += 1
        return (confusion[0, 0] + confusion[1, 1]) / float(confusion.sum()), confusion



### 5

In [57]:
import time
# Separate ham and spam messages
training_hamMessages = training_df[training_df["label"] == "ham"]["sms"].tolist()
training_spamMessages = training_df[training_df["label"] == "spam"]["sms"].tolist()

start_time1 = time.time()
classifier1 = NaiveBayesForSpam()
classifier1.train(training_hamMessages, training_spamMessages)
train_time1 = time.time() - start_time1

start_time2 = time.time()
classifier2 = NaiveBayesForSpam()
classifier2.train2(training_hamMessages, training_spamMessages)
train_time2 = time.time() - start_time2
print("Training completed for both classifiers.")
print("Training time for train1 classifier:", train_time1)
print("Training time for train2 classifier:", train_time2)

Training completed for both classifiers.
Training time for train1 classifier: 1.5821468830108643
Training time for train2 classifier: 0.9254882335662842


### 6

In [58]:
# Step 3: Evaluate classifiers on the validation set
validation_messages = validation_df["sms"].tolist()
validation_labels = validation_df["label"].tolist()

accuracy1, confusion1 = classifier1.score(validation_messages, validation_labels)
accuracy2, confusion2 = classifier2.score(validation_messages, validation_labels)

print("Classifier 1 (train):")
print("Accuracy:", accuracy1)
print("Confusion Matrix:\n", confusion1)

print("Classifier 2 (train2):")
print("Accuracy:", accuracy2)
print("Confusion Matrix:\n", confusion2)

Classifier 1 (train):
Accuracy: 0.956
Confusion Matrix:
 [[845.  29.]
 [ 15. 111.]]
Classifier 2 (train2):
Accuracy: 0.963
Confusion Matrix:
 [[857.  34.]
 [  3. 106.]]


### 7

### 8

number of false positives: confusion[1, 0]
just need to edit this threshold posteriors[0] > 0.8 
!! nothing changed with diff threshold

In [61]:
# Evaluate classifiers on the validation set with adjusted threshold for producing less false positive

accuracy1_adj, confusion1_adj = classifier1.score(validation_messages, validation_labels)
accuracy2_adj, confusion2_adj = classifier2.score(validation_messages, validation_labels)

print("Classifier 1 (train):")
print("Accuracy:", accuracy1_adj)
print("Confusion Matrix:\n", confusion1_adj)

print("Classifier 2 (train2):")
print("Accuracy:", accuracy2_adj)
print("Confusion Matrix:\n", confusion2_adj)

Classifier 1 (train):
Accuracy: 0.956
Confusion Matrix:
 [[845.  29.]
 [ 15. 111.]]
Classifier 2 (train2):
Accuracy: 0.963
Confusion Matrix:
 [[857.  34.]
 [  3. 106.]]


### 9

In [40]:
class NaiveBayesForSpam2:
    def train(self, hamMessages, spamMessages):
        self.words = set(' '.join(hamMessages + spamMessages).split())
        self.priors = np.zeros(2)
        self.priors[0] = float(len(hamMessages)) / (len(hamMessages) + len(spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        for i, w in enumerate(self.words):
            prob1 = (1.0 + len([m for m in hamMessages if w in m])) / len(hamMessages)
            prob2 = (1.0 + len([m for m in spamMessages if w in m])) / len(spamMessages)
            self.likelihoods.append([min(prob1, 0.95), min(prob2, 0.95)])
        self.likelihoods = np.array(self.likelihoods).T

    def train2(self, hamMessages, spamMessages):
        self.words = set(' '.join(hamMessages + spamMessages).split())
        self.priors = np.zeros(2)
        self.priors[0] = float(len(hamMessages)) / (len(hamMessages) + len(spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        spamkeywords = []
        for i, w in enumerate(self.words):
            prob1 = (1.0 + len([m for m in hamMessages if w in m])) / len(hamMessages)
            prob2 = (1.0 + len([m for m in spamMessages if w in m])) / len(spamMessages)
            if prob1 * 20 < prob2:
                self.likelihoods.append([min(prob1, 0.95), min(prob2, 0.95)])
                spamkeywords.append(w)
        self.words = spamkeywords
        self.likelihoods = np.array(self.likelihoods).T

    def predict(self, message, censored_words=None):
        if censored_words is None:
            censored_words = []
        posteriors = np.copy(self.priors)
        for i, w in enumerate(self.words):
            if w in censored_words:
                continue  # Skip likelihood calculation for censored words
            if w in message.lower():
                posteriors *= self.likelihoods[:, i]
            else:
                posteriors *= np.ones(2) - self.likelihoods[:, i]
            posteriors = posteriors / np.linalg.norm(posteriors)
        if posteriors[0] > 0.5:
            return ['ham', posteriors[0]]
        return ['spam', posteriors[1]]

    def score(self, messages, labels, censored_words=None):
        confusion = np.zeros(4).reshape(2, 2)
        for m, l in zip(messages, labels):
            prediction = self.predict(m, censored_words=censored_words)[0]
            if prediction == 'ham' and l == 'ham':
                confusion[0, 0] += 1
            elif prediction == 'ham' and l == 'spam':
                confusion[0, 1] += 1
            elif prediction == 'spam' and l == 'ham':
                confusion[1, 0] += 1
            elif prediction == 'spam' and l == 'spam':
                confusion[1, 1] += 1
        return (confusion[0, 0] + confusion[1, 1]) / float(confusion.sum()), confusion


### 10


In [41]:
# Step 2: Train the classifiers

classifier1 = NaiveBayesForSpam2()
classifier1.train(training_hamMessages, training_spamMessages)


classifier2 = NaiveBayesForSpam2()
classifier2.train2(training_hamMessages, training_spamMessages)

print("Training completed for both classifiers.")



Training completed for both classifiers.


In [43]:
# Step 3: Evaluate classifiers on test1 with censored words
test1_messages = test1_df["sms"].tolist()
test1_labels = test1_df["label"].tolist()


accuracy1, confusion1 = classifier1.score(test1_messages, test1_labels, censored_words=censored_list_test1)
accuracy2, confusion2 = classifier2.score(test1_messages, test1_labels, censored_words=censored_list_test1)

print("Classifier 1 (train) on test1 with censored words:")
print("Accuracy:", accuracy1)
print("Confusion Matrix:\n", confusion1)


print("Classifier 2 (train2) on test1 with censored words:")
print("Accuracy:", accuracy2)
print("Confusion Matrix:\n", confusion2)



Classifier 1 (train) on test1 with censored words:
Accuracy: 0.9688715953307393
Confusion Matrix:
 [[1099.   29.]
 [  11.  146.]]
Classifier 2 (train2) on test1 with censored words:
Accuracy: 0.9735408560311284
Confusion Matrix:
 [[1106.   30.]
 [   4.  145.]]


### 11

In [44]:
# Step 3: Evaluate classifiers on test1 with censored words
test2_messages = test2_df["sms"].tolist()
test2_labels = test2_df["label"].tolist()


accuracy1, confusion1 = classifier1.score(test2_messages, test2_labels, censored_words=censored_list_test2)
accuracy2, confusion2 = classifier2.score(test2_messages, test2_labels, censored_words=censored_list_test2)

print("Classifier 1 (train) on test2 with censored words:")
print("Accuracy:", accuracy1)
print("Confusion Matrix:\n", confusion1)


print("Classifier 2 (train2) on test12 with censored words:")
print("Accuracy:", accuracy2)
print("Confusion Matrix:\n", confusion2)


Classifier 1 (train) on test2 with censored words:
Accuracy: 0.9618973561430794
Confusion Matrix:
 [[1091.   38.]
 [  11.  146.]]
Classifier 2 (train2) on test12 with censored words:
Accuracy: 0.9611197511664075
Confusion Matrix:
 [[1098.   46.]
 [   4.  138.]]
