# AI201 Programming Assignment 2
## Naive Bayes Spam Filter

*Submitted by: Mike Allan Nillo*

### Table of Contents
- Loading of Data
- Classifier Construction and Evaluation
- Lambda Smoothing
- Improving your Classifier
- Conclusion

### Loading of Data

In [15]:
import os
import csv

# Step 1: Load the Labels
labels = {}
with open('/home/mikeallan/analytics/meng-ai/ai-201/AI201_PA2_Spam_Filter_2SAY23-24/AI201_PA2_Spam_Filter_2SAY23-24/trec06p-ai201/labels', 'r') as f:
    for line in f:
        label, rel_path = line.strip().split(' ')
        filename = os.path.basename(rel_path)
        labels[filename] = label


# Step 2: Load the Dataset
data = []
base_path = '/home/mikeallan/analytics/meng-ai/ai-201/AI201_PA2_Spam_Filter_2SAY23-24/AI201_PA2_Spam_Filter_2SAY23-24/trec06p-ai201/data'

# Walk through each directory and file in the base_path
for dirpath, dirnames, filenames in os.walk(base_path):
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        with open(file_path, 'r', errors='ignore') as f:
            # Read the file content
            content = f.read()
            # Get the label from the labels dictionary
            label = labels.get(filename, 'unknown')
            data.append((content, label))

### Classifier Construction and Evaluation

In [17]:
import random

# Assume 'data' is a list of tuples, where each tuple contains the file content and the label
random.shuffle(data)

# Calculate the index that separates the training data from the test data
split_index = int(len(data) * 0.7)

# Split the data into a training set and a test set
train_data = data[:split_index]
test_data = data[split_index:]

# Separate the texts and the labels
texts_train, labels_train = zip(*train_data)
texts_test, labels_test = zip(*test_data)

In [19]:
# Initialize dictionaries for spam and ham words
spam_words = {}
ham_words = {}

# Initialize counters for spam and ham documents
spam_docs = 0
ham_docs = 0

# Parse the documents in the training set
for text, label in zip(texts_train, labels_train):
    # Convert the text to lower case and replace commas and periods with spaces
    text = text.lower().replace(',', ' ').replace('.', ' ')
    # Split the text into words
    words = text.split()
    # Update the appropriate dictionaries
    if label == 'spam':
        spam_docs += 1
        for word in words:
            if word.isalpha():  # Check if the word contains only alphabetic characters
                spam_words[word] = spam_words.get(word, 0) + 1
    else:
        ham_docs += 1
        for word in words:
            if word.isalpha():  # Check if the word contains only alphabetic characters
                ham_words[word] = ham_words.get(word, 0) + 1

# Form the vocabulary of unique words in the training data
vocabulary = set(spam_words.keys()).union(set(ham_words.keys()))

# Count the total number of documents
total_docs = spam_docs + ham_docs

# Calculate and report the prior probabilities for spam and ham
prior_spam = spam_docs / total_docs
prior_ham = ham_docs / total_docs

print(f'Prior probability for spam: {prior_spam}')
print(f'Prior probability for ham: {prior_ham}')

Prior probability for spam: 0.7821045475147304
Prior probability for ham: 0.21789545248526968


In [20]:
# Calculate the total number of words in spam and ham documents
total_spam_words = sum(spam_words.values())
total_ham_words = sum(ham_words.values())

# Calculate the total number of words in the vocabulary
total_words = len(vocabulary)

# Initialize dictionaries for spam and ham probabilities
spam_probs = {}
ham_probs = {}

# Calculate the word probabilities for spam and ham
for word in vocabulary:
    spam_probs[word] = (spam_words.get(word, 0) + 1) / (total_spam_words + total_words)
    ham_probs[word] = (ham_words.get(word, 0) + 1) / (total_ham_words + total_words)

# Define a function to classify a text as spam or ham
def classify(text):
    # Convert the text to lower case and replace commas and periods with spaces
    text = text.lower().replace(',', ' ').replace('.', ' ')
    # Split the text into words
    words = text.split()
    # Initialize the spam and ham probabilities with the prior probabilities
    spam_prob = prior_spam
    ham_prob = prior_ham
    # Update the probabilities for each word in the text
    for word in words:
        if word in vocabulary:
            spam_prob *= spam_probs[word]
            ham_prob *= ham_probs[word]
    # Return the class with the highest probability
    return 'spam' if spam_prob > ham_prob else 'ham'

In [21]:
# Initialize counters for correct and total predictions
correct_predictions = 0
total_predictions = 0

# Classify the documents in the test set
for text, true_label in zip(texts_test, labels_test):
    # Classify the text
    predicted_label = classify(text)
    # Update the counters
    if predicted_label == true_label:
        correct_predictions += 1
    total_predictions += 1

# Calculate and print the accuracy
accuracy = correct_predictions / total_predictions
print(f'Accuracy: {accuracy}')

Accuracy: 0.4299814929056138


In [22]:
def calculate_precision_recall(predictions, labels):
    # Initialize counters for true positives, false positives, and false negatives
    tp = fp = fn = 0

    # Count the true positives, false positives, and false negatives
    for predicted, true in zip(predictions, labels):
        if predicted == 'spam':
            if true == 'spam':
                tp += 1
            else:
                fp += 1
        elif true == 'spam':
            fn += 1

    # Calculate and return the precision and recall
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    return precision, recall

# Use the function to calculate the precision and recall
predictions = [classify(text) for text in texts_test]
precision, recall = calculate_precision_recall(predictions, labels_test)
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Precision: 0.7907692307692308
Recall: 0.37434173669467785
