importing libraries

In [2]:
import pandas as pd

In [3]:
# Load the dataset
emails = pd.read_csv('emails.csv')

# Helper function that converts text to lowercase and splits words into a list
def process_email(text):
    """
    Processes the given email text by converting it to lowercase, splitting it into words,
    and returning a list of unique words.

    Parameters:
    - text (str): The email text to be processed.

    Returns:
    - list: A list of unique words extracted from the email text.
    """

    text = text.lower()
    return list(set(text.split()))

# Create an extra column with the text converted to a lower-cased list of words
emails['words'] = emails['text'].apply(process_email)

# Show the first 5 rows
emails.head(5)

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[formats, -, your, are, organization, ', %, sp..."
1,Subject: the stock trading gunslinger fanny i...,1,"[tight, muzo, waterway, morristown, penultimat..."
2,Subject: unbelievable new homes made easy im ...,1,"[unconditionally, being, in, limited, -, we, t..."
3,Subject: 4 color printing special request add...,1,"[goldengraphix, printable, -, message, ramsey,..."
4,"Subject: do not have money , get software cds ...",1,"[death, get, comedies, old, along, subject:, a..."


In [5]:
def word_freq_per_class(df):

    
    word_freq_dict = {}
    
    for _, row in df.iterrows():
        words = row['words']
        # Iterate over the words in each email
        for word in words:
            # Check if word doesn't exist within the dictionary
            if word not in word_freq_dict:
                word_freq_dict[word] = {'spam': 0, 'ham': 0}
            
            # Check if the email was spam
            match row['spam']:
                case 0: 
                    
                    word_freq_dict[word]['ham'] += 1
                case 1: 
                    # If spam then add 1 to the count of spam
                    word_freq_dict[word]['spam'] += 1
                    


    return word_freq_dict

In [6]:
word_freq = word_freq_per_class(emails)
print(f"Frequency in both classes for word 'lottery': {word_freq['lottery']}\n")
print(f"Frequency in both classes for word 'sale': {word_freq['sale']}\n")

try:
    word_freq['asdfg']
except KeyError:
    print("Word 'asdfg' not in corpus")

Frequency in both classes for word 'lottery': {'spam': 8, 'ham': 0}

Frequency in both classes for word 'sale': {'spam': 38, 'ham': 41}

Word 'asdfg' not in corpus


In [7]:
def class_frequencies(df):
   

    
    class_freq_dict = { 
        "spam": len(df[df['spam'] == 1]),
        "ham": len(df[df['spam'] == 0])
    } 
    

    
    return class_freq_dict

In [8]:
class_freq = class_frequencies(emails)
print(f"The frequencies for each class are {class_freq}\n")
print(f"The proportion of spam in the dataset is: {100*class_freq['spam']/len(emails):.2f}%\n")
print(f"The proportion of ham in the dataset is: {100*class_freq['ham']/len(emails):.2f}%")

The frequencies for each class are {'spam': 1368, 'ham': 4360}

The proportion of spam in the dataset is: 23.88%

The proportion of ham in the dataset is: 76.12%


In [9]:
def naive_bayes_classifier(text, word_freq=word_freq, class_freq=class_freq):


    text = text.lower()
    words = set(text.split())
    cumulative_product_spam = 1.0
    cumulative_product_ham = 1.0
    
    ### START CODE HERE ###
    
    # Iterate over the words in the email
    for word in words:
        # You should only include words that exist in the corpus in your calculations
        if word in word_freq:
            word_freq_dict = word_freq[word]
            spam_count = word_freq_dict['spam']
            ham_count = word_freq_dict['ham']
            cumulative_product_spam *= spam_count / class_freq['spam']
            cumulative_product_ham *= ham_count / class_freq['ham']
    
     # Calculate the likelihood of the words appearing in the email given that it is spam
    likelihood_word_given_spam = cumulative_product_spam * (class_freq['spam'] / (class_freq['spam'] + class_freq['ham']))
    
    # Calculate the likelihood of the words appearing in the email given that it is ham
    likelihood_word_given_ham = cumulative_product_ham * (class_freq['ham'] / (class_freq['spam'] + class_freq['ham']))
    
    # Calculate the posterior probability of the email being spam given that the words appear in the email (the probability of being a spam given the email content)
    prob_spam = likelihood_word_given_spam / (likelihood_word_given_spam + likelihood_word_given_ham)
    
    ### END CODE HERE ###
    
    return prob_spam

In [10]:
msg = "enter the lottery to win three million dollars"
print(f"Probability of spam for email '{msg}': {100*naive_bayes_classifier(msg):.2f}%\n")

msg = "meet me at the lobby of the hotel at nine am"
print(f"Probability of spam for email '{msg}': {100*naive_bayes_classifier(msg):.2f}%\n")

msg = "9898 asjfkjfdj"
print(f"Probability of spam for email '{msg}': {100*naive_bayes_classifier(msg):.2f}%\n")

Probability of spam for email 'enter the lottery to win three million dollars': 100.00%

Probability of spam for email 'meet me at the lobby of the hotel at nine am': 0.00%

Probability of spam for email '9898 asjfkjfdj': 23.88%

