# SENG 474
# Assignment 2 - Problem 3
# Nolan Kurylo
# V00893175
To execute notebook, ensure ALL cells are run from top to bottom 

The Naive Bayes classifier required use of the following formulas:

A Priori Probabilities:
$$ 
P(spam) = \frac{\#\ spam\ messages}{\#\ total\ messages}
$$
$$
P(ham) = \frac{\#\ ham\ messages}{\#\ total\ messages}  
$$

Conditional Probabilities:
$$ 
P(w_i|spam) = \frac{\#\ spam\ messages\ containing\ word\ w_i}{\#\ spam\ messages} 
$$
$$
P(w_i|ham) = \frac{\#\ ham\ messages\ containing\ word\ w_i}{\#\ ham\ messages} 
$$

A Posteriori Probabilities:

$$
P(spam|w_1,w_2,...,w_n) = P(spam) \prod_i^n P(w_i|spam) 
$$
$$
P(ham|w_1,w_2,...,w_n) = P(ham) \prod_i^n P(w_i|ham) 
$$

Bayes Decision Function:

If:
$$
P(spam|w_1,w_2,...,w_n) > P(ham|w_1,w_2,...,w_n)
$$
Decide: Spam

Otherwise 
$$
P(spam|w_1,w_2,...,w_n) < P(ham|w_1,w_2,...,w_n)
$$
Decide: Ham

References
1) https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html
2) https://towardsdatascience.com/na%C3%AFve-bayes-spam-filter-from-scratch-12970ad3dae7

In [1]:
import pandas as pd
import numpy as np
np.random.seed(1337)


# Slighty modified function I made for A1
def split_training_validation_sets(df): 
    """ Find 70% of the original dataset as the training set, 30% as the validaiton set
    :param df: dataframe to be split
    :return: training and validation splits as dataframes
    """
    shuffled_dataset = df.sample(frac=1).reset_index(drop=True) # shuffle the dataset
    
    split_70_30 = int(df.shape[0] * 0.7) # find the index for the 70 / 30 split to split on

    training_set = shuffled_dataset.iloc[:split_70_30].reset_index(drop=True) # 70% of dataset
    training_set['MSG'] = training_set['MSG'].str.lower().replace('[^\s\da-zA-Z]', ' ', regex = True) # replace all no alphanumeric characters with a space and convert to lowercase
    training_set['MSG'] = training_set['MSG'].str.split()

    validation_set = shuffled_dataset.iloc[split_70_30:].reset_index(drop=True) # 30% of dataset
    validation_set['MSG'] = validation_set['MSG'].str.lower().replace('[^\s\da-zA-Z]', ' ', regex = True) # replace all no alphanumeric characters with a space and convert to lowercase
    validation_set['MSG'] = validation_set['MSG'].str.split()

    return training_set, validation_set

def fit(train_X, train_y): 
    """ Finds the conditional probabilities of each word i given if it was a spam or ham message as well as the a priori probabilities for spam and ham across the dataset
    :param train_X: training set
    :param train_y: training label vector
    :return: P(wi|spam), P(wi|ham), P(spam), P(ham)
    """
    print("Training...(takes ~20-25 seconds)")

    words = [] # list of all unique words in train_X
    for index, bag in train_X.iteritems(): #find all unique words in train_X and add each to a list
        for word in bag: # each row is a bag of words
            if word not in words: # make sure unique words added to list only
                words.append(word)

    word_counts = {} # dict for each word in the dataset that contains an array of counts for the number of times it appears in each row (bag)
    Pwi_spam = {} # dict for each word in the dataset that will eventually be given a conditional probability of that word appearing given that it is a spam message
    Pwi_ham= {} # dict for each word in the dataset that will eventually be given a conditional probability of that word appearing given that it is a ham message

    num_rows = len(train_X)
    
    for word in words: 
        word_counts[word] = [0] * num_rows # for each unique word in the dataset, initialize the number of times the word is present to 0 for each row of training set
        Pwi_spam[word] = 0 # for each unique word in the dataset, initialize its conditional probaility of a word being a spam word given that it is spam
        Pwi_ham[word] = 0 # for each unique word in the dataset, initialize its conditional probaility of a word being a ham word given that it is ham
    
    for index, bag in train_X.iteritems(): # for each bag in the training set, increment the number of times each unique word is present
        for word in bag:
            word_counts[word][index] += 1

    train_X = pd.DataFrame(word_counts) # transform word counts matrix to a dataframe where each unique word is a column and each row is the number of times that column word is present mapping to each row in train_X

    spams = train_X.iloc[np.where(train_y == 'spam')[0]] # get all spam messages
    num_spam = len(spams)

    hams = train_X.iloc[np.where(train_y == 'ham')[0]] # get all ham messages
    num_ham = len(hams)

    Pspam = num_spam / num_rows # P(spam) = # spam messages / total # messages -> a priori
    Pham = num_ham / num_rows # P(ham) = # ham messages / total # messages -> a priori

    for word in words: #find conditional probability of each word given than it is ham or spam
        Pwi_spam[word] = spams[word].sum() / num_spam # P(wi|spam) = #spam messages containing word / # spam messages
        Pwi_ham[word] = hams[word].sum() / num_ham # P(wi|ham) = #ham messages containing word / # ham messages

    return Pwi_spam, Pwi_ham, Pspam, Pham


def predict(test_X, test_y, Pwi_spam, Pwi_ham, Pspam, Pham): 
    """ Finds the accuarcy of the naive bayes classifier by calculating the a posterior probabilities for each bag of words
    :param test_X: testing set
    :param test_y: testing label vector
    :param Pwi_spam: P(wi|spam)
    :param Pwi_ham: P(wi|ham)
    :param Pspam: P(spam)
    :param Pham: P(ham)
    :return: accuracy (%)

    """
    num_correct = 0 # keep track of number of correctly predicted messages
    for index, bag in test_X.iteritems():
        Pspam_wi = [] # P(spam|w1,w2,...,wn) = P(spam) * mult[P(wi|spam)]
        Pham_wi = [] # P(ham|w1,w2,...,wn) = P(ham) * mult[P(wi|ham)]

        actual = test_y.loc[index] #spam or ham
        predicted = "" 
        for word in bag: 
            if word in Pwi_spam:
                Pspam_wi.append(Pwi_spam[word]) # add all P(wi|ham) for the current bag to a list
            if word in Pwi_ham: 
                Pham_wi.append(Pwi_ham[word]) # add all P(wi|ham) for the current bag to a list

        # Find A posteriori probabilities for given bag
        Pspam_w = Pspam * np.prod(Pspam_wi) # P(spam) mult[P(wi|spam)]
        Pham_w = Pham * np.prod(Pham_wi) # P(ham) mult[P(wi|ham)]

        if(Pspam_w > Pham_w ): # decision function for spam
            predicted = "spam"
        elif(Pham_w > Pspam_w): # decision function for ham
            predicted = "ham"

        if(actual == predicted):
            num_correct += 1

    num_rows = len(test_X)
    accuracy = (num_correct / num_rows) * 100
    return accuracy
 
# MAIN Program

df = pd.read_csv('SMSSpamCollection', sep='\t', names=['TARGET', 'MSG']) # captilized incase these names are present in the csv (csv will be converted to lower case)

training_set, validation_set = split_training_validation_sets(df)

train_X = training_set['MSG']
train_y = training_set['TARGET']

test_X = validation_set['MSG']
test_y = validation_set['TARGET']

Pwi_spam, Pwi_ham, Pspam, Pham = fit(train_X, train_y) # train the naive bayes classifier using the training set

training_acc = predict(train_X, train_y, Pwi_spam, Pwi_ham, Pspam, Pham) # validate the naive bayes classifier on the validation test
training_err = 100 - training_acc

validation_acc = predict(test_X, test_y, Pwi_spam, Pwi_ham, Pspam, Pham) # validate the naive bayes classifier on the validation test
validation_err = 100 - validation_acc

print()
print("Training Accuracy = " + str (training_acc)  + "%")
print("Training Error = " + str (training_err) + "%")
print()
print("Validation Accuracy = " + str (validation_acc)  + "%")
print("Validation Error = " + str (validation_err) + "%")






Training...(takes ~20-25 seconds)

Training Accuracy = 99.53846153846155%
Training Error = 0.4615384615384528%

Validation Accuracy = 90.07177033492823%
Validation Error = 9.928229665071768%
