In [1]:
FILE = 'SpamDetection.csv'

## Parse the CSV file into training and testing sets

In [2]:
import csv

all_set = []
training_set = []
testing_set = []
with open(FILE) as csv_file:
    csv_read=csv.reader(csv_file, delimiter=',')
    for i, row in enumerate(csv_read):
        if i == 0:
            continue # skip the header row
        all_set.append(row)
        if i <= 20: # First 20 are training, rest are testing
            training_set.append(row)
        else:
            testing_set.append(row)

## Seperate the training data into spam and ham

In [3]:
prob_spam = 0
prob_ham = 0
spam_set: list[str] = []
ham_set: list[str] = []
for i in training_set:
    if i[0] == 'spam':
        prob_spam += 1
        spam_set.append(i[1])
    elif i[0] == 'ham':
        prob_ham += 1
        ham_set.append(i[1])
training_sentences = spam_set + ham_set

prob_spam = prob_spam / len(training_set)
prob_ham = prob_ham / len(training_set)

print(f"Probability of ham: {prob_ham}")
print(f"Probability of spam: {prob_spam}")

Probability of ham: 0.55
Probability of spam: 0.45


## Define a basic tokenize function
This will remove punctuation and lowercase all words so that it isn't case sensitive

In [4]:
import string
def tokenize(sentence: str) -> list[str]:
    sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower() # remove punctuation and make lowercase
    return sentence.split()

# Word Counts

## Function that counts how many of each word is in the set

In [5]:
def get_word_counts(sentence_list: list[str]) -> tuple[dict[str, int], int]:
    word_counts = {}
    total_words = 0
    for sentence in sentence_list:
        sentence = tokenize(sentence)
        for word in sentence:
            total_words += 1
            if word not in word_counts:
                word_counts[word] = 1
            else:
                word_counts[word] += 1

    return word_counts, total_words

## Function that gets the amount of unique words in a set

In [6]:
def get_unique_word_count(sentence_list):
    count = 0
    for sentence in sentence_list:
        sentence = tokenize(sentence)
        for word in sentence:
            count += 1

    return count

# Probabilities

## Conditional Probability
Performs laplace smoothing to avoid 0 prob

In [7]:
def get_conditional_prob(word_counts: int, word: str, total_words_in_list: int, unique_word_count: int) -> float:

    prob = 1
    if word in word_counts:
        this_word_count = word_counts[word]
    else:
        this_word_count = 0
    prob *= ((this_word_count + 1) / (total_words_in_list + unique_word_count))

    conditional_prob = prob

    return conditional_prob

## Posterior Probability
Converts a conditional probability to a posterior probability by multiplying it by the prior

In [8]:
def get_posterior_prob(conditional_prob: int, prior_prob: int):
    posterior_prob = conditional_prob * prior_prob
    return posterior_prob

# Testing

## Define test
Get the posterior probability of the sentence for both spam and ham, compare and decide. Then, check against the truth to see if it's correct. Finally, return the total accuracy.

In [9]:
def test(test_set, spam_word_counts, spam_total_word_count, ham_word_counts, ham_total_word_count, unique_word_count, prob_spam_general, prob_ham_general):
    correct = 0
    for item in test_set:
        sentence = item[1]
        truth = item[0]

        words = tokenize(sentence)
        spam_posterior = 1
        ham_posterior = 1
        for word in words:
            spam_conditional = get_conditional_prob(spam_word_counts, word, spam_total_word_count, unique_word_count)
            spam_posterior *= get_posterior_prob(spam_conditional, prob_spam_general)

            ham_conditional = get_conditional_prob(ham_word_counts, word, ham_total_word_count, unique_word_count)
            ham_posterior *= get_posterior_prob(ham_conditional, prob_ham_general)



        if spam_posterior > ham_posterior:
            guess = "spam"
        else:
            guess = "ham"

        print(f"Sentence: {sentence}")
        print(f"Spam Prob: {spam_posterior}")
        print(f"Ham prob: {ham_posterior}")
        print(f"Guess: {guess}")
        print(f"Truth: {truth}")
        print("\n")

        if guess == truth:
            correct += 1

    return correct / len(test_set)



#### Initialize values from training set

In [10]:
spam_word_counts, spam_total_word_count = get_word_counts(spam_set)
ham_word_counts, ham_total_word_count = get_word_counts(ham_set)
unique_word_count = get_unique_word_count(ham_set + spam_set)


## Run The Test

In [11]:
accuracy = test(testing_set, spam_word_counts, spam_total_word_count, ham_word_counts, ham_total_word_count, unique_word_count, prob_spam, prob_ham)
print(f"Accuracy: {accuracy}")

Sentence: Tell where you reached
Spam Prob: 8.294857291727621e-12
Ham prob: 1.2050870120331484e-10
Guess: ham
Truth: ham


Sentence: Your gonna have to pick up a burger for yourself on your way home
Spam Prob: 8.970648332408384e-38
Ham prob: 9.005455843645994e-37
Guess: ham
Truth: ham


Sentence: As a valued customer I am pleased to advise you that for your recent review you are awarded a Bonus Prize
Spam Prob: 9.107459206397537e-56
Ham prob: 8.881175033547704e-54
Guess: ham
Truth: spam


Sentence: Urgent you are awarded a complimentary trip to EuroDisinc To claim text immediately
Spam Prob: 4.992123079612254e-34
Ham prob: 1.3590051545865772e-34
Guess: spam
Truth: spam


Sentence: Finished class where are you
Spam Prob: 1.7988847138686408e-14
Ham prob: 1.996379086199493e-13
Guess: ham
Truth: ham


Sentence: where are you how did you perform 
Spam Prob: 1.2690636825956735e-19
Ham prob: 7.01297341936823e-17
Guess: ham
Truth: ham


Sentence: you can call me now
Spam Prob: 1.88882894956207