# Email Spam Naive Bayes

## Overview

Building a naive bayes classifier from scratch that can determine whether email text should be labled spam or not spam based on its contents

## Review

A naive bayes classifier realizes the following probability:

$$P(Y|X_1,X_2,...,X_n) \propto P(Y)*P(X_1|Y)*P(X_2|Y)*...*P(X_n|Y)$$

Where $Y$ is a binary class {0,1}

Where $X_i$ is a feature of the input

The classifier decides what class each input belongs to based on highest probability from the equation above

In [13]:
#import cell
import numpy as np
import pandas as pd
import math as m
import random
import csv

## Function template

In [2]:

def prior(df):
    ham_prior = 0
    spam_prior =  0
    spam_count = 0
    ham_count = 0
    total = 0
    #counting the total spam and ham in the train data
    for i in range(len(df['label'])):
        total += 1
        if df['label'][i] == 'ham' :
            ham_count += 1
        else:
            spam_count += 1
    ham_prior=ham_count/total
    spam_prior=spam_count/total
        
    return ham_prior, spam_prior

def likelihood(df):
    ham_like_dict = {}
    spam_like_dict = {}
    data = df['text']
    scount=0
    hcount=0
    #removing subject: and characters other then the alphabet or spaces from text
    for x in range(len(data)):
        i=data[x]
        i=i.split(" ", 1)[1]
        i= "".join(c for c in data[x] if (c.isalpha() | (c == ' '))).strip()
        data[x]=i
    #splitting text into words
    for i in range(len(data)): 
        x=data[i].split(" ")
        subject = {}
        #counting amount of spam and ham training emails
        if(df['label'][i]=='spam'):
            scount += 1
        else:
            hcount += 1
        #removing excess spaces from words and making them all lower case
        for a in range(len(x)):
            x[a]=x[a].replace(" ", "").lower()
            if(not(x[a] in subject)):
                #making a dictionary for words in an email to make sure duplicate words don't get counted twice
                subject[x[a]]=1
                #labeling words as either spam or ham depending on what the email they are from is labeled as
                if(df['label'][i]=='spam'):
                    if(x[a] in spam_like_dict):
                        spam_like_dict[x[a]] += 1
                    else:
                        spam_like_dict[x[a]]=1
                else:
                    if(x[a] in ham_like_dict):
                        ham_like_dict[x[a]] += 1
                    else:
                        ham_like_dict[x[a]]=1
        subject={}
    for key in ham_like_dict:
        ham_like_dict[key]=ham_like_dict[key]/hcount
    for key in spam_like_dict:
        spam_like_dict[key]=spam_like_dict[key]/scount
    return ham_like_dict, spam_like_dict

def posterior(ham_like_dict, spam_like_dict, text):
    #a method to calculate the posterior to make the predict function more readable
    #splitting text into words and removing subject: and non-alphabet/space characters
    i=text.split(" ", 1)[1]
    i= "".join(c for c in i if (c.isalpha() | (c == ' '))).strip()
    data=i.split(" ")
    spam=0
    ham=0
    for x in range(len(data)):
        word=data[x].replace(" ", "").lower()
        
        #filtering out words that are almost equally likely to occur in spam or ham which throws off the model, I tested my code and a value of .28 optimizes the model
        #taking the log of the probability to avoid really low numbers that might get truncated to 0
        if((word in ham_like_dict and word in spam_like_dict and abs(ham_like_dict[word]-spam_like_dict[word])>.28) or (word in spam_like_dict and not word in ham_like_dict ) or (word in ham_like_dict and not word in spam_like_dict ) ):
            if(word in ham_like_dict):
                ham += m.log(ham_like_dict[word])*-1
            if(word in spam_like_dict):
                spam += m.log(spam_like_dict[word])*-1
       
    return ham, spam
            
        
def predict(ham_prior, spam_prior, ham_like_dict, spam_like_dict, text):
    
    #ham_posterior = posterior probability that the email is normal/ham
    ham_posterior, spam_posterior = posterior(ham_like_dict, spam_like_dict, text)
    
    if(ham_posterior + ((-1)*m.log(ham_prior))>spam_posterior + ((-1)*m.log(spam_prior))):
        return 0
    elif(ham_posterior + ((-1)*m.log(ham_prior))<spam_posterior + ((-1)*m.log(spam_prior))):
        return 1
    else:
    #if the probability is equal then I use a random number generator to randomly pick between spam or ham
        return random.randint(0, 1)


def metrics(ham_prior, spam_prior, ham_dict, spam_dict, df):
    corrects=0
    count=0
    incorrect=0
    lspam=0
    lham=0
    spam=0
    correcth = 0
    for i in range(len(df['label'])):
        x=(predict(ham_prior, spam_prior, ham_dict, spam_dict, df['text'][i]))
        count += 1
        if(x==1):
            lspam +=1
            if(df['label'][i]=="spam"):
                corrects += 1
                spam += 1
        elif(df['label'][i]=="ham"):
            correcth += 1
        elif(df['label'][i]=="spam"):
            spam += 1
    correct=corrects+correcth
    incorrect=count-correct
    lham=count-lspam
    acc=correct/count
    precision=corrects/lspam
    ham=count-spam
    recall=corrects/spam
    #precision, recall
    return acc, precision, recall

## Generate answers

In [14]:
#loading in the training data
train_df = pd.read_csv("./TRAIN_balanced_ham_spam.csv")
test_df = pd.read_csv("./TEST_balanced_ham_spam.csv")
df = train_df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2398 entries, 0 to 2397
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2398 non-null   int64 
 1   Unnamed: 0.1  2398 non-null   int64 
 2   label         2398 non-null   object
 3   text          2398 non-null   object
 4   label_num     2398 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 93.8+ KB


In [15]:
#compute the prior

ham_prior, spam_prior = prior(df)

print(ham_prior, spam_prior)

0.5 0.5


In [16]:
# compute likelihood

ham_like_dict, spam_like_dict = likelihood(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[x]=i


In [17]:
# Test your predict function with some example TEXT

some_text_example = " "
print(predict(ham_prior, spam_prior, ham_like_dict, spam_like_dict, some_text_example))

1


In [26]:
# Predict on test_df and compute metrics 
    
df = test_df
acc, precision, recall = metrics(ham_prior, spam_prior, ham_like_dict, spam_like_dict, df)
print(acc, precision, recall)

0.965 0.9543973941368078 0.9766666666666667
