## Spam Classifier

Implement a Naive Bayes classifier, `naiveBayes_classify(word_probs, message, total_spams, total_non_spams, k)`, that classifies an email message as spam or non-spam using the word probability distributions, `word_probs`, learned from a set of training data.

The Naive Bayes method is implemented from scratch through the following functions. To simplify the implementation, we assume that any message is equally likely to be spam or not-spam.
* `tokenize(message)`: extracts a set of unique words from the given text message.
* `count_words(training_set)`: creates a dictionary containing the mappings from unique words to the frequencies of the words in 
    spam and non-spam messages in the training set
*  `word_probabilities(counts, total_spams, total_non_spams, k=0.5)`: turns the word counts in `counts` into a list of triplets (w, p(w | spam), p(w | ~spam)), smoothed with the pseudocount k
* `spam_probability(word_probs, message, total_spams, total_non_spams, k = 0.5)`: computes the probability of spam for the given message.
* `naiveBayes_classify(word_probs, message, total_spams, total_non_spams, k)`: classifies the message as spam or ham

The data set `spam.csv` is used to evaluate the classifier in terms of accuracy, recall, precision, and F1-score.

### Implement the following functions

In [6]:
from collections import Counter, defaultdict
import math,re

def tokenize(message):
    """Return the set of distinct tokens found in ``message``.

    The text is lowercased first; a token is then any maximal run of the
    characters ``a``-``z``, ``0``-``9`` or apostrophe. Everything else acts
    as a separator, and repeated tokens are collapsed by the set.
    """
    return set(re.findall("[a-z0-9']+", message.lower()))

In [7]:
def count_words(training_set):
    """Tally, per word, how many spam and non-spam messages contain it.

    ``training_set`` is an iterable of ``(message, is_spam)`` pairs. The
    returned mapping sends each word to a two-item list
    ``[spam_count, non_spam_count]``; unseen words default to ``[0, 0]``.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, flagged in training_set:
        # Index 0 accumulates spam messages, index 1 non-spam messages.
        slot = 0 if flagged else 1
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

In [8]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word message counts into smoothed conditional probabilities.

    ``counts`` maps each word to a ``(spam_count, non_spam_count)`` pair.
    The result is a list of ``(word, p(word | spam), p(word | non-spam))``
    triples, each probability computed as ``(count + k) / (total + 2 * k)``.
    """
    triples = []
    for word, pair in counts.items():
        spam_count, non_spam_count = pair
        p_given_spam = (spam_count + k) / (total_spams + 2 * k)
        p_given_non_spam = (non_spam_count + k) / (total_non_spams + 2 * k)
        triples.append((word, p_given_spam, p_given_non_spam))
    return triples

In [9]:
def spam_probability(word_probs, message, total_spams, total_non_spams, k=0.5):
    """Return the probability that ``message`` is spam, assuming equal priors.

    Parameters:
        word_probs: iterable of ``(word, p(word | spam), p(word | non-spam))``
            triples. Both probabilities must lie strictly between 0 and 1,
            otherwise ``math.log`` raises ``ValueError``.
        message: raw text; it is reduced to its unique words with
            ``tokenize``.
        total_spams, total_non_spams, k: accepted so the signature stays
            stable, but not used by this computation (spam and non-spam are
            treated as equally likely a priori).

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    # Every vocabulary word contributes: its probability when it occurs in
    # the message, the complement when it does not. Logs are summed instead
    # of multiplying many small probabilities together.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # P(spam | message) = e^s / (e^s + e^h) = 1 / (1 + e^(h - s)).
    # Exponentiating s and h separately can underflow both to 0.0 for a large
    # vocabulary and divide by zero, so only their difference is
    # exponentiated, always with a non-positive argument to avoid overflow.
    log_odds = log_prob_if_spam - log_prob_if_not_spam
    if log_odds >= 0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

    

In [10]:
def naiveBayes_classify(word_probs, message, total_spams, total_non_spams, k):
    """Label ``message`` as ``"spam"`` or ``"ham"``.

    The message is labelled spam only when ``spam_probability`` returns a
    value strictly greater than 0.5; anything else, including exactly 0.5,
    is labelled ham.
    """
    probability = spam_probability(
        word_probs, message, total_spams, total_non_spams, k
    )
    if probability > 0.5:
        return "spam"
    return "ham"
    

### Test and Evaluate

In [11]:
import pandas as pd
import numpy as np
# Load the labelled messages; the file is decoded as ISO-8859-1 (Latin-1).
spam = pd.read_csv("spam.csv", encoding = 'ISO-8859-1')

In [12]:
# Preview the first rows to check the loaded columns (label, text).
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# (number of rows, number of columns) of the loaded frame.
spam.shape

(5572, 2)

In [14]:
# Encode the label as an integer flag: 1 for 'spam', 0 for 'ham'.
# Any label outside this mapping becomes NaN.
spam['is_spam'] = spam['label'].map({'spam':1, 'ham':0})

In [15]:
# Preview again to confirm the is_spam column was added.
spam.head()

Unnamed: 0,label,text,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [16]:

from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(spam['text'], spam['is_spam'], test_size = 0.2, random_state = 0)

In [17]:
# Convert the 0/1 test labels back to 'ham'/'spam' strings so they can be
# compared directly with the labels returned by naiveBayes_classify.
y_test = list(y_test.map({0:'ham',1:'spam'}))

In [18]:
# Materialise the (message, is_spam) pairs as a list. A bare zip() is a
# one-shot iterator, so it would be empty the second time it is iterated
# (for example when the count_words cell is re-run).
training_set = list(zip(X_train, y_train))

In [19]:
# Per-word [spam_count, non_spam_count] tallies over the training messages.
counts = count_words(training_set)

In [20]:
# Inspect the tallies; each value is [spam_count, non_spam_count].
counts

defaultdict(<function __main__.count_words.<locals>.<lambda>()>,
            {'no': [52, 216],
             'is': [117, 481],
             'this': [73, 193],
             'durban': [0, 2],
             'amla': [0, 1],
             'kallis': [0, 5],
             'ground': [0, 3],
             'town': [2, 24],
             'home': [2, 128],
             'am': [8, 161],
             'going': [3, 133],
             'now': [151, 227],
             'theatre': [0, 4],
             'few': [0, 36],
             'i': [28, 1296],
             'in': [60, 612],
             'a': [228, 687],
             'minutes': [5, 21],
             'kavalan': [0, 2],
             'watch': [0, 27],
             'to': [372, 970],
             'escape': [0, 4],
             'walked': [0, 3],
             'address': [4, 13],
             'lt': [0, 189],
             'we': [36, 215],
             'right': [1, 63],
             'stagwood': [0, 1],
             'pass': [4, 7],
             'my': [11, 480],
           

In [21]:
# y_train holds 1 for spam and 0 for ham, so its sum is the number of spam
# messages in the training split.
total_spams = y_train.sum()
total_spams

581

In [22]:
# Every training message that is not spam is counted as non-spam.
total_non_spams = y_train.shape[0] - total_spams
total_non_spams

3876

In [23]:
# Smoothed p(word | spam) and p(word | non-spam) for every word seen in
# training, using pseudocount k=0.5.
word_probs = word_probabilities(counts, total_spams, total_non_spams, k=0.5)

In [24]:
# Smoke test: classify a single message from the full dataset (the row
# with index label 2) to check that the pipeline runs end to end.
naiveBayes_classify(word_probs, spam['text'][2], total_spams, total_non_spams, 0.5)

'spam'

In [25]:
# First message (by position) of the training split.
X_train.iloc[0]

'No no:)this is kallis home ground.amla home town is durban:)'

In [26]:
# First message (by position) of the test split.
X_test.iloc[0]

'Aight should I just plan to come up later tonight?'

In [27]:
# Classify every held-out message, keeping predictions in X_test order.
y_pred = [
    naiveBayes_classify(word_probs, text, total_spams, total_non_spams, 0.5)
    for text in X_test
]

In [28]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the predictions against y_test.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       949
        spam       0.99      0.87      0.92       166

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [29]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Single-number summaries. average='weighted' averages the per-class scores
# weighted by the number of true instances of each class.
print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Recall score: ", recall_score(y_test, y_pred, average = 'weighted'))
print("Precision score: ", precision_score(y_test, y_pred, average = 'weighted'))
print("F1 score: ", f1_score(y_test, y_pred, average = 'weighted'))

Accuracy score:  0.97847533632287
Recall score:  0.97847533632287
Precision score:  0.9786368643629143
F1 score:  0.9778976677801595
