In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
df = pd.read_csv('datasets/emails.csv')
print(Counter(df['spam']))
df.head(2)

Counter({0: 4360, 1: 1368})


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1


In [3]:
df['text'].iloc[0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

### csv file first contains all spam (1) emails and then non-spam (0) emails.

Let's take first 500 emails and last 500 emails for our test set. Rest we use for training our classifier.

In [4]:
test_df1 = df.head(500).copy()
test_df2 = df.tail(500).copy()
test_df = pd.concat([test_df1, test_df2], axis=0)
Counter(test_df['spam'])

Counter({1: 500, 0: 500})

In [5]:
train_df = df[~df.apply(tuple,1).isin(test_df.apply(tuple,1))]
print(Counter(train_df['spam']))

Counter({0: 3850, 1: 868})


In [25]:
def get_vocab(texts):
    vocab = []
    for text in texts:
        words = text.split(' ')
        for word in words:
            if word not in vocab:
                vocab.append(word)
    return vocab

def text_to_vector(text, vocab, one_hot = False):
    text_vector = np.zeros((1,len(vocab)))
    words = text.split(' ')
    for word in words:
        idx = word_to_idx[word]
        if one_hot:
            text_vector[0,idx] = 1
        else:
            text_vector[0,idx] += 1
    return text_vector

In [20]:
vocab = get_vocab(list(train_df['text'].values))
print('Total number of unique words in the Dictionary or Vocabulary: ', len(vocab))

Total number of unique words in the Dictionary or Vocabulary:  33378


All emails have variable length. Let's make them uniform with making a vector of size vocab which 1s at all those indexes for which the word is present in the email.

In [24]:
word_to_idx = {w:i for i,w in enumerate(vocab)}

In [26]:
X = [text_to_vector(train_df['text'].iloc[i], vocab, one_hot = True) for i in range(len(train_df))]
X = np.concatenate(X, axis = 0); print('X.shape', X.shape)

y = [train_df['spam'].iloc[i] for i in range(len(train_df))]
y = np.array(y); print('y.shape',y.shape)

X.shape (4718, 33378)
y.shape (4718,)


In [27]:
Prior_0 = (y==0).sum()/len(y)
Prior_1 = (y==1).sum()/len(y)
print(f"Prior_0: {Prior_0}, Prior_1: {Prior_1}")

texts_0 = X[np.where(y==0)[0]] # nonspam emails
texts_1 = X[np.where(y==1)[0]] # spam emails

words_count_0 = np.sum(texts_0, axis=0)
words_count_1 = np.sum(texts_1, axis=0)

words_prob_0 = words_count_0/ texts_0.shape[0]
words_prob_1 = words_count_1/ texts_1.shape[0]

for x,gt in zip(X,y):
    idxs = np.where(x>=1)[0]
    pi_prob_0 = np.prod(words_prob_0[idxs])
    pi_prob_1 = np.prod(words_prob_1[idxs])
    spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)
    print(gt, '--->', round(spam_probability,3))

Prior_0: 0.8160237388724035, Prior_1: 0.18397626112759644
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> nan
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> nan
1 ---> 1.0
1 ---> nan
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> nan
1 ---> 1.0
1 ---> nan
1 ---> 1.0
1 ---> nan
1 ---> nan
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> nan
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> nan
1 ---> nan
1 ---> 1.0
1 ---> 1.0
1 ---> nan
1 ---> nan
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 1.0
1 ---> 

  spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)


0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> nan
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan

__Observations:__

1. There are many values with `spam_probability` as `nan`. Why? because, if words in emails belong to less common words then each word's probability is very-very low, even though non-zero. But if we multiply say 250 different numbers which are close to zero -> the product becomes 0.0 effectively. So, 0 in `numerator` and 0 in `denominator` -> Naive Bayes breaks even on training data (__Underflow__-> avoid by using log)
2. Naive Bayes breaks for test data a lot, because there can be many 'novel' words and we will get 0/0
3. There is no relevance of __semantics__ in Naive Bayes. 'I am a good person' is same as 'Person is good am I'
4. count of word is not into account

__Improvements__
1. use count
2. unkwon word in `[<unk>]` token: custom user-defined
3. smoothing

In [28]:
vocab.append('[<unk>]')

In [29]:
word_to_idx = {w:i for i,w in enumerate(vocab)}

In [None]:
def text_to_vector_v2(text, vocab, one_hot = False):
    text_vector = np.zeros((1,len(vocab)))
    words = text.split(' ')
    for word in words:
        if word in vocab:
            idx = word_to_idx[word]
        else:
            word = '[<unk>]'
            idx = word_to_idx[word]
        if one_hot:
            text_vector[0,idx] = 1
        else:
            text_vector[0,idx] += 1
    return text_vector

In [33]:
X = [text_to_vector_v2(train_df['text'].iloc[i], vocab, one_hot = False) for i in range(len(train_df))]
X = np.concatenate(X, axis = 0); print('X.shape', X.shape)

y = [train_df['spam'].iloc[i] for i in range(len(train_df))]
y = np.array(y); print('y.shape',y.shape)

X.shape (4718, 33379)
y.shape (4718,)


In [34]:
alpha = 1

In [35]:
Prior_0 = (y==0).sum()/len(y)
Prior_1 = (y==1).sum()/len(y)
print(f"Prior_0: {Prior_0}, Prior_1: {Prior_1}")

texts_0 = X[np.where(y==0)[0]] # nonspam emails
texts_1 = X[np.where(y==1)[0]] # spam emails

words_count_0 = np.sum(texts_0, axis=0)
words_count_1 = np.sum(texts_1, axis=0)

# words_prob_0 = texts_0/words_count_0 # fail due to non-smoothing

words_prob_0 = (words_count_0 + alpha)/ (texts_0.shape[0] + alpha*len(vocab)) # also smoothing here
words_prob_1 = (words_count_1 + alpha)/ (texts_1.shape[0] + alpha*len(vocab)) # also smoothing here

print('words_prob_0\n', words_prob_0)
print('words_prob_1\n', words_prob_1)

for x,gt in zip(X,y):
    idxs = np.where(x>=1)[0]
    pi_prob_0 = np.prod(words_prob_0[idxs])
    pi_prob_1 = np.prod(words_prob_1[idxs])
    spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)
    print(gt, '--->', round(spam_probability,4))

Prior_0: 0.8160237388724035, Prior_1: 0.18397626112759644
words_prob_0
 [1.03440866e-01 9.91162803e-03 3.20986328e-01 ... 5.37215611e-05
 5.37215611e-05 2.68607806e-05]
words_prob_1
 [2.53744854e-02 3.82515257e-03 6.05600491e-02 ... 2.91996379e-05
 2.91996379e-05 2.91996379e-05]
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> nan
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> 0.9999
1 ---> nan
1 ---> 0.0
1 ---> nan
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> nan
1 ---> 0.0
1 ---> nan
1 ---> 1.0
1 ---> 0.9956
1 ---> 0.0
1 ---> nan
1 ---> 0.0
1 ---> 0.3204
1 ---> 0.0
1 ---> 0.0
1 ---> 0.21
1 ---> 0.0
1 ---> nan
1 ---> nan
1 ---> nan
1 ---> 0.0
1 ---> nan
1 ---> nan
1 ---> 0.5058
1 ---> 0.0
1 ---> nan
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> nan
1 ---> nan
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> 0.0
1 ---> nan
1 ---> nan
1 ---> 1.0
1 ---> 0.0
1 ---> nan
1 ---> nan
1 ---> 0.0
1 ---> 0.0
1 ---> 0.7663
1 ---> nan
1 ---> nan
1 ---> 0.0
1 ---> 0.0
1 ---> nan
1 ---> nan


  spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)


0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> nan
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> nan
0 ---> 0.0
0 ---> nan
0 ---> nan
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> nan
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> nan
0 ---> 0.0
0 ---> 0.0

__Problem__: Because of very-very large size of vocab -> all words prob ->0 and causing huge underflow leading most of probs = 0.