In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
# Load the labelled e-mail corpus and show the class balance of the `spam` column
# before previewing the first two rows.
df = pd.read_csv('datasets/emails.csv')
label_counts = Counter(df['spam'])
print(label_counts)
df.head(2)

Counter({0: 4360, 1: 1368})


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1


In [79]:
# Peek at the raw text of the very first e-mail in the file.
df.iloc[0]['text']

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

### The CSV file lists all spam (1) emails first, followed by all non-spam (0) emails.

Let's take the first 500 emails and the last 500 emails as our test set. The rest we use for training our classifier.

In [3]:
# Hold out 500 rows from each end of the file: the first 500 rows and the
# last 500 rows together form the test set.
test_df1 = df.iloc[:500].copy()
test_df2 = df.iloc[-500:].copy()
test_df = pd.concat((test_df1, test_df2), axis=0)
Counter(test_df['spam'])

Counter({1: 500, 0: 500})

In [4]:
# Training set = every row of `df` whose full row content does not occur in
# the held-out test rows. The comparison is by value (each row is turned into
# a tuple), not by index, so a row that is an exact duplicate of a held-out
# row is excluded from training as well.
row_keys = df.apply(tuple, axis=1)
held_out_keys = test_df.apply(tuple, axis=1)
train_df = df[~row_keys.isin(held_out_keys)]
print(Counter(train_df['spam']))

Counter({0: 3850, 1: 868})


In [5]:
# Count how often each token occurs across the training e-mails.
# Tokens are obtained by splitting on a single space, so the keys of `vocab`
# appear in order of first occurrence.
vocab = {}
for msg in train_df['text']:
    for word in msg.split(' '):
        vocab[word] = vocab.get(word, 0) + 1

In [6]:
# Number of distinct tokens collected from the training e-mails.
n = len(vocab)
print('Total number of unique words in the Dictionary or Vocabulary: ', n)

Total number of unique words in the Dictionary or Vocabulary:  33378


Emails vary in length. Let's make them uniform by encoding each email as a binary vector of size `len(vocab)`, with a 1 at every index whose word is present in the email (a binary bag-of-words encoding).

In [7]:
# Give every vocabulary token a column index (its position in `vocab`) and
# keep both lookup directions.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

In [20]:
def email_to_onehot(text, word_index=None):
    """Encode an e-mail as a binary bag-of-words row vector.

    Parameters
    ----------
    text : str
        Raw e-mail text. It is tokenised with ``text.split(' ')``, the same
        way the vocabulary is built, so tokens line up with the mapping keys.
    word_index : dict, optional
        Mapping ``token -> column index``. When omitted, the notebook-level
        ``word_to_idx`` is used. Indices are expected to lie in
        ``range(len(word_index))``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, len(word_index))`` holding 1.0 in the
        column of every known token that occurs in ``text`` and 0.0 elsewhere.

    Notes
    -----
    Tokens that are missing from the mapping are skipped instead of raising
    ``KeyError``, so e-mails containing words never seen during training
    (e.g. the test set) can be encoded too.
    """
    if word_index is None:
        word_index = word_to_idx

    onehot = np.zeros((1, len(word_index)))
    for word in text.split(' '):
        idx = word_index.get(word)
        # Out-of-vocabulary token: there is no column for it, so ignore it.
        if idx is not None:
            onehot[0, idx] = 1
    return onehot

In [23]:
# Stack one encoded row per training e-mail into the design matrix and
# collect the matching labels into a 1-D array.
X_train = np.concatenate([email_to_onehot(text) for text in train_df['text']], axis=0)
print('X_train.shape', X_train.shape)
y_train = np.array(train_df['spam'])
print('y_train.shape',y_train.shape)

## Gives error because of words in test not present in train
# X_test = [email_to_onehot(test_df['text'].iloc[i]) for i in range(len(test_df))]
# y_test = [test_df['spam'].iloc[i] for i in range(len(test_df))]

X_train.shape (4718, 33378)
y_train.shape (4718,)


In [25]:
# Split the encoded training rows by label using boolean masks.
is_spam = y_train == 1
is_nonspam = y_train == 0
spam_emails = X_train[is_spam]
nonspam_emails = X_train[is_nonspam]
spam_emails.shape, nonspam_emails.shape

((868, 33378), (3850, 33378))

In [61]:
# Class priors: fraction of training e-mails carrying each label.
P_1 = (y_train == 1).sum() / len(y_train)  # P(y=1)
P_0 = (y_train == 0).sum() / len(y_train)  # P(y=0)

# Per-word class conditionals: fraction of e-mails of a class containing word j.
p_xj_1 = spam_emails.sum(axis=0) / len(spam_emails)
p_xj_0 = nonspam_emails.sum(axis=0) / len(nonspam_emails)

# Score every training e-mail with Bayes' rule, using only the words that are
# present in it. The likelihoods are plain products of raw probabilities
# (no smoothing, no logs), so a product can reach 0.0; when both class
# products are 0 the division below is 0/0 and prints `nan`.
for features, label in zip(X_train, y_train):
    present = np.flatnonzero(features == 1)  # columns of the words in this e-mail
    likelihood_spam = p_xj_1[present].prod()
    likelihood_nonspam = p_xj_0[present].prod()

    numerator = likelihood_spam * P_1
    denominator = numerator + likelihood_nonspam * P_0

    spam_probability = numerator/denominator
    print('GT --> Pred', label, '-->', spam_probability)

GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> nan
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> nan
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> nan
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> nan
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> nan
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> nan
GT --> Pred 1 --> nan
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pred 1 --> 1.0
GT --> Pre

  spam_probability = numerator/denominator


GT --> Pred 0 --> 0.0
GT --> Pred 0 --> nan
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> nan
GT --> Pred 0 --> nan
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> nan
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> nan
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> nan
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pred 0 --> 0.0
GT --> Pre

__Observations:__

1. Many values of `spam_probability` are `nan`. Why? When an email contains many uncommon words, each word's class-conditional probability is very small (or exactly 0 if the word never appears in that class's training emails). Multiplying, say, 250 such numbers makes the product underflow to 0.0. When this happens for both classes, `numerator` and `denominator` are both 0, and 0/0 gives `nan` -> Naive Bayes breaks even on the training data.
2. Naive Bayes breaks even more often on test data, because test emails can contain many 'novel' words that were never seen in training, and we again end up with 0/0.
3. Naive Bayes ignores word order, so much of the __semantics__ is lost: 'I am a good person' is treated exactly the same as 'person good a am I'.

__Improvements__