In [1]:
import numpy as np

In [33]:
def get_vocab(texts):
    vocab = []
    for text in texts:
        words = text.split(' ')
        for word in words:
            if word not in vocab:
                vocab.append(word)
    return vocab

def text_to_vector(text, vocab, one_hot = False):
    text_vector = np.zeros((1,len(vocab)))
    words = text.split(' ')
    for word in words:
        idx = word_to_idx[word]
        if one_hot:
            text_vector[0,idx] = 1
        else:
            text_vector[0,idx] += 1
    return text_vector

In [38]:
# DATASET
spam = ['million dollar offer','secret secret secret secret offer today','secret is secret']
not_spam = ['low price for valued customer','play secret sports today','sports is healthy','low price pizza']
all_messages = spam + not_spam

# # PREPROCESS- VOCAB
vocab = get_vocab(all_messages)
print('Total number of unique words in the Dictionary or Vocabulary: ', len(vocab))

word_to_idx = {w:i for i,w in enumerate(vocab)}
idx_to_word = {i:w for i,w in enumerate(vocab)}

# # MAKE X AND y
X = [text_to_vector(text, vocab, one_hot=True) for text in all_messages]
X = np.concatenate(X, axis = 0); print('X.shape', X.shape)
y = [1,1,1,0,0,0,0]
y = np.array(y); print('y.shape',y.shape)

Prior_0 = (y==0).sum()/len(y)
Prior_1 = (y==1).sum()/len(y)
print(f"Prior_0: {Prior_0}, Prior_1: {Prior_1}")

texts_0 = X[np.where(y==0)[0]] # nonspam emails
texts_1 = X[np.where(y==1)[0]] # spam emails

words_count_0 = np.sum(texts_0, axis=0)
words_count_1 = np.sum(texts_1, axis=0)

words_prob_0 = words_count_0/ texts_0.shape[0]
words_prob_1 = words_count_1/ texts_1.shape[0]

for x,gt in zip(X,y):
    idxs = np.where(x>=1)[0]
    pi_prob_0 = np.prod(words_prob_0[idxs])
    pi_prob_1 = np.prod(words_prob_1[idxs])
    spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)
    print(gt, '--->', round(spam_probability,3))

Total number of unique words in the Dictionary or Vocabulary:  15
X.shape (7, 15)
y.shape (7,)
Prior_0: 0.5714285714285714, Prior_1: 0.42857142857142855
1 ---> 1.0
1 ---> 1.0
1 ---> 0.727
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0


### Taking account the __count__ of the words in the email
- without smoothing

In [41]:
# # MAKE X AND y
X = [text_to_vector(text, vocab, one_hot=False) for text in all_messages] # get counts
X = np.concatenate(X, axis = 0); print('X.shape', X.shape)
y = [1,1,1,0,0,0,0]
y = np.array(y); print('y.shape',y.shape)

Prior_0 = (y==0).sum()/len(y)
Prior_1 = (y==1).sum()/len(y)
print(f"Prior_0: {Prior_0}, Prior_1: {Prior_1}")

texts_0 = X[np.where(y==0)[0]] # nonspam emails
texts_1 = X[np.where(y==1)[0]] # spam emails

words_count_0 = np.sum(texts_0, axis=0)
words_count_1 = np.sum(texts_1, axis=0)

# words_prob_0 = texts_0/words_count_0 # fail due to non-smoothing

words_prob_0 = words_count_0/ texts_0.shape[0]
words_prob_1 = words_count_1/ texts_1.shape[0]

print('words_prob_0\n', words_prob_0)
print('words_prob_1\n', words_prob_1)

for x,gt in zip(X,y):
    idxs = np.where(x>=1)[0]
    pi_prob_0 = np.prod(words_prob_0[idxs])
    pi_prob_1 = np.prod(words_prob_1[idxs])
    spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)
    print(gt, '--->', round(spam_probability,3))

X.shape (7, 15)
y.shape (7,)
Prior_0: 0.5714285714285714, Prior_1: 0.42857142857142855
words_prob_0
 [0.   0.   0.   0.25 0.25 0.25 0.5  0.5  0.25 0.25 0.25 0.25 0.5  0.25
 0.25]
words_prob_1
 [0.33333333 0.33333333 0.66666667 2.         0.33333333 0.33333333
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
1 ---> 1.0
1 ---> 1.0
1 ---> 0.889
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0
0 ---> 0.0


__Downside of counting the frequency of words without smoothing is that the probabilty can become greater than 1 -> which makes no sense__

__Another downside of counting the frequency of words without smoothing is that for unseen words probab=0 -> spam_probab = 0/0 -> not good__

### Taking account the __count__ of the words in the email
- with smoothing

In [48]:
alpha = 1

In [49]:
# # MAKE X AND y
X = [text_to_vector(text, vocab, one_hot=False) for text in all_messages] # get counts
X = np.concatenate(X, axis = 0); print('X.shape', X.shape)
y = [1,1,1,0,0,0,0]
y = np.array(y); print('y.shape',y.shape)

Prior_0 = (y==0).sum()/len(y)
Prior_1 = (y==1).sum()/len(y)
print(f"Prior_0: {Prior_0}, Prior_1: {Prior_1}")

texts_0 = X[np.where(y==0)[0]] # nonspam emails
texts_1 = X[np.where(y==1)[0]] # spam emails

words_count_0 = np.sum(texts_0, axis=0)
words_count_1 = np.sum(texts_1, axis=0)

# words_prob_0 = texts_0/words_count_0 # fail due to non-smoothing

words_prob_0 = (words_count_0 + alpha)/ (texts_0.shape[0] + alpha*len(vocab)) # also smoothing here
words_prob_1 = (words_count_1 + alpha)/ (texts_1.shape[0] + alpha*len(vocab)) # also smoothing here

print('words_prob_0\n', words_prob_0)
print('words_prob_1\n', words_prob_1)

for x,gt in zip(X,y):
    idxs = np.where(x>=1)[0]
    pi_prob_0 = np.prod(words_prob_0[idxs])
    pi_prob_1 = np.prod(words_prob_1[idxs])
    spam_probability = pi_prob_1*Prior_1/(pi_prob_1*Prior_1 + pi_prob_0*Prior_0)
    print(gt, '--->', round(spam_probability,4))

X.shape (7, 15)
y.shape (7,)
Prior_0: 0.5714285714285714, Prior_1: 0.42857142857142855
words_prob_0
 [0.05263158 0.05263158 0.05263158 0.10526316 0.10526316 0.10526316
 0.15789474 0.15789474 0.10526316 0.10526316 0.10526316 0.10526316
 0.15789474 0.10526316 0.10526316]
words_prob_1
 [0.11111111 0.11111111 0.16666667 0.38888889 0.11111111 0.11111111
 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
 0.05555556 0.05555556 0.05555556]
1 ---> 0.9137
1 ---> 0.9026
1 ---> 0.7452
0 ---> 0.0135
0 ---> 0.352
0 ---> 0.1282
0 ---> 0.0467


##### Above version still does not handle case when __new words__ are encountered which are not present in dictionary__

To handle this, there are 2 ways:
1. __Ignore it__. Implement a check in `text_to_vector` to verify word exists in vocab only then process else ignore
2. __Add `unknown` token in vocab__. whenever word is not in vocab -> count it as `unkown` -> practically, keep record of which words as classified as `unkown` and later add them in the vocabulary in future