# Naive Bayes (Discrete)

The idea of this project is to write a simple Naive Bayes model that predicts whether an SMS message is spam or not.
Let us derive the necessary probabilities.
Naive Bayes is a model that relies on Bayes' theorem:

$$
P(Y|X) = \frac{P(X|Y)\times P(Y)}{P(X)}
$$

For this dataset we can write the equation as:

$$\begin{aligned}
P(y|W_1, \ldots, W_n) &= \frac{P(W_1, \ldots, W_n|y)\times P(y)}{P(W_1, \ldots, W_n)} \\
P(y|W_1, \ldots, W_n) &= \frac{P(W_1 | W_2, \ldots, W_n, y)\times \ldots \times P(W_n|y)\times P(y)}{P(W_1, \ldots, W_n)} \\
P(y|W_1, \ldots, W_n) &= \frac{P(y) \times \prod_{i=1}^{n}P(W_i|y)}{P(W_1, \ldots, W_n)} \\
P(y|W_1, \ldots, W_n) &= \frac{P(y) \times \prod_{i=1}^{n}P(W_i|y)}{P(y) \times \prod_{i=1}^{n}P(W_i|y) + P(\neg y) \times \prod_{i=1}^{n}P(W_i|\neg y)}
\end{aligned}$$

In [None]:
%matplotlib inline
import math
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

## Load the dataset

In [None]:
# Download the SMS spam collection and normalise its column names to the
# lowercase 'label' / 'sms' used throughout the rest of the notebook.
data = (
    pd.read_csv(
        'https://media.githubusercontent.com/media/mariolpantunes/ml101/main/datasets/spam.csv',
        encoding='latin-1',
    )
    .rename(columns={"Target": "label", "SMS": "sms"})
)
# Last expression of the cell: shows the first rows as a sanity check.
data.head()

In [None]:
# Bar chart with one bar per distinct label, showing how many messages carry it.
data['label'].value_counts().plot.bar()

## Preprocessing the dataset

In [None]:
# Fetch every NLTK resource used below.
# - 'stopwords' backs stopwords.words('english'); without it a fresh
#   environment fails with LookupError.
# - 'punkt' / 'punkt_tab' back nltk.word_tokenize (older and newer NLTK
#   releases respectively), so both are requested.
# - 'wordnet' / 'omw-1.4' back the WordNet lemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _extract_words(sms):
    """Return the lemmatized content words of a single SMS text.

    The text is lowercased and tokenized; a token is kept only if it is not
    an English stop word, is purely alphabetic and is longer than two
    characters. Each kept token is then lemmatized.
    """
    # Lowercasing once up front is enough: every token is a slice of this
    # already-lowercased text.
    tokens = nltk.word_tokenize(sms.lower())
    return [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and w.isalpha() and len(w) > 2
    ]


def _collect_words(label):
    """Return the words of every SMS whose label equals *label*, in row order."""
    words = []
    for sms in data[data['label'] == label].sms:
        words.extend(_extract_words(sms))
    return words


ham_words = _collect_words('ham')
print(f'HAM {len(ham_words)}')

spam_words = _collect_words('spam')
print(f'SPAM {len(spam_words)}')

# WordCloud.generate expects one text, so join each word list with spaces.
ham_string = ' '.join(ham_words)
ham_wordcloud = WordCloud(width=500, height=300).generate(ham_string)

spam_string = ' '.join(spam_words)
spam_wordcloud = WordCloud(width=500, height=300).generate(spam_string)

In [None]:
# Draw the ham word cloud on its own figure with a green ('g') background.
plt.figure( figsize=(10,8), facecolor='g')
plt.imshow(ham_wordcloud)
# The axes carry no information for an image, so hide them.
plt.axis("off")
# pad=0 asks the layout for no padding around the image.
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Draw the spam word cloud on its own figure with a white ('w') background.
plt.figure( figsize=(10,8), facecolor='w')
plt.imshow(spam_wordcloud)
# The axes carry no information for an image, so hide them.
plt.axis("off")
# pad=0 asks the layout for no padding around the image.
plt.tight_layout(pad=0)
plt.show()

In [None]:
class NB:
    """Discrete (multinomial) Naive Bayes classifier for spam vs. ham SMS.

    Attributes:
        p_spam, p_ham: class priors, i.e. the share of training words that
            came from spam / ham messages. Both are 0.0 until fit() is called.
        spam_vocab, ham_vocab: mapping word -> number of occurrences of that
            word in the spam / ham training words.
    """

    def __init__(self):
        self.p_spam = 0.0
        self.p_ham = 0.0
        self.spam_vocab = {}
        self.ham_vocab = {}
        # Word totals and joint vocabulary size: the denominators of the
        # Laplace-smoothed likelihoods.
        self._n_spam = 0
        self._n_ham = 0
        self._n_vocab = 0

    def fit(self, ham_words, spam_words):
        """Estimate priors and per-class word counts from two word lists.

        Args:
            ham_words: preprocessed words taken from ham messages.
            spam_words: preprocessed words taken from spam messages.

        If both lists are empty the model is left unfitted (priors 0.0).
        """
        n_ham, n_spam = len(ham_words), len(spam_words)
        total = n_ham + n_spam
        # compute prior
        self.p_ham = n_ham / total if total else 0.0
        self.p_spam = n_spam / total if total else 0.0
        # compute likelihood (stored as raw counts; smoothing is applied
        # when a message is scored)
        self.ham_vocab = Counter(ham_words)
        self.spam_vocab = Counter(spam_words)
        self._n_ham, self._n_spam = n_ham, n_spam
        self._n_vocab = len(self.ham_vocab.keys() | self.spam_vocab.keys())

    def _log_likelihood(self, words, vocab, n_words):
        """Return sum(log P(w | class)) with add-one (Laplace) smoothing."""
        denominator = n_words + self._n_vocab
        return sum(math.log((vocab.get(w, 0) + 1) / denominator) for w in words)

    def _classify(self, words):
        """Return True if the preprocessed *words* are more likely spam.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if self.p_spam == 0.0 and self.p_ham == 0.0:
            raise RuntimeError('NB.fit must be called with training words before predicting')
        # A class that never appeared in training can never be predicted.
        if self.p_spam == 0.0:
            return False
        if self.p_ham == 0.0:
            return True

        # Words unseen in both classes carry no evidence, so they are skipped.
        known = [w for w in words if w in self.spam_vocab or w in self.ham_vocab]

        # Work in log space: a product of many small probabilities underflows
        # to 0.0 for long messages.
        log_spam = math.log(self.p_spam) + self._log_likelihood(
            known, self.spam_vocab, self._n_spam)
        log_ham = math.log(self.p_ham) + self._log_likelihood(
            known, self.ham_vocab, self._n_ham)

        # Both posteriors share the same denominator, so P(spam | words) >= 0.5
        # is equivalent to comparing the two numerators.
        return log_spam >= log_ham

    def predict(self, sms):
        """Return True if the raw SMS text *sms* is classified as spam.

        The text is lowercased, tokenized, stripped of stop words,
        non-alphabetic tokens and tokens of at most two characters, then
        lemmatized. This relies on the module-level `stop_words` and
        `lemmatizer` objects.
        """
        tokens = nltk.word_tokenize(sms.lower())
        # remove stop words, then filter with lemmatizer
        words = [
            lemmatizer.lemmatize(w)
            for w in tokens
            if w not in stop_words and w.isalpha() and len(w) > 2
        ]
        return self._classify(words)

    def __str__(self):
        return f'Prior ({self.p_ham}/{self.p_spam}) Vocab ({len(self.ham_vocab)}/{len(self.spam_vocab)})'

In [None]:
# Train the classifier on the word lists built during preprocessing and
# print its summary line (priors and vocabulary sizes).
nb = NB()
nb.fit(ham_words, spam_words)
print(nb)

# Classify two sample messages; predict() returns True for spam.
for sample_sms in (
    'Ok lar... Joking wif u oni...',
    'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&Cs apply 08452810075over18s',
):
    print(nb.predict(sample_sms))

In [None]:
# Encode the labels as booleans (True = spam) so they can be compared with
# the output of nb.predict. The nested-dict form restricts the replacement to
# the 'label' column, so an SMS whose whole text is 'ham' or 'spam' is left
# untouched. Re-running the cell is harmless because no 'ham'/'spam' labels
# remain after the first run.
data = data.replace({'label': {'ham': False, 'spam': True}})

# Predict every message and mark which predictions match the label.
predictions = data['sms'].map(nb.predict)
results = predictions == data['label']

# Accuracy is the fraction of matches. Unlike value_counts()[True], mean()
# does not raise KeyError when no prediction is correct.
acc = results.mean()
print(acc)