We know from the training dataset:
* $P(spam)$
* P(ham)
* P(w_i | spam)
* P(w_j | spam)

We want to make predictions. So we have to predict
* P(spam | w_k)
* P(ham | w_k)

When classifying an email we take the larger of the following probabilities:
* P(spam | w1 w2...w_n)
* P(ham | w1 w2...w_n)

We use Bayes' Rule.
* P(A|B) = P(B|A)P(A)/P(B)

It's also useful to know that log(ab) = log a + log b.

Sooooo when we try to calculate
* P(spam|w1 w2...w_n) = P(w1 w2...w_n | spam)P(spam)/P(w1 w2...w_n)

But the denominator doesn't really help us here...we don't have the probability of a particular word arrangement for language. So we naively assume that the probablity of each word appearing is independent of each other.

So we find that in order to simplifiy things we can do:
$$\log p(spam|w_1 w_2...w_n) \propto \log p(spam) + \sum_{i=0}^n \log p(w_i|spam))$$
* same for ham.

### Spam Detector from https://pythonmachinelearning.pro/text-classification-tutorial-with-naive-bayes/

In [15]:
# import os
# import re
# import string
# import math
 
# DATA_DIR = 'enron'
# target_names = ['ham', 'spam']
 
# def get_data(DATA_DIR):
#     subfolders = ['enron%d' % i for i in range(1,7)]

#     data = []
#     target = []
#     for subfolder in subfolders:
#         # spam
#         spam_files = os.listdir(os.path.join(DATA_DIR, subfolder, 'spam'))
#         for spam_file in spam_files:
#             with open(os.path.join(DATA_DIR, subfolder, 'spam', spam_file), encoding="latin-1") as f:
#                 data.append(f.read())
#                 target.append(1)

#         # ham
#         ham_files = os.listdir(os.path.join(DATA_DIR, subfolder, 'ham'))
#         for ham_file in ham_files:
#             with open(os.path.join(DATA_DIR, subfolder, 'ham', ham_file), encoding="latin-1") as f:
#                 data.append(f.read())
#                 target.append(0)

#     return data, target

In [16]:
# class SpamDetector(object):
#     """Implementation of Naive Bayes for binary classification"""
#     def clean(self, s):
#         translator = str.maketrans("", "", string.punctuation)
#         return s.translate(translator)
 
#     def tokenize(self, text):
#         text = self.clean(text).lower()
#         return re.split("\W+", text)
 
#     def get_word_counts(self, words):
#         word_counts = {}
#         for word in words:
#             word_counts[word] = word_counts.get(word, 0.0) + 1.0
#         return word_counts

In [17]:
# def fit(self, X, Y):
#     self.num_messages = {}
#     self.log_class_priors = {}
#     self.word_counts = {}
#     self.vocab = set()
 
#     n = len(X)
#     self.num_messages['spam'] = sum(1 for label in Y if label == 1)
#     self.num_messages['ham'] = sum(1 for label in Y if label == 0)
#     self.log_class_priors['spam'] = math.log(self.num_messages['spam'] / n)
#     self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n)
#     self.word_counts['spam'] = {}
#     self.word_counts['ham'] = {}
 
#     for x, y in zip(X, Y):
#         c = 'spam' if y == 1 else 'ham'
#         counts = self.get_word_counts(self.tokenize(x))
#         for word, count in counts.items():
#             if word not in self.vocab:
#                 self.vocab.add(word)
#             if word not in self.word_counts[c]:
#                 self.word_counts[c][word] = 0.0
 
#             self.word_counts[c][word] += count

In [18]:
# def predict(self, X):
#     result = []
#     for x in X:
#         counts = self.get_word_counts(self.tokenize(x))
#         spam_score = 0
#         ham_score = 0
#         for word, _ in counts.items():
#             if word not in self.vocab: continue
            
#             # add Laplace smoothing
#             log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + 1) / (self.num_messages['spam'] + len(self.vocab)) )
#             log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + 1) / (self.num_messages['ham'] + len(self.vocab)) )
 
#             spam_score += log_w_given_spam
#             ham_score += log_w_given_ham
 
#         spam_score += self.log_class_priors['spam']
#         ham_score += self.log_class_priors['ham']
 
#         if spam_score > ham_score:
#             result.append(1)
#         else:
#             result.append(0)
#     return result

### Milad's Version

In [21]:
import os
import re
import string
import math
import pandas as pd

class SpamDetector(object):
    """Implementation of Naive Bayes for binary classification"""
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)

    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)

    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Fit our classifier
        Arguments:
            X {list} -- list of document contents
            y {list} -- correct labels
        """
        self.num_messages = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        n = len(X)
        self.num_messages['spam'] = sum(1 for label in Y if label == 'spam')
        self.num_messages['ham'] = sum(1 for label in Y if label == 'ham')
        self.log_class_priors['spam'] = math.log(self.num_messages['spam'] / n )
        self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n )
        self.word_counts['spam'] = {}
        self.word_counts['ham'] = {}

        for x, y in zip(X, Y):
            c = 'spam' if y == 'spam' else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            for word, count in counts.items():
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count

    def predict(self, X):
        result = []
        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0
            for word, _ in counts.items():
                if word not in self.vocab: continue
                
                # add Laplace smoothing
                log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + 1) / (self.num_messages['spam'] + len(self.vocab)) )
                log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + 1) / (self.num_messages['ham'] + len(self.vocab)) )

                spam_score += log_w_given_spam
                ham_score += log_w_given_ham

            spam_score += self.log_class_priors['spam']
            ham_score += self.log_class_priors['ham']

            if spam_score > ham_score:
                result.append('spam')
            else:
                result.append('ham')
        return result
        

if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    data = pd.read_csv('../datasets/spam.csv',encoding='latin-1')
    data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
    data = data.rename(columns={"v1":'label', "v2":'text'})
    print(data.head())
    tags = data["label"]
    texts = data["text"]
    X, y = texts, tags
    print(len(X))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    MNB = SpamDetector()
    MNB.fit(X_train.values, y_train.values)
    print(MNB.num_messages)
#     print(MNB.word_counts)
    pred = MNB.predict(X_test.values)
    true = y_test.values
    accuracy = sum(1 for i in range(len(pred)) if pred[i] == true[i]) / float(len(pred))
    print("{0:.4f}".format(accuracy))

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5572
{'spam': 567, 'ham': 3612}
0.9641


### Using SKLearn and CountVectorizer

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

data = pd.read_csv('../datasets/spam.csv',encoding='latin-1')
print(data.head())
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":'label', "v2":'text'})
print(data.head())
tags = data["label"]
texts = data["text"]

X, y = texts, tags

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
print(X_train_dtm)

nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
X_test_dtm = vectorizer.transform(X_test)
y_pred_class = nb.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_class)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
  (0, 3286)	1
  (0, 4747)	2
  (0, 1896)	1
  (0, 875

0.9856424982053122

In [30]:
vectorizer.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08006344447',
 '0808',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448714184',
 '0845',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700435505150p',
 '08700469649',
 '08700621170150p',
 '0870121318

In [31]:
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [35]:
# first number is the amount of emails, the second number is the vocabulary size
X_train_dtm.shape

(4179, 7496)