In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import ClassifierMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from math import log, factorial
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



class NaiveBayesFilter(ClassifierMixin):
    '''
    A Naive Bayes Classifier that sorts messages in to spam or ham.

    Messages are split on whitespace and every token is treated as a word.
    Words that never occurred in the training data are skipped when a
    message is scored.

    Attributes (all set by fit):
        data (pd.DataFrame): word counts; rows are 'spam' and 'ham', and
            there is one column per word seen during training
        prob_spam (float): fraction of training messages labelled 'spam'
        prob_ham (float): fraction of training messages labelled 'ham'
        words (list): every word seen during training
    '''

    def __init__(self):
        return

    def fit(self, X, y):
        '''
        Create a table that will allow the filter to evaluate P(H), P(S)
        and P(w|C)

        Parameters:
            X (pd.Series): training data
            y (pd.Series): training labels, looked up by the index
                labels of X

        Returns:
            self: the fitted filter, so calls can be chained
        '''
        #count, for every word, how often it occurs in spam and in ham
        count_dic = {}
        for idx, tokens in X.str.split().items():
            label = y.loc[idx]
            bin_spam = int(label == 'spam')
            bin_ham = int(label == 'ham')

            for word in tokens:
                num_spam, num_ham = count_dic.get(word, (0, 0))
                count_dic[word] = (num_spam + bin_spam, num_ham + bin_ham)

        self.data = pd.DataFrame(count_dic, index=['spam', 'ham'])

        #class priors; a class missing from y gets prior 0 instead of
        #raising a KeyError
        nmess = y.size
        labels = y.value_counts().to_dict()
        self.prob_spam = labels.get('spam', 0) / nmess
        self.prob_ham = labels.get('ham', 0) / nmess
        self.words = list(count_dic)

        return self

    @staticmethod
    def _known_word_counts(tokens, vocab):
        '''
        Count how often each word of one message occurs, keeping only
        the words contained in vocab.
        '''
        counts = {}
        for word in tokens:
            if word in vocab:
                counts[word] = counts.get(word, 0) + 1
        return counts

    @staticmethod
    def _labels_from_scores(scores):
        '''
        Turn an (N,2) score matrix (column 0 ham, column 1 spam) into a
        list of labels; ties are labelled 'ham'.
        '''
        return ['ham' if ham >= spam else 'spam'
                for ham, spam in scores.tolist()]

    def _class_tables(self):
        '''
        Return (ham_counts, spam_counts, ham_total, spam_total, vocab)
        computed once from self.data, so the scoring loops do not have to
        repeat the DataFrame lookups for every word.
        '''
        ham_row = self.data.loc['ham']
        spam_row = self.data.loc['spam']
        ham_counts = dict(zip(self.data.columns, ham_row.to_numpy()))
        spam_counts = dict(zip(self.data.columns, spam_row.to_numpy()))
        return (ham_counts, spam_counts,
                ham_row.sum(), spam_row.sum(), set(self.words))

    def predict_proba(self, X):
        '''
        Find P(C=k|x) for each x in X and for each class k by computing
        P(C=k)P(x|C=k). The values are not normalised to sum to one.

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,2): Probability each message is ham, spam
                0 column is ham
                1 column is spam
        '''
        ham_counts, spam_counts, ham_total, spam_total, vocab = \
            self._class_tables()

        prob_matrix = []
        for tokens in X.str.split():
            counts = self._known_word_counts(tokens, vocab)

            #MLE estimate (count/total) raised to the number of occurrences.
            #The parentheses matter: ** binds tighter than /.
            ham_likelihood = [(ham_counts[word] / ham_total) ** k
                              for word, k in counts.items()]
            spam_likelihood = [(spam_counts[word] / spam_total) ** k
                               for word, k in counts.items()]

            l_ham = self.prob_ham * np.prod(ham_likelihood)
            l_spam = self.prob_spam * np.prod(spam_likelihood)
            prob_matrix.append([l_ham, l_spam])

        #reshape so that an empty X still yields a (0,2) matrix
        return np.array(prob_matrix, dtype=float).reshape(-1, 2)

    def predict(self, X):
        '''
        Use self.predict_proba to assign labels to X,
        the label will be a string that is either 'spam' or 'ham'

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (list)(N,): label for each message; 'ham' wins ties
        '''
        return self._labels_from_scores(self.predict_proba(X))

    def predict_log_proba(self, X):
        '''
        Find ln(P(C=k|x)) for each x in X and for each class k, up to an
        additive constant. Word probabilities are smoothed as
        (count + 1) / (total + 2).

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,2): Log probability each message is ham, spam
                0 column is ham
                1 column is spam
        '''
        ham_counts, spam_counts, ham_total, spam_total, vocab = \
            self._class_tables()
        log_prior_ham = np.log(self.prob_ham)
        log_prior_spam = np.log(self.prob_spam)

        prob_matrix = []
        for tokens in X.str.split():
            counts = self._known_word_counts(tokens, vocab)

            #each word contributes (occurrences) * ln(smoothed probability)
            ham_likelihood = [
                k * np.log((ham_counts[word] + 1) / (ham_total + 2))
                for word, k in counts.items()]
            spam_likelihood = [
                k * np.log((spam_counts[word] + 1) / (spam_total + 2))
                for word, k in counts.items()]

            prob_matrix.append([np.sum(ham_likelihood) + log_prior_ham,
                                np.sum(spam_likelihood) + log_prior_spam])

        #reshape so that an empty X still yields a (0,2) matrix
        return np.array(prob_matrix, dtype=float).reshape(-1, 2)

    def predict_log(self, X):
        '''
        Use self.predict_log_proba to assign labels to X,
        the label will be a string that is either 'spam' or 'ham'

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (list)(N,): label for each message; 'ham' wins ties
        '''
        return self._labels_from_scores(self.predict_log_proba(X))

In [2]:
class PoissonBayesFilter(ClassifierMixin):
    '''
    A Naive Bayes Classifier that sorts messages in to spam or ham.
    This classifier assumes that words are distributed like
    Poisson random variables

    Messages are split on whitespace and every token is treated as a word.
    Words that never occurred in the training data are skipped when a
    message is scored.

    Attributes (all set by fit):
        data (pd.DataFrame): word counts; rows are 'spam' and 'ham', and
            there is one column per word seen during training
        prob_spam (float): fraction of training messages labelled 'spam'
        prob_ham (float): fraction of training messages labelled 'ham'
        spam_rates (dict): word -> per-word Poisson rate in spam
        ham_rates (dict): word -> per-word Poisson rate in ham
        words (list): every word seen during training
    '''

    def __init__(self):
        return


    def fit(self, X, y):
        '''
        Uses bayesian inference to find the poisson rate for each word
        found in the training set. For this we will use the formulation
        of l = rt since we have variable message lengths.

        This method creates a tool that will allow the filter to
        evaluate P(H), P(S), and P(w|C)


        Parameters:
            X (pd.Series): training data
            y (pd.Series): training labels, looked up by the index
                labels of X

        Returns:
            self: the fitted filter, so calls can be chained
        '''

        #count, for every word, how often it occurs in spam and in ham
        count_dic = {}
        for idx, tokens in X.str.split().items():
            label = y.loc[idx]
            bin_spam = int(label == 'spam')
            bin_ham = int(label == 'ham')

            for word in tokens:
                num_spam, num_ham = count_dic.get(word, (0, 0))
                count_dic[word] = (num_spam + bin_spam, num_ham + bin_ham)

        self.data = pd.DataFrame(count_dic, index=['spam', 'ham'])

        #class priors; a class missing from y gets prior 0 instead of
        #raising a KeyError
        nmess = y.size
        labels = y.value_counts().to_dict()
        self.prob_spam = labels.get('spam', 0) / nmess
        self.prob_ham = labels.get('ham', 0) / nmess

        #rate of each word: (occurrences + 1) / (total words in class + 2),
        #computed for the whole vocabulary at once
        ham_counts = self.data.loc['ham']
        spam_counts = self.data.loc['spam']
        self.ham_rates = ((ham_counts + 1) / (ham_counts.sum() + 2)).to_dict()
        self.spam_rates = ((spam_counts + 1) / (spam_counts.sum() + 2)).to_dict()

        self.words = list(count_dic)

        return self

    @staticmethod
    def _known_word_counts(tokens, vocab):
        '''
        Count how often each word of one message occurs, keeping only
        the words contained in vocab.
        '''
        counts = {}
        for word in tokens:
            if word in vocab:
                counts[word] = counts.get(word, 0) + 1
        return counts

    def predict_log_proba(self, X):
        '''
        Find ln(P(C=k|x)) for each x in X and for each class, up to an
        additive constant. Each known word's count is scored with a
        Poisson pmf whose mean is (word rate) * (number of tokens in
        the message).

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (ndarray)(N,2): Log probability each message is ham or spam
                column 0 is ham, column 1 is spam
        '''

        vocab = set(self.words)
        log_prior_ham = np.log(self.prob_ham)
        log_prior_spam = np.log(self.prob_spam)

        prob_matrix = []
        for tokens in X.str.split():
            #message length counts every token, known or not
            n = len(tokens)
            counts = self._known_word_counts(tokens, vocab)

            if counts:
                k = np.array(list(counts.values()))
                ham_mu = np.array([self.ham_rates[w] for w in counts]) * n
                spam_mu = np.array([self.spam_rates[w] for w in counts]) * n
                #logpmf stays finite where log(pmf) would underflow to -inf
                l_ham = stats.poisson.logpmf(k, ham_mu).sum()
                l_spam = stats.poisson.logpmf(k, spam_mu).sum()
            else:
                #no known words: only the priors contribute
                l_ham = l_spam = 0.0

            prob_matrix.append([l_ham + log_prior_ham,
                                l_spam + log_prior_spam])

        #reshape so that an empty X still yields a (0,2) matrix
        return np.array(prob_matrix, dtype=float).reshape(-1, 2)

    def predict(self, X):
        '''
        Use self.predict_log_proba to assign labels to X

        Parameters:
            X (pd.Series)(N,): messages to classify

        Return:
            (list)(N,): label for each message, 'ham' or 'spam';
                'ham' wins ties
        '''

        prob_mat = self.predict_log_proba(X)
        return ['ham' if ham >= spam else 'spam'
                for ham, spam in prob_mat.tolist()]

In [5]:
def sklearn_method(X_train, y_train, X_test):
    '''
    Classify X_test with sklearn: a CountVectorizer fitted on X_train
    turns the messages into word counts, and a MultinomialNB model
    fitted on those counts labels the test messages.

    Parameters:
        X_train (pandas.Series): messages to train on
        y_train (pandas.Series): labels for X_train
        X_test  (pandas.Series): messages to classify

    Returns:
        (ndarray): classification of X_test
    '''
    #the vocabulary is learned from the training messages only
    bag_of_words = CountVectorizer()
    train_matrix = bag_of_words.fit_transform(X_train)

    #train the multinomial naive Bayes model on the training counts
    model = MultinomialNB().fit(train_matrix, y_train)

    #encode the test messages with the same vocabulary and label them
    return model.predict(bag_of_words.transform(X_test))

In [6]:
def test_fit():
    '''
    Fit NaiveBayesFilter on the first 300 messages and check the stored
    ham and spam counts for the word 'in'.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    spam_filter = NaiveBayesFilter()
    spam_filter.fit(messages[:300], targets[:300])
    counts = spam_filter.data
    assert counts.loc['ham','in'] == 47
    assert counts.loc['spam','in'] == 4
test_fit()

In [9]:
def test_predict_proba():
    '''
    Fit NaiveBayesFilter on the first 300 messages and compare
    predict_proba on the slice 530:535 with stored reference values.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    spam_filter = NaiveBayesFilter()
    spam_filter.fit(messages[:300], targets[:300])
    expected = np.array([
        [8.33609611e-16, 0.00000000e+00],
        [0.00000000e+00, 2.26874221e-44],
        [0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00],
        [2.50103642e-10, 0.00000000e+00],
    ])
    computed = spam_filter.predict_proba(messages[530:535])
    # NOTE(review): every expected value is below np.allclose's default
    # atol (1e-8), so this assertion also accepts an all-zero result;
    # consider passing atol=0 once the reference values are confirmed.
    assert np.allclose(computed, expected)
test_predict_proba()

In [10]:
def test_predict():
    '''
    Fit NaiveBayesFilter on the first 300 messages and check the labels
    predict assigns to the slice 530:535.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    spam_filter = NaiveBayesFilter()
    spam_filter.fit(messages[:300], targets[:300])
    expected = np.array(['ham', 'spam', 'ham', 'ham', 'ham'])
    predicted = spam_filter.predict(messages[530:535])
    assert (predicted == expected).all()
test_predict()

In [11]:
def test_predict_log_proba():
    '''
    Fit NaiveBayesFilter on the first 300 messages and compare
    predict_log_proba on the slice 530:535 with stored reference values.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    spam_filter = NaiveBayesFilter()
    spam_filter.fit(messages[:300], targets[:300])
    expected = np.array([
        [ -33.39347149,  -35.34710583],
        [-106.83571245,  -93.63509276],
        [ -57.05676356,  -58.34010293],
        [ -19.22723879,  -20.19409107],
        [ -21.5513236 ,  -26.18555562],
    ])
    computed = spam_filter.predict_log_proba(messages[530:535])
    assert np.allclose(computed, expected)
test_predict_log_proba()

In [12]:
def test_predict_log():
    '''
    Fit NaiveBayesFilter on the first 300 messages and check the labels
    predict_log assigns to the slice 530:535.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    spam_filter = NaiveBayesFilter()
    spam_filter.fit(messages[:300], targets[:300])
    expected = np.array(['ham', 'spam', 'ham', 'ham', 'ham'])
    predicted = spam_filter.predict_log(messages[530:535])
    assert (predicted == expected).all()
test_predict_log()

In [13]:
def test_poisson_fit():
    '''
    Fit PoissonBayesFilter on the first 300 messages and check the ham
    and spam rates stored for the word 'in'.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    poisson_filter = PoissonBayesFilter()
    poisson_filter.fit(messages[:300], targets[:300])
    ham_rate = poisson_filter.ham_rates['in']
    spam_rate = poisson_filter.spam_rates['in']
    assert np.isclose(ham_rate, 0.012588512981904013)
    assert np.isclose(spam_rate, 0.004166666666666667)
test_poisson_fit()

In [14]:
def test_poisson_predict_log_proba():
    '''
    Fit PoissonBayesFilter on the first 300 messages and compare
    predict_log_proba on the slice 530:535 with stored reference values.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    poisson_filter = PoissonBayesFilter()
    poisson_filter.fit(messages[:300], targets[:300])
    expected = np.array([
        [-21.42246084, -23.29712325],
        [-58.14578114, -44.50623148],
        [-38.22508624, -39.48322892],
        [-14.45137719, -15.419944  ],
        [-16.23273939, -20.68704484],
    ])
    computed = poisson_filter.predict_log_proba(messages[530:535])
    assert np.allclose(computed, expected)
test_poisson_predict_log_proba()

In [15]:
def test_poisson_predict():
    '''
    Fit PoissonBayesFilter on the first 300 messages and check the
    labels predict assigns to the slice 530:535.
    '''
    sms = pd.read_csv("sms_spam_collection.csv")
    messages, targets = sms.Message, sms.Label
    poisson_filter = PoissonBayesFilter()
    poisson_filter.fit(messages[:300], targets[:300])
    expected = np.array(['ham', 'spam', 'ham', 'ham', 'ham'])
    predicted = poisson_filter.predict(messages[530:535])
    assert (predicted == expected).all()
test_poisson_predict()

In [18]:
def test_sklearn_method(random_state=0):
    '''
    Compare both hand-written filters with sklearn's MultinomialNB.

    The data is split into 30% training and 70% test messages. The two
    printed numbers are the fraction of test messages on which
    NaiveBayesFilter.predict_log and PoissonBayesFilter.predict agree
    with the labels returned by sklearn_method; the true test labels
    are not used.

    Parameters:
        random_state (int): seed for the train/test split, so the
            printed scores are the same on every run
    '''
    df = pd.read_csv('sms_spam_collection.csv')
    X, y = df['Message'],df['Label']

    #seeded split: without a seed the printed scores change on every run
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=0.7, random_state=random_state)

    #sklearn's predictions act as the reference labels
    actual_labels = sklearn_method(X_train, y_train, X_test)

    nbclassifier = NaiveBayesFilter()
    nbclassifier.fit(X_train, y_train)
    nbclassifier_labels = nbclassifier.predict_log(X_test)
    print(accuracy_score(actual_labels,nbclassifier_labels))

    pbclassifier = PoissonBayesFilter()
    pbclassifier.fit(X_train,y_train)
    pbclassifier_labels = pbclassifier.predict(X_test)
    print(accuracy_score(actual_labels,pbclassifier_labels))
test_sklearn_method()

0.9789797487823635
0.9797487823634965
