# Unigrams, Bigrams, and Trigrams in Naive Bayes Classifiers 
# Using data from Table 13.10


In [19]:
import math

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
# Load the first two CSV columns and name them (class label, message text).
df = pd.read_csv('./table13.10.csv', usecols=[0, 1], encoding='latin-1')
df.columns = ['label', 'body']
# Encode the class label numerically: "ham" -> 0 (not spam), "spam" -> 1.
df['label'] = df['label'].replace({"ham": 0, "spam": 1})
# One [label, body] row per message.
data = df.values
data

array([[0L, u'Taipei Taiwan'],
       [0L, u'Macao Taiwan Shanghai'],
       [1L, u'Japan Sapporo'],
       [1L, u'Sapporo Osaka Taiwan']], dtype=object)

In [21]:
class ngrams_bayes(object):
    """Naive Bayes spam classifier over word n-grams with additive smoothing.

    Rows are ``[label, text]`` pairs; label ``1`` is treated as spam and any
    other label as not-spam.  The constructor splits the rows into a training
    and a test set and tokenises every message into n-grams; ``train`` then
    gathers per-class gram counts and class priors, and ``classify`` picks the
    class with the larger posterior score.
    """

    def __init__(self, data, n=2, split=0.75, random_state=None):
        """Split ``data`` and convert every message into its n-grams.

        data         -- sequence of ``[label, text]`` rows.
        n            -- gram size (1 = unigrams, 2 = bigrams, ...).
        split        -- forwarded to ``train_test_split`` as ``train_size``.
        random_state -- forwarded to ``train_test_split``; the default
                        ``None`` keeps the original randomised split, any
                        fixed value makes the split reproducible.
        """
        # split into training and testing data
        train_rows, test_rows = train_test_split(data,
                                                 train_size=split,
                                                 random_state=random_state)
        # convert into n grams
        self.train_data = [[row[0], self.ngrams(n, row[1])] for row in train_rows]
        self.test_data = [[row[0], self.ngrams(n, row[1])] for row in test_rows]

        # count unique n grams in training data (vocabulary size used by the
        # smoothing denominator)
        self.unique = len({gram for row in self.train_data for gram in row[1]})

        # Smoothing strength.  classify() overwrites it on every call; it is
        # initialised here so conditionalNgram()/conditionalText() can also be
        # called directly without raising AttributeError.
        self.alpha = 1.0

        self._reset_counts()

    def _reset_counts(self):
        """Clear every learned count and prior so training starts from scratch."""
        # per-class gram -> occurrence count
        self.trainPositive = {}
        self.trainNegative = {}
        # total number of grams seen per class, and number of spam messages
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        # priors: P(spam) and P(not spam)
        self.pA = 0
        self.pNotA = 0

    def ngrams(self, n, text):
        """Return the list of space-joined word n-grams of ``text``.

        Words are obtained by splitting on single spaces.  A text with fewer
        than ``n`` words yields an empty list.  Raises ValueError if ``n`` is
        smaller than 1, since such a size does not describe an n-gram.
        """
        if n < 1:
            raise ValueError("n must be at least 1, got {!r}".format(n))
        words = text.split(' ')
        return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    def train(self):
        """Count grams per class over ``train_data`` and compute the priors.

        Counts are reset first, so calling ``train`` repeatedly gives the same
        model instead of accumulating duplicate counts.  Raises
        ZeroDivisionError if ``train_data`` is empty.
        """
        self._reset_counts()

        for item in self.train_data:
            label = item[0]
            grams = item[1]
            if label == 1:
                self.spamCount += 1
                counts = self.trainPositive
                self.posGramCount += len(grams)
            else:
                counts = self.trainNegative
                self.negGramCount += len(grams)
            for gram in grams:
                counts[gram] = counts.get(gram, 0) + 1

        self.pA = self.spamCount / float(len(self.train_data))
        self.pNotA = 1.0 - self.pA

    def classify(self, text, alpha=1.0):
        """Return 1 if the grams in ``text`` look like spam, otherwise 0.

        text  -- iterable of n-grams (as produced by ``ngrams``), not a raw
                 string.
        alpha -- additive smoothing constant; stored on ``self.alpha``.

        Scores are compared in log space: multiplying many probabilities
        underflows to 0.0 for long messages, which made both classes tie and
        the message always come out as not-spam.  Ties still resolve to 0.
        """
        self.alpha = alpha
        spam_score = self._log_joint(text, 1, self.pA)
        ham_score = self._log_joint(text, 0, self.pNotA)
        return 1 if spam_score > ham_score else 0

    def _log_joint(self, grams, label, prior):
        """Return log(prior * product of P(gram | label)), or -inf if that is 0."""
        if prior <= 0.0:
            return float('-inf')
        total = math.log(prior)
        for gram in grams:
            likelihood = self.conditionalNgram(gram, label)
            if likelihood <= 0.0:
                # an impossible gram (only reachable with alpha == 0)
                return float('-inf')
            total += math.log(likelihood)
        return total

    def conditionalText(self, grams, label):
        """Return the product of P(gram | label) over ``grams``.

        Kept as a plain product for callers that use it directly; it may
        underflow to 0.0 for long gram lists.
        """
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram, label)
        return result

    def conditionalNgram(self, ngram, label):
        """Return the smoothed estimate of P(ngram | label).

        Computed as (count + alpha) / (class gram total + alpha * unique),
        using the spam counts when ``label == 1`` and the not-spam counts
        otherwise.
        """
        alpha = self.alpha
        if label == 1:
            counts, class_total = self.trainPositive, self.posGramCount
        else:
            counts, class_total = self.trainNegative, self.negGramCount
        return (counts.get(ngram, 0) + alpha) / float(class_total + alpha * self.unique)

    def evaluate_test_data(self):
        """Classify every row of ``test_data``; print and return the accuracy.

        The returned value is the fraction (0.0 - 1.0) of rows whose predicted
        label equals the true label.  Raises ZeroDivisionError if
        ``test_data`` is empty.
        """
        results = [1 if self.classify(row[1]) == row[0] else 0
                   for row in self.test_data]
        accuracy = sum(results) / float(len(results))
        print("Evaluated {} test cases. {:.2f}% Accuracy".format(len(results), 100.0 * accuracy))
        return accuracy

In [22]:
# Unigram model (n=1); show the rows held out for testing.
unigram_bayes = ngrams_bayes(data, n=1)
unigram_bayes.test_data

[[0L, [u'Macao', u'Taiwan', u'Shanghai']]]

In [23]:
# Inspect the training rows: each is [label, list of n-grams].
unigram_bayes.train_data

[[0L, [u'Taipei', u'Taiwan']],
 [1L, [u'Sapporo', u'Osaka', u'Taiwan']],
 [1L, [u'Japan', u'Sapporo']]]

In [24]:
# Count gram occurrences per class and compute the class priors.
unigram_bayes.train()

In [25]:
# Classify the held-out rows; prints the accuracy and returns it as a fraction.
unigram_bayes.evaluate_test_data()

Evaluated 1 test cases. 100.00% Accuracy


1.0

In [26]:
# Bigram model: build (with its own random train/test split), train, score.
bigram_sms = ngrams_bayes(data, n=2)
bigram_sms.train()
bigram_sms.evaluate_test_data()


Evaluated 1 test cases. 0.00% Accuracy


0.0

In [27]:
# Trigram model: build (with its own random train/test split), train, score.
trigram_sms = ngrams_bayes(data, n=3)
trigram_sms.train()
trigram_sms.evaluate_test_data()

Evaluated 1 test cases. 0.00% Accuracy


0.0

Here we can see that our Bayesian classifier performs well with unigrams, OK with bigrams, and is basically guessing randomly when it comes to trigrams. Note, however, that the runs shown above each evaluate a single held-out message, so the reported accuracy can only be 0% or 100%, and every classifier instance draws its own random train/test split; the comparison only becomes meaningful on a larger corpus. Increasing the size of the grams does not help this classifier classify this dataset. This is likely because the data is made up of short messages with highly specific and colloquial words: almost none of the trigrams occur more than once in this dataset. I imagine that larger n-grams used in a Bayesian classifier would work well with something like product reviews, which are longer than text messages and use less colloquial language.