# Unigrams, Bigrams, and Trigrams in Naive Bayes Classifiers

Math of Intelligence Week 6 Challenge - https://www.youtube.com/watch?v=PrkiRVcrxOs&t=7s

In this notebook I will explore the performance of n-gram word features in a naive Bayes classifier. I will look at how they perform across three data sets: 
    1) Table 13.1 
    2) Table 13.10
    3) Table IAML

In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [145]:
def load_labeled_table(path, negative, positive):
    """Read a two-column CSV of (label, body) rows into a NumPy array.

    Only the first two CSV columns are read; they are renamed to ``label``
    and ``body``. The label strings ``negative`` and ``positive`` are
    replaced by 0 and 1 respectively; any other label value is left as-is.

    Args:
        path: location of the CSV file (read as latin-1).
        negative: label string to encode as 0.
        positive: label string to encode as 1.

    Returns:
        The frame's ``.values`` array, one ``[label, body]`` row per CSV row.
    """
    table = pd.read_csv(path, usecols=[0, 1], encoding='latin-1')
    table.columns = ['label', 'body']
    table['label'] = table['label'].replace([negative, positive], [0, 1])
    return table.values


# The two textbook tables use yes/no labels; the IAML table uses ham/spam.
t1 = load_labeled_table('data/13.1.csv', 'no', 'yes')
t2 = load_labeled_table('data/13.10.csv', 'no', 'yes')
t3 = load_labeled_table('data/IAML.csv', 'ham', 'spam')

# Table 13.1

In [146]:
# Show the full Table 13.1 array; the call form of print works on Python 2 and 3.
print(t1)

[[1 u'Chinese Beijing Chinese']
 [1 u'Chinese Chinese Shanghai']
 [1 u'Chinese Macao']
 [0 u'Tokyo Japan Chinese']
 [1 u'Chinese Chinese Chinese Tokyo Japan']]


# Table 13.10

In [147]:
# Show the full Table 13.10 array; the call form of print works on Python 2 and 3.
print(t2)

[[1 u'Taipei Taiwan']
 [1 u'Macao Taiwan Shanghai']
 [0 u'Japan Sapporo']
 [0 u'Sapporo Osaka Taiwan']
 [0 u'Taiwan Taiwan Sapporo']]


# Table IAML

In [148]:
# Show the full IAML array; the call form of print works on Python 2 and 3.
print(t3)

[[1 u'send us your password']
 [0 u'send us your review']
 [0 u'review your password']
 [1 u'review us']
 [1 u'send your password']
 [1 u'send us your account']
 [0 u'review us now']]


In [149]:
class ngrams_bayes(object):
    """Naive Bayes text classifier over word n-grams with add-alpha smoothing.

    Each row of the input data is ``[label, text]``. Label ``1`` is the
    positive class; every other label value is counted as the negative class.

    Typical use: construct the object, call :meth:`train`, then either
    :meth:`classify` a list of n-grams or call :meth:`evaluate_test_data`.
    """

    def __init__(self, data, n=2, split=0.75, verbose=True):
        """Split ``data`` into train/test parts and tokenize both into n-grams.

        Args:
            data: sequence of ``[label, text]`` rows.
            n: number of words per gram (1 = unigrams, 2 = bigrams, ...).
            split: forwarded to ``train_test_split`` as ``train_size``.
            verbose: when true (the default, which matches the previous
                behavior) print the raw training and test rows.
        """
        # shuffle=False keeps the row order, so the test rows are always the
        # tail of ``data``.
        raw_train, raw_test = train_test_split(data, train_size=split,
                                               shuffle=False)
        if verbose:
            print(raw_train)
            print(raw_test)

        # convert into n grams: each entry becomes [label, [gram, gram, ...]]
        self.train_data = [[row[0], self.ngrams(n, row[1])] for row in raw_train]
        self.test_data = [[row[0], self.ngrams(n, row[1])] for row in raw_test]

        # number of distinct n-grams in the training data; this is the
        # vocabulary size used in the smoothing denominator
        self.unique = len(set(gram
                              for _, grams in self.train_data
                              for gram in grams))

        # smoothing strength; classify() overwrites it with its own argument.
        # Set here so the conditional* methods are usable before classify().
        self.alpha = 1.0
        self._reset_counts()

    def _reset_counts(self):
        """Clear every statistic that train() accumulates."""
        # per-class n-gram frequency tables
        self.trainPositive = {}
        self.trainNegative = {}
        # total n-gram occurrences per class and number of positive documents
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        # class priors
        self.pA = 0
        self.pNotA = 0

    def ngrams(self, n, text):
        """Return the list of word n-grams of ``text``, in order.

        Words are separated by any run of whitespace, so repeated, leading or
        trailing spaces never produce empty tokens. A text with fewer than
        ``n`` words yields an empty list.

        Raises:
            ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer, got {!r}".format(n))
        words = text.split()
        return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    def train(self):
        """Count n-grams per class and compute the class priors.

        All counters are reset first, so calling this method again (for
        example by re-running a notebook cell) does not double count.
        """
        self._reset_counts()

        for label, grams in self.train_data:
            if label == 1:
                self.spamCount += 1
                table = self.trainPositive
                self.posGramCount += len(grams)
            else:
                table = self.trainNegative
                self.negGramCount += len(grams)
            for gram in grams:
                table[gram] = table.get(gram, 0) + 1

        self.pA = self.spamCount / float(len(self.train_data))
        self.pNotA = 1.0 - self.pA

    def classify(self, text, alpha=1.0):
        """Return 1 if the positive class scores strictly higher, else 0.

        Args:
            text: list of n-grams (as produced by :meth:`ngrams`).
            alpha: additive smoothing constant; stored on ``self.alpha``.

        Scores are compared in log space: multiplying many probabilities
        underflows to 0.0 for long inputs, which would make both classes tie
        and always return 0.
        """
        self.alpha = alpha
        isSpam = self._logScore(self.pA, text, 1)
        notSpam = self._logScore(self.pNotA, text, 0)
        return 1 if isSpam > notSpam else 0

    def _logScore(self, prior, grams, label):
        """Return log(prior * P(grams | label)), or -inf if that product is 0."""
        if prior <= 0.0:
            return float('-inf')
        total = np.log(prior)
        for ngram in grams:
            probability = self.conditionalNgram(ngram, label)
            if probability <= 0.0:
                return float('-inf')
            total += np.log(probability)
        return total

    def conditionalText(self, grams, label):
        """Return the product of the per-n-gram probabilities for ``label``.

        This plain product can underflow for long inputs; classify() does not
        rely on it.
        """
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram, label)
        return result

    def conditionalNgram(self, ngram, label):
        """Return the smoothed probability of ``ngram`` given ``label``.

        Computed as ``(count + alpha) / (class_total + alpha * unique)`` with
        the positive-class statistics when ``label == 1`` and the
        negative-class statistics otherwise. Returns 0.0 when the denominator
        is zero (no n-grams for the class and nothing added by smoothing).
        """
        alpha = self.alpha
        if label == 1:
            table, classTotal = self.trainPositive, self.posGramCount
        else:
            table, classTotal = self.trainNegative, self.negGramCount
        denominator = float(classTotal + alpha * self.unique)
        if denominator == 0.0:
            return 0.0
        return (table.get(ngram, 0) + alpha) / denominator

    def evaluate_test_data(self, alpha=1.0):
        """Classify every test document, print and return the accuracy.

        Args:
            alpha: smoothing constant forwarded to :meth:`classify`.

        Returns:
            Fraction of test documents whose predicted label equals the true
            label, or 0.0 when there are no test documents.
        """
        total = len(self.test_data)
        if total == 0:
            print("Evaluated 0 test cases.")
            return 0.0

        correct = sum(1 for label, grams in self.test_data
                      if self.classify(grams, alpha) == label)
        print("Evaluated {} test cases. {:.2f}% Accuracy".format(
            total, 100.0 * correct / float(total)))
        return correct / float(total)

In [158]:
%%capture
classifier13_1 = ngrams_bayes(t1,n=1,split=0.8);

classifier13_10 = ngrams_bayes(t2,n=1,split=0.8);

classifierIAML = ngrams_bayes(t3,n=1,split=0.89);

# Test sets

In [159]:
# Held-out rows of Table 13.1; the call form of print works on Python 2 and 3.
print(classifier13_1.test_data)

[[1, [u'Chinese', u'Chinese', u'Chinese', u'Tokyo', u'Japan']]]


In [160]:
# Held-out rows of Table 13.10; the call form of print works on Python 2 and 3.
print(classifier13_10.test_data)

[[0, [u'Taiwan', u'Taiwan', u'Sapporo']]]


In [161]:
# Held-out rows of the IAML table; the call form of print works on Python 2 and 3.
print(classifierIAML.test_data)

[[0, [u'review', u'us', u'now']]]


# Training the classifiers

In [162]:
# Fit each classifier on its own training split.
for clf in (classifier13_1, classifier13_10, classifierIAML):
    clf.train()

# Running the test sets

In [163]:
# The test split holds a single document; element [1] is its n-gram list.
grams_13_1 = classifier13_1.test_data[0][1]
classifier13_1.classify(grams_13_1)

1

In [164]:
# The test split holds a single document; element [1] is its n-gram list.
grams_13_10 = classifier13_10.test_data[0][1]
classifier13_10.classify(grams_13_10)

0

In [165]:
# The test split holds a single document; element [1] is its n-gram list.
grams_iaml = classifierIAML.test_data[0][1]
classifierIAML.classify(grams_iaml)

0

# Results

Table 13.1

"Chinese Chinese Chinese Tokyo Japan" belongs to class c (in China).

Table 13.10

"Taiwan Taiwan Sapporo" belongs to class ~c (not in China).

Table IAML

"review us now" belongs to class ham (not spam).