# CS760 HW4 - Naive Bayes
# By: Luke Neuendorf
***
## Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10000) 
pd.set_option('display.max_columns', 10000) 
import string
import NaiveBayes
from collections import Counter

***
## Problem 1
Use files 0.txt to 9.txt in each language as the training data. Estimate the prior probabilities $\hat{p}(y = e)$, $\hat{p}(y = j)$, $\hat{p}(y = s)$ using additive smoothing with parameter $\frac{1}{2}$.

In [2]:
def get_bagOfCharacters(file_list=None):
    """Load text files and keep only the characters of the 27-token vocabulary.

    Every name in ``file_list`` is read from ``data/<name>.txt``. Characters
    other than the lowercase ASCII letters and the space are dropped.

    Parameters
    ----------
    file_list : iterable of str, optional
        File stems such as ``'e0'``. The first character of each stem is
        stored as that document's label. Defaults to no files.

    Returns
    -------
    list of dict
        One dict per file, in input order, with the keys ``'x'`` (list of the
        kept characters, in file order), ``'y'`` (first character of the stem)
        and ``'numchars'`` (``len`` of ``'x'``).
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if file_list is None:
        file_list = []

    # A set gives constant-time membership tests in the filter below.
    acceptable_chars = set(string.ascii_lowercase + " ")

    data = []
    # Keep only the vocabulary tokens (letters + space) of each file.
    for file_name in file_list:
        with open('data/'+file_name+'.txt', 'r') as file:
            file_contents = file.read()
        kept_chars = [char for char in file_contents if char in acceptable_chars]
        data.append({
            'x': kept_chars,
            'y': file_name[0],
            'numchars': len(kept_chars),
        })
    return data

In [3]:
# Training documents 0-9 of every language, grouped language by language
# in the order e, s, j.
training_file_list = [
    language + str(doc_id)
    for language in ('e', 's', 'j')
    for doc_id in range(10)
]
train_data = get_bagOfCharacters(training_file_list)

In [4]:
# Vocabulary: 'a'..'z' followed by the space character (27 tokens).
characters = [*string.ascii_lowercase, " "]
# One single-letter label per language.
languages = list("esj")

In [5]:
# Fit the classifier with additive smoothing parameter alpha = 1/2.
# K_L / K_S receive the number of labels (3) and the vocabulary size (27);
# presumably these are the smoothing category counts -- verify against
# NaiveBayesClassifier.train.
model = NaiveBayes.NaiveBayesClassifier()
model.train(
    train_data,
    characters,
    languages,
    alpha=0.5,
    K_L=len(languages),
    K_S=len(characters),
)

In [6]:
# Smoothed prior estimate for every language label.
prior_by_language = model.get_prior_pr_dict()
print("p_hat(y=language): {}".format(prior_by_language))

p_hat(y=language): {'e': 0.3333333333333333, 's': 0.3333333333333333, 'j': 0.3333333333333333}


***
## Problem 2
Using the same training data, estimate the class conditional probability (multinomial parameter) for English
$θ_{i,e}\;:=\;\hat{p}(c_i |y=e)$
where $c_i$ is the i-th character. That is, $c_1$ = a, . . . , $c_{26}$ = z, $c_{27}$ = space. Again, use additive smoothing with parameter $\frac{1}{2}$. Give the formula for additive smoothing with parameter $\frac{1}{2}$ in this case. Print $θ_e$ which is a vector with 27 elements.

In [7]:
# Display the class-conditional character probabilities stored under label 'e'.
print("θ_e:")
# Left as the bare final expression so the notebook renders the dict.
model.get_conditional_pr_dict()['e']

θ_e:


{'a': 0.0601685114819098,
 'b': 0.011134974392863043,
 'c': 0.021509995043779945,
 'd': 0.021972575582355856,
 'e': 0.1053692383941847,
 'f': 0.018932760614571286,
 'g': 0.017478936064761277,
 'h': 0.047216256401784236,
 'i': 0.055410540227986124,
 'j': 0.001420783082768875,
 'k': 0.0037336857756484387,
 'l': 0.028977366595076822,
 'm': 0.020518751032545846,
 'n': 0.057921691723112505,
 'o': 0.06446390219725756,
 'p': 0.01675202378985627,
 'q': 0.0005617049396993227,
 'r': 0.053824549810011564,
 's': 0.06618205848339666,
 't': 0.08012555757475633,
 'u': 0.026664463902197257,
 'v': 0.009284652238559392,
 'w': 0.015496448042293078,
 'x': 0.001156451346439782,
 'y': 0.013844374690236246,
 'z': 0.0006277878737815959,
 ' ': 0.1792499586981662}

***
## Problem 3
Print $θ_j$, $θ_s$, the class conditional probabilities for Japanese and Spanish.

In [8]:
# Display the class-conditional character probabilities stored under label 'j'.
print("θ_j:")
# Left as the bare final expression so the notebook renders the dict.
model.get_conditional_pr_dict()['j']

θ_j:


{'a': 0.1317656102589189,
 'b': 0.010866906600510151,
 'c': 0.005485866033054963,
 'd': 0.01722631818022992,
 'e': 0.06020475907613823,
 'f': 0.003878542227191726,
 'g': 0.014011670568503443,
 'h': 0.03176211607673224,
 'i': 0.09703343932352633,
 'j': 0.0023411020650616725,
 'k': 0.05740941332681086,
 'l': 0.001432614696530277,
 'm': 0.03979873510604843,
 'n': 0.05671057688947902,
 'o': 0.09116321324993885,
 'p': 0.0008735455466648031,
 'q': 0.00010482546559977637,
 'r': 0.04280373178657535,
 's': 0.0421747789929767,
 't': 0.056990111464411755,
 'u': 0.07061742199238269,
 'v': 0.0002445927530661449,
 'w': 0.01974212935462455,
 'x': 3.4941821866592126e-05,
 'y': 0.01415143785596981,
 'z': 0.00772214263251686,
 ' ': 0.12344945665466997}

In [9]:
# Display the class-conditional character probabilities stored under label 's'.
print("θ_s:")
# Left as the bare final expression so the notebook renders the dict.
model.get_conditional_pr_dict()['s']

θ_s:


{'a': 0.10456045141993771,
 'b': 0.008232863618143134,
 'c': 0.03752582405722919,
 'd': 0.039745922111559924,
 'e': 0.1138108599796491,
 'f': 0.00860287996053159,
 'g': 0.0071844839813758445,
 'h': 0.0045327001942585795,
 'i': 0.049859702136844375,
 'j': 0.006629459467793161,
 'k': 0.0002775122567913416,
 'l': 0.052943171656748174,
 'm': 0.02580863988159477,
 'n': 0.054176559464709693,
 'o': 0.07249236841293824,
 'p': 0.02426690512164287,
 'q': 0.007677839104560451,
 'r': 0.05929511886774999,
 's': 0.06577040485954797,
 't': 0.03561407295488884,
 'u': 0.03370232185254849,
 'v': 0.00588942678301625,
 'w': 9.250408559711388e-05,
 'x': 0.0024976103111220747,
 'y': 0.007862847275754679,
 'z': 0.0026826184823163022,
 ' ': 0.16826493170115014}

***
## Problem 4
Treat e10.txt as a test document x. Represent x as a bag-of-words count vector (Hint: the vocabulary has
size 27). Print the bag-of-words vector x.

In [10]:
def get_bagOfCharactersCount(file):
    """Count every vocabulary character in ``data/<file>.txt``.

    Parameters
    ----------
    file : str
        File stem, e.g. ``'e10'``.

    Returns
    -------
    dict
        Keys are 'a'..'z' followed by ' ' (in that order); each value is the
        number of times that character occurs in the file. Characters outside
        this vocabulary are ignored, and absent ones are reported as 0.
    """
    with open('data/'+file+'.txt', 'r') as handle:
        tally = Counter(handle.read())
    # Counter yields 0 for characters that never appeared in the text.
    return {token: tally[token] for token in string.ascii_lowercase + " "}

In [11]:
# Count vector of the test document data/e10.txt over the 27-character vocabulary.
e10_bagOfCharactersCount = get_bagOfCharactersCount('e10')
# Bare final expression so the notebook displays the counts.
e10_bagOfCharactersCount

{'a': 164,
 'b': 32,
 'c': 53,
 'd': 57,
 'e': 311,
 'f': 55,
 'g': 51,
 'h': 140,
 'i': 140,
 'j': 3,
 'k': 6,
 'l': 85,
 'm': 64,
 'n': 139,
 'o': 182,
 'p': 53,
 'q': 3,
 'r': 141,
 's': 186,
 't': 225,
 'u': 65,
 'v': 31,
 'w': 47,
 'x': 4,
 'y': 38,
 'z': 2,
 ' ': 498}

***
## Problem 5
For the x of e10.txt, compute $\hat{p}(x | y)$ for y = e, j, s under the multinomial model assumption, respectively. Use the formula $\hat{p}(x | y) = \prod_{i=1}^{d}(θ_{i,y})^{x_i}$ 
where $x = (x_1,...,x_d)$. Show the three values: $\hat{p}(x | y = e)$, $\hat{p}(x | y = j)$, $\hat{p}(x | y = s)$.
Hint: you may notice that we omitted the multinomial coefficient. This is ok for classification because it is a constant w.r.t. y. Also, store all probabilities here and below in log() internally to avoid underflow. This also means you need to do arithmetic in log space.

In [12]:
# The values displayed by this cell are in log space (see the getter name),
# not raw probabilities.
print("Predicted conditional probabilities of x:")
# predict() is called before the getter; presumably it stores the per-class
# log values that the getter returns -- verify in the NaiveBayes module.
model.predict(e10_bagOfCharactersCount)
model.get_log_pred_conditional_pr_dict()

Predicted conditional probabilities of x:


{'e': -7841.865447060635, 's': -8467.282044010557, 'j': -8771.433079075032}

***
## Problem 6
For the x of e10.txt, use the Bayes rule and your estimated prior and likelihood, compute the posterior $\hat{p}(y | x)$. Show the three values: $\hat{p}(y = e | x)$, $\hat{p}(y = j | x)$, $\hat{p}(y = s | x)$. Show the predicted class label of $x$.

In [13]:
# Score e10 explicitly so this cell does not depend on which document the
# model happened to predict last (cells may be re-run out of order).
model.predict(e10_bagOfCharactersCount)
print("Predicted posterior probabilites of x:")
# NOTE(review): in the saved outputs these values equal the log-likelihoods
# plus log(1/3), i.e. an unnormalised log joint; confirm whether subtracting
# log p(x) (log-sum-exp over the classes) is intended for a true posterior.
model.get_log_pred_posterior_pr_dict()

Predicted posterior probabilites of x:


{'e': -7842.964059349303, 's': -8468.380656299225, 'j': -8772.5316913637}

In [14]:
# Predicted language label for the e10 count vector.
predicted_label = model.predict(e10_bagOfCharactersCount)
print('Predicted class label of x: {}'.format(predicted_label))

Predicted class label of x: e


***
## Problem 7
Evaluate the performance of your classifier on the test set (files 10.txt to 19.txt in three languages).

In [17]:
# Held-out documents 10-19 of every language, grouped language by language
# in the order e, s, j.
test_file_list = [
    language + str(doc_id)
    for language in ('e', 's', 'j')
    for doc_id in range(10, 20)
]

# Print the predicted label of every test document.
for file in test_file_list:
    bagOfCharactersCount = get_bagOfCharactersCount(file)
    print("Prediction for {}: {}".format(file, model.predict(bagOfCharactersCount)))
    

Prediction for e10: e
Prediction for e11: e
Prediction for e12: e
Prediction for e13: e
Prediction for e14: e
Prediction for e15: e
Prediction for e16: e
Prediction for e17: e
Prediction for e18: e
Prediction for e19: e
Prediction for s10: s
Prediction for s11: s
Prediction for s12: s
Prediction for s13: s
Prediction for s14: s
Prediction for s15: s
Prediction for s16: s
Prediction for s17: s
Prediction for s18: s
Prediction for s19: s
Prediction for j10: j
Prediction for j11: j
Prediction for j12: j
Prediction for j13: j
Prediction for j14: j
Prediction for j15: j
Prediction for j16: j
Prediction for j17: j
Prediction for j18: j
Prediction for j19: j


***
## Problem 8
Take a test document. Arbitrarily shuffle the order of its characters so that the words (and spaces) are scrambled beyond human recognition. How does this shuffling affect your Naive Bayes classifier’s prediction on this document?

In [31]:
# Sanity check: shuffling a document must leave every character count unchanged.
j0_shuffled_bagOfCharactersCount = get_bagOfCharactersCount('j0_shuffled')
j0_bagOfCharactersCount = get_bagOfCharactersCount('j0')
entered = False
for char in j0_bagOfCharactersCount:
    if j0_shuffled_bagOfCharactersCount[char] != j0_bagOfCharactersCount[char]:
        print("ERROR: character counts between shuffled and not shuffled files don't match.")
        entered = True
# Logical `not`, not bitwise `~`: ~False is -1 and ~True is -2, both truthy,
# so `if ~entered` would report a match even after a mismatch was found.
if not entered:
    print("File character counts match!")

File character counts match!


In [32]:
# Predicted label for the original document, then for its shuffled copy.
print("Prediction for j0: {}".format(model.predict(j0_bagOfCharactersCount)))
print("Prediction for j0_shuffled: {}".format(model.predict(j0_shuffled_bagOfCharactersCount)))

Prediction for j0: j
Prediction for j0_shuffled: j


In [33]:
# Score the original j0 document first so the getter below refers to it
# (presumably predict() stores the per-class values -- verify in NaiveBayes).
model.predict(j0_bagOfCharactersCount)
print("Predicted posterior probabilites of j0:")
# The displayed values are in log space (see the getter name).
model.get_log_pred_posterior_pr_dict()

Predicted posterior probabilites of j0:


{'e': -4213.009416088309, 's': -4568.17154055695, 'j': -3799.777414208914}

In [34]:
# Score the shuffled j0 document first so the getter below refers to it
# (presumably predict() stores the per-class values -- verify in NaiveBayes).
model.predict(j0_shuffled_bagOfCharactersCount)
print("Predicted posterior probabilites of j0_shuffled:")
# The displayed values are in log space (see the getter name).
model.get_log_pred_posterior_pr_dict()

Predicted posterior probabilites of j0_shuffled:


{'e': -4213.009416088309, 's': -4568.17154055695, 'j': -3799.777414208914}