Morgan Turville-Heitz
CS 760
10/18/2023

In [1]:

import pandas as pd
import numpy as np
import os
import ipykernel
import re


Defining the Doc class, where counts/vectors are stored for each document.

In [2]:
class docRegistry(type):
    """Metaclass that makes a class iterable over its own ``_registry``.

    Any class created with this metaclass must define a class attribute
    ``_registry``; ``for item in TheClass`` then walks that container.
    """

    def __iter__(cls):
        # ``cls`` is the class being iterated (not an instance of it).
        registered = cls._registry
        return iter(registered)
    

In [3]:
class Doc(metaclass=docRegistry):
    """One text document reduced to a character sequence and character counts.

    Every successfully constructed instance is appended to ``Doc._registry``,
    so ``for doc in Doc`` iterates over all loaded documents.

    Parameters
    ----------
    raw : str
        Full text of the document.
    fn : str
        File name; its first character is used as the document's label.
    testset : bool
        Flag stored on the instance; the rest of the notebook uses it to
        separate held-out documents from training documents.
    """
    _registry = []

    # The 27 symbols the model counts. ``vectorize`` and ``count`` must agree
    # on this set, otherwise ``count`` would meet a character it has no slot for.
    ALPHABET = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, raw, fn, testset):
        self.fn = fn
        self.vector = self.vectorize(raw)
        self.char_counts = self.count(self.vector)
        self.label = fn[0]
        self.pe = None
        self.pj = None
        self.ps = None
        self.testset = testset
        # Register last: if anything above raises, no half-initialised Doc
        # is left behind in the shared registry.
        self._registry.append(self)

    def vectorize(self, data):
        """Return ``data`` as a list of characters restricted to ``ALPHABET``.

        The text is lower-cased first; every character outside a-z and space
        (newlines, digits, punctuation, accented letters, ...) is dropped.
        """
        allowed = self.ALPHABET
        return [char for char in data.lower() if char in allowed]

    def count(self, vector):
        """Return a dict mapping each ``ALPHABET`` character to its frequency.

        Characters in ``vector`` that are not part of ``ALPHABET`` are ignored
        rather than raising ``KeyError``.
        """
        count = dict.fromkeys(self.ALPHABET, 0)
        for char in vector:
            if char in count:
                count[char] += 1
        return count


Each language gets its own instance of the `label` class, which holds that language's prior and class conditional probabilities.

Natural log form of the class conditional probability is 
$$\ln(P(c_{i}|e)) = \ln(n(c_{i,e}) + \alpha) - \ln(N_{c} + |c| \alpha)$$

In [4]:
class label(metaclass=docRegistry):
    """Per-language model parameters for the character-level classifier.

    Parameters
    ----------
    prior
        Prior for this language, stored as given (this notebook passes the
        log prior).
    smoothing : float, default 0.5
        Additive smoothing constant (alpha) used in ``class_conditional``.

    ``char_count`` (dict: character -> total count) and ``ntot`` (total number
    of characters) start as ``None`` and must be assigned by the caller before
    ``class_conditional`` is used.
    """
    _registry = []

    # Characters the class conditional distribution is defined over.
    ALPHABET = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, prior, smoothing=0.5):
        self._registry.append(self)
        self.prior = prior
        self.char_count = None
        self.ntot = None
        self.smoothing = smoothing
        self.lncc = None
        self.pcc = None

    def class_conditional(self):
        """Compute, store in ``self.lncc`` and return the smoothed log class conditionals.

        For each character c:
            ln P(c | label) = ln(n(c) + alpha) - ln(N + |c| * alpha)
        where |c| is the alphabet size.
        """
        # The denominator does not depend on the character, so compute it once.
        log_denominator = np.log(self.ntot + self.smoothing * len(self.ALPHABET))
        lncc = {
            char: np.log(self.char_count[char] + self.smoothing) - log_denominator
            for char in self.ALPHABET
        }
        self.lncc = lncc
        return lncc

    def convert_P(self):
        """Exponentiate the log class conditionals; store in ``self.pcc`` and return them.

        If ``class_conditional`` has not been run yet it is run first, so this
        method no longer depends on the caller's cell execution order.
        """
        if self.lncc is None:
            self.class_conditional()
        self.pcc = {char: np.exp(val) for char, val in self.lncc.items()}
        return self.pcc


Loading files, creating the Doc class for each file.

In [5]:
# Folder holding the corpus files. Set the LANGUAGEID_DIR environment variable
# to point somewhere else; otherwise the original location is used.
direc = os.environ.get(
    "LANGUAGEID_DIR",
    r"C:\Users\Meau\Documents\GRAD SCHOOL\CS 760\Hw4\languageID",
)

# Start from an empty registry so that re-running this cell does not register
# every document a second time (which would double all downstream counts).
Doc._registry.clear()

# Sorted only to make the load order reproducible across platforms.
files = sorted(os.listdir(direc))
for txtfile in files:
    path = os.path.join(direc, txtfile)
    number = re.search(r'(\d+)', txtfile)
    if number is None or not os.path.isfile(path):
        # Entry without a number in its name, or not a regular file: skip it
        # instead of crashing on `.group` of None / on open().
        continue
    # Files numbered 10 and above are flagged as test documents.
    testset = int(number.group(1)) > 9
    with open(path, 'r') as file:
        raw = file.read().replace('\n', '')
    Doc(raw, str(txtfile), testset)
print(len(Doc._registry))

60


Calculating the priors, with additive smoothing $$ P(x) = \frac{n(x) + \alpha}{N + k \alpha }, \alpha = \frac{1}{2} $$ 


In [6]:
alpha = 0.5  # additive smoothing constant
k = 3        # number of classes (e, s, j)

# Tally how many non-test documents carry each label.
counts = {"e": 0, "s": 0, "j": 0}
Nd = 0
for doc in Doc:
    if not doc.testset:
        Nd += 1
        counts[doc.label] += 1

# Smoothed log prior: ln((n(label) + alpha) / (N + k * alpha)).
logpriors = {}
for la, n_docs in counts.items():
    logpriors[la] = np.log((n_docs + alpha) / (Nd + k * alpha))

# One `label` object per language, created in the order e, s, j.
e, s, j = (label(logpriors[key]) for key in ("e", "s", "j"))

priors = {la: np.exp(log_prior) for la, log_prior in logpriors.items()}
print(f"Priors for languages are: {priors}")

Priors for languages are: {'e': 0.3333333333333333, 's': 0.3333333333333333, 'j': 0.3333333333333333}


The class conditional probability is $\theta_{i,e} := \hat{p} (c_{i} | y = e)$ for the $i$-th character $c_{i}$.
$\theta_{i,e}$ is the multinomial parameter.

With additive smoothing, this becomes $$ P(c_{i}|e) = \frac{n(c_{i,e}) + \alpha}{N_{c} + |c| \alpha } $$
where $ |c| $ is the number of characters (in this case, 27).
We need this conditional probability so that, combined with the prior, it gives the posterior probability of each language.


In [7]:

def training_char_totals(docs, lang, alphabet='abcdefghijklmnopqrstuvwxyz '):
    """Sum character counts over the training documents of one language.

    Parameters
    ----------
    docs : iterable
        Objects exposing ``testset``, ``label`` and ``char_counts``.
    lang : str
        Label to select (documents with any other label are skipped).
    alphabet : str
        Characters to tally; every key of ``char_counts`` must be in it.

    Returns
    -------
    (dict, int)
        Per-character totals, and the total number of characters counted.
    """
    totals = {char: 0 for char in alphabet}
    for doc in docs:
        # Only training documents of the requested language contribute.
        if doc.testset or doc.label != lang:
            continue
        for char, char_count in doc.char_counts.items():
            totals[char] += char_count
    return totals, sum(totals.values())


# Fill in the counts on each language's `label` object.
for lang, model in (("e", e), ("s", s), ("j", j)):
    model.char_count, model.ntot = training_char_totals(Doc, lang)


Class conditional probabilities (and their natural logs) are calculated in the `label` class. Printing them here.

In [8]:
# (display name, model) pairs in the order the results are reported.
languages = (("English", e), ("Spanish", s), ("Japanese", j))

# Log probabilities first: class_conditional() fills in `lncc`, which
# convert_P() reads afterwards.
for lang_name, model in languages:
    print(f"Class conditional log probabilities for {lang_name} are: {model.class_conditional()}")

for lang_name, model in languages:
    print(f"Class conditional probabilities for {lang_name} are: {model.convert_P()}")

Class conditional log probabilities for English are: {'a': -2.810606128598179, 'b': -4.497664277973426, 'c': -3.839237566117218, 'd': -3.8179601676699333, 'e': -2.2502845413157297, 'f': -3.9668614916110476, 'g': -4.046758776467488, 'h': -3.05301703039592, 'i': -2.8929854465025784, 'j': -6.556547092632226, 'k': -5.590359389613447, 'l': -3.5412402159536045, 'm': -3.8864161263923087, 'n': -2.848663323404553, 'o': -2.7416498677026766, 'p': -4.089236204734604, 'q': -7.484533864269571, 'r': -2.9220255997237423, 's': -2.7153458726599693, 't': -2.5241604049542037, 'u': -3.6244235400558384, 'v': -4.679392538992041, 'w': -4.167144439879508, 'x': -6.7623991468363736, 'y': -4.27987628840365, 'z': -7.373308229159347, ' ': -1.7189740299171268}
Class conditional log probabilities for Spanish are: {'a': -2.2579898923463224, 'b': -4.799621376071712, 'c': -3.2827259414844336, 'd': -3.2252480315327743, 'e': -2.173217331441963, 'f': -4.755658252650596, 'g': -4.93583158090626, 'h': -5.396437447693225, 'i':

In [9]:
def generate_latex_table(e, s, j):
    """Build the LaTeX source of a table of per-character class conditionals.

    Parameters
    ----------
    e, s, j
        Objects exposing ``lncc`` and ``pcc`` dicts keyed by the characters
        a-z and space (log probabilities and probabilities respectively).

    Returns
    -------
    str
        A ``table`` environment with one row per character: the three log
        values followed by the three probabilities, each to four decimals.
        The string has no trailing newline.
    """
    models = (e, s, j)

    header = [
        "\\begin{table}[h!]",
        "\\centering",
        "\\begin{tabular}{c|ccc|ccc}",
        "\\toprule",
        "Character & $\\ln(P_e)$ & $\\ln(P_s)$ & $\\ln(P_j)$ & $P_e$ & $P_s$ & $P_j$ \\\\",
        "\\midrule",
    ]

    rows = []
    for char in 'abcdefghijklmnopqrstuvwxyz ':
        cells = [char]
        cells.extend(f"{model.lncc[char]:.4f}" for model in models)
        cells.extend(f"{model.pcc[char]:.4f}" for model in models)
        # Each row ends with the LaTeX line break `\\`.
        rows.append(" & ".join(cells) + " \\\\")

    footer = [
        "\\bottomrule",
        "\\end{tabular}",
        "\\caption{Log class conditional probabilities and conditional probabilities for each character.}",
        "\\end{table}",
    ]

    return "\n".join(header + rows + footer)


In [10]:
# Build the per-character table for the three language models and print its
# LaTeX source. Requires `lncc` and `pcc` to be populated on e, s and j.
latex_output = generate_latex_table(e, s, j)
print(latex_output)

\begin{table}[h!]
\centering
\begin{tabular}{c|ccc|ccc}
\toprule
Character & $\ln(P_e)$ & $\ln(P_s)$ & $\ln(P_j)$ & $P_e$ & $P_s$ & $P_j$ \\
\midrule
a & -2.8106 & -2.2580 & -2.0267 & 0.0602 & 0.1046 & 0.1318 \\
b & -4.4977 & -4.7996 & -4.5220 & 0.0111 & 0.0082 & 0.0109 \\
c & -3.8392 & -3.2827 & -5.2056 & 0.0215 & 0.0375 & 0.0055 \\
d & -3.8180 & -3.2252 & -4.0613 & 0.0220 & 0.0397 & 0.0172 \\
e & -2.2503 & -2.1732 & -2.8100 & 0.1054 & 0.1138 & 0.0602 \\
f & -3.9669 & -4.7557 & -5.5523 & 0.0189 & 0.0086 & 0.0039 \\
g & -4.0468 & -4.9358 & -4.2679 & 0.0175 & 0.0072 & 0.0140 \\
h & -3.0530 & -5.3964 & -3.4495 & 0.0472 & 0.0045 & 0.0318 \\
i & -2.8930 & -2.9985 & -2.3327 & 0.0554 & 0.0499 & 0.0970 \\
j & -6.5565 & -5.0162 & -6.0571 & 0.0014 & 0.0066 & 0.0023 \\
k & -5.5904 & -8.1896 & -2.8575 & 0.0037 & 0.0003 & 0.0574 \\
l & -3.5412 & -2.9385 & -6.5483 & 0.0290 & 0.0529 & 0.0014 \\
m & -3.8864 & -3.6570 & -3.2239 & 0.0205 & 0.0258 & 0.0398 \\
n & -2.8487 & -2.9155 & -2.8698 & 0.0579 & 0

$$ \hat{e} = \arg\max_{e} \, p(e|c) $$
$$ \hat{e} = \arg\max_{e} \frac{ p(c|e) p(e) }{p(c)}$$

In [11]:
# Locate the held-out document used for the worked example in the next cells.
target_fn = 'e10.txt'
matches = [doc for doc in Doc if doc.testset and doc.fn == target_fn]
if not matches:
    # Fail here rather than letting later cells hit an undefined (or stale,
    # from an earlier run) `xhat` / `test_doc`.
    raise LookupError(f"Test document {target_fn!r} was not found in the Doc registry.")

# If the file was registered more than once, keep the last match.
test_doc = matches[-1]
xhat = test_doc.char_counts
print(xhat)

{'a': 164, 'b': 32, 'c': 53, 'd': 57, 'e': 311, 'f': 55, 'g': 51, 'h': 140, 'i': 140, 'j': 3, 'k': 6, 'l': 85, 'm': 64, 'n': 139, 'o': 182, 'p': 53, 'q': 3, 'r': 141, 's': 186, 't': 225, 'u': 65, 'v': 31, 'w': 47, 'x': 4, 'y': 38, 'z': 2, ' ': 498}


$$\hat{p}(X|y) = \prod_{i=1}^{d} \theta_{i,y}^{x_{i}}$$
$$\log(\hat{p}(X|y)) = \sum_{i=1}^{d} \log(\theta_{i,y}^{x_{i}})$$
$$\log(\hat{p}(X|Y = y)) = \sum_{i = a}^{\text{space}} x_{i} \log(\theta_{i, Y=y})$$

In [12]:
#print(f"As a reminder, class conditionals are the following \n e: {e.pcc}, s: {s.pcc}, j: {j.pcc}")

# Log-likelihood of the test document under each language:
#   log p(x | y) = sum over characters of count(c) * log(theta_{c, y})
class_conditionals = {'e': e.pcc, 'j': j.pcc, 's': s.pcc}
log_likelihood = {lang: 0 for lang in class_conditionals}

for char, val in xhat.items():
    for lang, pcc in class_conditionals.items():
        log_likelihood[lang] += val * np.log(pcc[char])

lconditional_e = log_likelihood['e']
lconditional_j = log_likelihood['j']
lconditional_s = log_likelihood['s']

# NOTE: exponentiating a very negative log-likelihood underflows to 0.0 in
# double precision, so compare the log values rather than these.
conditional_e = np.exp(lconditional_e)
conditional_j = np.exp(lconditional_j)
conditional_s = np.exp(lconditional_s)
print(f"For test document e10.txt, log conditional probabilities are: \n For y = e, {lconditional_e} \n For y = j, {lconditional_j} \n For y = s, {lconditional_s}")
print(f"For test document e10.txt, conditional probabilities are: \n For y = e, {conditional_e:.6e} \n For y = j, {conditional_j:.6e} \n For y = s, {conditional_s:.6e}")


For test document e10.txt, log conditional probabilities are: 
 For y = e, -7841.865447060635 
 For y = j, -8771.433079075032 
 For y = s, -8467.282044010557
For test document e10.txt, conditional probabilities are: 
 For y = e, 0.000000e+00 
 For y = j, 0.000000e+00 
 For y = s, 0.000000e+00


$$p(Y=y|x) = \frac{p(x|Y=y)p(Y=y)}{p(x)}$$
$$log(p(Y=y|x)) = log(p(x|Y=y)) + log(p(Y=y)) - log(p(x))$$
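Omitting $p(x)$: it is the same for every class, so it does not change the argmax, and dropping it avoids computing a quantity that underflows.

$$p(Y=y|x) \propto p(x|Y=y)p(Y=y)$$
$$\log(p(Y=y|x)) = \log(p(x|Y=y)) + \log(p(Y=y)) + \text{const}$$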



In [13]:
print(f"Log priors are {logpriors}")
print(f"Log conditionals for e10.txt are e: {lconditional_e}, j: {lconditional_j}, s: {lconditional_s}")
print(f"Omitting p(x) for stability: ")
# Unnormalised log posteriors: log p(x | y) + log p(y).
lposteriori_e = logpriors['e'] + lconditional_e
lposteriori_j = logpriors['j'] + lconditional_j
lposteriori_s = logpriors['s'] + lconditional_s

# The exponentiated values below are unnormalised and underflow to 0.0 when
# the log values are very negative; the normalised posteriors follow.
print(f"log(p(Y=e | x = e10.txt)) is {lposteriori_e}")
print(f"p(Y=e | x = e10.txt) is {np.exp(lposteriori_e)}")
print(f"log(p(Y=s | x = e10.txt)) is {lposteriori_s}")
print(f"p(Y=s | x = e10.txt) is {np.exp(lposteriori_s)}")
print(f"log(p(Y=j | x = e10.txt)) is {lposteriori_j}")
print(f"p(Y=j | x = e10.txt) is {np.exp(lposteriori_j)}")

print(f"Finding the normalization factor manually:")
# log p(x) = log sum_y exp(log p(x, y)). Subtracting the largest term before
# exponentiating (log-sum-exp trick) keeps the sum from underflowing.
unnormalized = np.array([lposteriori_e, lposteriori_s, lposteriori_j])
shift = unnormalized.max()
log_px = shift + np.log(np.sum(np.exp(unnormalized - shift)))
print(f"log(p(x = e10.txt)) is {log_px}")
for lang, lpost in (('e', lposteriori_e), ('s', lposteriori_s), ('j', lposteriori_j)):
    print(f"Normalized p(Y={lang} | x = e10.txt) is {np.exp(lpost - log_px)}")


Log priors are {'e': -1.0986122886681098, 's': -1.0986122886681098, 'j': -1.0986122886681098}
Log conditionals for e10.txt are e: -7841.865447060635, j: -8771.433079075032, s: -8467.282044010557
Omitting p(x) for stability: 
log(p(Y=e | x = e10.txt)) is -7842.964059349303
p(Y=e | x = e10.txt) is 0.0
log(p(Y=s | x = e10.txt)) is -8468.380656299225
p(Y=s | x = e10.txt) is 0.0
log(p(Y=j | x = e10.txt)) is -8772.5316913637
p(Y=j | x = e10.txt) is 0.0
Finding the normalization factor manually:


Generating the confusion matrix.

In [14]:
### Confusion matrix: outer key is the true label, inner key is the predicted label
predictions = {true: {guess: 0 for guess in 'esj'} for true in 'esj'}
label_models = {'e': e, 's': s, 'j': j}

for doc in Doc:
    # Only the held-out documents are scored.
    if not doc.testset:
        continue

    lab = doc.label
    ### Character-count representation of the document
    xhat = doc.char_counts

    ### log p(x | y) = sum over characters of count * log(p(c | y))
    lconditional = {key: 0 for key in label_models}
    for char, val in xhat.items():
        for key, model in label_models.items():
            lconditional[key] += val * np.log(model.pcc[char])

    ### Unnormalised log posterior = log likelihood + log prior
    lposteriori = {key: lconditional[key] + logpriors[key] for key in lconditional}

    ### Predict the label with the largest log posterior
    pred = max(lposteriori, key=lposteriori.get)
    predictions[lab][pred] += 1

print(predictions)

{'e': {'e': 10, 's': 0, 'j': 0}, 's': {'e': 0, 's': 10, 'j': 0}, 'j': {'e': 0, 's': 0, 'j': 10}}
