Morgan Turville-Heitz
CS 760
10/18/2023

In [60]:

import pandas as pd
import numpy as np
import os
import ipykernel

Defining the Doc class, where counts/vectors are stored for each document.

In [61]:
class docRegistry(type):
    """Metaclass that makes a class itself iterable over its ``_registry``.

    Any class created with this metaclass must define a ``_registry``
    attribute; ``for obj in TheClass`` then walks that collection.
    """

    def __iter__(cls):
        """Return an iterator over the objects stored in ``cls._registry``."""
        registered_instances = cls._registry
        return iter(registered_instances)
    

In [62]:
class Doc(metaclass=docRegistry):
    """One text document reduced to character counts over a 27-symbol alphabet.

    Every successfully constructed instance is appended to the class-level
    ``_registry`` list, so ``for doc in Doc`` visits all loaded documents.

    Attributes:
        fn: File name the document was loaded from.
        vector: List of the retained characters, in document order.
        char_counts: Dict mapping each alphabet symbol to its count.
        label: First character of ``fn``, used as the language label.
        pe, pj, ps: Initialised to ``None`` here; presumably filled in later
            with per-language scores (not set anywhere in this class).
    """

    _registry = []
    # The only symbols that are counted: 26 lowercase ASCII letters + space.
    _ALPHABET = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, raw, fn):
        """Build the character vector and counts for one document.

        Args:
            raw: Full text of the document.
            fn: File name; its first character becomes ``self.label``.
        """
        self.fn = fn
        self.vector = self.vectorize(raw)
        self.char_counts = self.count(self.vector)
        self.label = fn[0]
        self.pe = None
        self.pj = None
        self.ps = None
        # Register last so a failure above never leaves a half-built
        # document in the shared registry.
        self._registry.append(self)

    def vectorize(self, data):
        """Return the characters of ``data`` that belong to the alphabet.

        Text is lower-cased first, then everything that is not one of a-z or
        a space (newlines, digits, punctuation, non-ASCII letters) is dropped.
        Filtering on the alphabet itself, rather than ``str.isalpha()``,
        guarantees that ``count`` never meets a symbol it has no key for.
        """
        allowed = self._ALPHABET
        return [char for char in data.lower() if char in allowed]

    def count(self, vector):
        """Count occurrences of each alphabet symbol in ``vector``.

        Returns:
            Dict with one key per alphabet symbol (zero when absent).

        Raises:
            KeyError: If ``vector`` contains a symbol outside the alphabet.
        """
        count = {char: 0 for char in self._ALPHABET}
        for char in vector:
            count[char] += 1
        return count


For each language, I create a separate instance of the `label` class.

Natural log form of the class conditional probability is 
$$\ln(P(c_{i}|e)) = \ln(n(c_{i,e}) + \alpha) - \ln(N_{c} + |c| \alpha)$$

In [63]:
class label():
    """Per-language model: a prior plus smoothed character likelihoods.

    ``char_count`` and ``ntot`` start as ``None`` and must be assigned by the
    caller before the likelihoods can be computed.

    Attributes:
        prior: Prior for this language, stored exactly as passed in.
        char_count: Dict mapping each alphabet symbol to its total count.
        ntot: Total number of counted characters for this language.
        smoothing: Additive-smoothing constant (alpha).
        lncc: Dict of natural-log class-conditional probabilities.
        pcc: Dict of class-conditional probabilities (``exp`` of ``lncc``).
    """

    # Symbols the model assigns probability to; its length is |c| in the
    # smoothing formula (27).
    _ALPHABET = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, prior, smoothing=0.5):
        """Store the prior and smoothing constant; counts are set later."""
        self.prior = prior
        self.char_count = None
        self.ntot = None
        self.smoothing = smoothing
        self.lncc = None
        self.pcc = None

    def class_conditional(self):
        """Compute ln P(c_i | language) with additive smoothing.

        Implements ln(n(c_i) + alpha) - ln(N + |c| * alpha) for every symbol,
        stores the result in ``self.lncc`` and returns it.

        Raises:
            ValueError: If ``char_count`` or ``ntot`` has not been assigned.
        """
        if self.char_count is None or self.ntot is None:
            raise ValueError(
                "char_count and ntot must be set before calling class_conditional()"
            )
        # The denominator is the same for every symbol, so take its log once.
        log_denominator = np.log(self.ntot + self.smoothing * len(self._ALPHABET))
        self.lncc = {
            char: np.log(self.char_count[char] + self.smoothing) - log_denominator
            for char in self._ALPHABET
        }
        return self.lncc

    def convert_P(self):
        """Exponentiate ``self.lncc`` into probabilities.

        If the log-probabilities have not been computed yet they are computed
        first, so this method is safe to call on its own. The result is
        stored in ``self.pcc`` and returned.
        """
        if self.lncc is None:
            self.class_conditional()
        self.pcc = {char: np.exp(val) for char, val in self.lncc.items()}
        return self.pcc


Loading files, creating the Doc class for each file.

In [64]:
# Directory holding one text file per document. The original location is the
# default; set the LANGUAGEID_DIR environment variable to run elsewhere.
direc = os.environ.get(
    "LANGUAGEID_DIR",
    r"C:\Users\Meau\Documents\GRAD SCHOOL\CS 760\Hw4\languageID",
)

# Start from an empty registry so re-running this cell does not register every
# file a second time (which would silently double all counts downstream).
Doc._registry.clear()

# os.listdir order is unspecified; sort so the registry order is reproducible.
files = sorted(os.listdir(direc))
for txtfile in files:
    path = os.path.join(direc, txtfile)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as text; skip them.
        continue
    # Decode explicitly rather than with the platform default. Undecodable
    # bytes become U+FFFD, which Doc filters out as a non-alphabet character.
    with open(path, 'r', encoding='utf-8', errors='replace') as file:
        raw = file.read()
    Doc(raw, str(txtfile))

Calculating the priors, with additive smoothing $$ P(x) = \frac{n(x) + \alpha}{N + k \alpha }, \alpha = \frac{1}{2} $$ 


In [65]:
# Additive-smoothing settings for the prior: alpha is the pseudo-count and
# k is the number of language classes.
alpha = 0.5
k = 3

# Tally how many registered documents carry each language label.
# NOTE(review): every Doc in the registry is counted, so any held-out files
# that were loaded also contribute here; confirm that is intended.
counts = {"e": 0, "s": 0, "j": 0}
for doc in Doc:
    counts[doc.label] += 1
Nd = sum(counts.values())

# ln P(lang) = ln((n(lang) + alpha) / (Nd + k * alpha))
logpriors = {
    la: np.log((n_docs + alpha) / (Nd + k * alpha))
    for la, n_docs in counts.items()
}

# One model object per language, each seeded with its log prior.
e, s, j = (label(logpriors[la]) for la in ("e", "s", "j"))

priors = {la: np.exp(log_p) for la, log_p in logpriors.items()}
print(f"Priors for languages are: {priors}")

Priors for languages are: {'e': 0.3333333333333333, 's': 0.3333333333333333, 'j': 0.3333333333333333}


The class-conditional probability is $\theta_{i,e} := \hat{p}(c_{i} \mid y = e)$ for the $i$-th character $c_{i}$.
$\theta_{i,e}$ is the multinomial parameter.

With additive smoothing, this becomes $$ P(c_{i}|e) = \frac{n(c_{i,e}) + \alpha}{N_{c} + |c| \alpha } $$
where $ |c| $ is the number of characters (in this case, 27).
We need these class-conditional probabilities so that, combined with the prior, we can compute the posterior.


In [None]:

def aggregate_char_counts(docs, lang):
    """Sum per-character counts over every document labelled ``lang``.

    Args:
        docs: Iterable of objects exposing ``label`` and ``char_counts``.
        lang: Label to select (compared against ``doc.label``).

    Returns:
        ``(ctot, ntot)``: a dict of total counts per alphabet symbol, and
        the total number of characters across the selected documents.
    """
    ctot = {char: 0 for char in 'abcdefghijklmnopqrstuvwxyz '}
    for doc in docs:
        if doc.label != lang:
            continue
        for char, char_count in doc.char_counts.items():
            ctot[char] += char_count
    # Every counted character lands in exactly one bucket of ctot, so the
    # grand total is simply the sum of the buckets.
    ntot = sum(ctot.values())
    return ctot, ntot


# Fill each language model with its aggregated counts; one helper call per
# language replaces three copies of the same loop.
for lang_code, lang_model in (("e", e), ("s", s), ("j", j)):
    ctot, ntot = aggregate_char_counts(Doc, lang_code)
    lang_model.char_count = ctot
    lang_model.ntot = ntot


Class-conditional probabilities (and their natural logs) are calculated in the `label` class and printed here.

In [67]:
# Natural-log class-conditional probabilities, one language at a time.
# Each call also caches its result on the model as `lncc`.
english_log_probs = e.class_conditional()
print(f"Class conditional log probabilities for English are: {english_log_probs}")
spanish_log_probs = s.class_conditional()
print(f"Class conditional log probabilities for Spanish are: {spanish_log_probs}")
japanese_log_probs = j.class_conditional()
print(f"Class conditional log probabilities for Japanese are: {japanese_log_probs}")

# The same quantities converted back to plain probabilities.
english_probs = e.convert_P()
print(f"Class conditional probabilities for English are: {english_probs}")
spanish_probs = s.convert_P()
print(f"Class conditional probabilities for Spanish are: {spanish_probs}")
japanese_probs = j.convert_P()
print(f"Class conditional probabilities for Japanese are: {japanese_probs}")

Class conditional log probabilities for English are: {'a': -2.7865811234361626, 'b': -4.427169857735544, 'c': -3.8260913630557525, 'd': -3.8106767996545665, 'e': -2.244746660701587, 'f': -3.912573312696918, 'g': -4.134482622472095, 'h': -3.0668898205344055, 'i': -2.9055378849476314, 'j': -7.0579966882142875, 'k': -5.523423654744361, 'l': -3.501712297068451, 'm': -3.8094029141319776, 'n': -2.8572266038846204, 'o': -2.7322368688297782, 'p': -4.110112399868934, 'q': -7.277050254276972, 'r': -2.9849148350826447, 's': -2.752382065092994, 't': -2.485315823756136, 'u': -3.662828373869477, 'v': -4.690360910179029, 'w': -4.168536092112369, 'x': -6.726219295897282, 'y': -4.313461753777671, 'z': -7.557952639743375, ' ': -1.717861102231387}
Class conditional log probabilities for Spanish are: {'a': -2.239267033006646, 'b': -4.646197151180955, 'c': -3.3036929956807963, 'd': -3.2068431696908783, 'e': -2.1861391161676735, 'f': -4.911394042719514, 'g': -4.933324746213484, 'h': -5.3443485123254755, 'i'

$$ \hat{e} = \arg\max_{e} \, p(e|c) $$
$$ \hat{e} = \arg\max_{e} \frac{ p(c|e) \, p(e) }{p(c)}$$