# Homework 4 - Dario Placencio

In [4]:
# import the necessary packages
import numpy as np
import pandas as pd
import os 

### 3 - Language Identification with Naive Bayes

Use files 0.txt to 9.txt in each language as the training data.
Estimate the prior probabilities 
$\hat p(y=e)$,
$\hat p(y=j)$,
$\hat p(y=s)$
using additive smoothing with parameter $\frac{1}{2}$. 
Give the formula for additive smoothing with parameter $\frac{1}{2}$ in this case. 
Print and include in final report the prior probabilities.
(Hint: Store all probabilities here and below in $\log()$ internally to avoid underflow. This also means you need to do arithmetic in log-space.  But answer questions with probability, not log probability.)


In [3]:
# Adjust the extract_path to point directly to the languageID folder
extract_path = 'languageID'

In [5]:
# Read the contents of the training files
def read_files(lang, start, end):
    """Read files for a given language between start and end indices."""
    contents = ""
    for i in range(start, end + 1):
        with open(os.path.join(extract_path, f"{lang}{i}.txt"), 'r', errors='ignore') as f:
            contents += f.read()
    return contents

# Extract training data for each language
english_train = read_files('e', 0, 9)
spanish_train = read_files('s', 0, 9)
japanese_train = read_files('j', 0, 9)

len(english_train), len(spanish_train), len(japanese_train)

(15339, 16500, 15496)

In [10]:
# Given data and smoothing parameters
N_y = 10  # Number of English training documents
N = 30   # Total number of training documents
alpha = 0.5  # Smoothing parameter
V = 3  # Number of classes (languages)

# Compute smoothed prior probability for English
p_e_smoothed = (N_y + alpha) / (N + alpha * V)
p_e_smoothed

0.3333333333333333

In [11]:
# Store as log probabilities
p_e_smoothed = np.log(p_e_smoothed)

In [12]:
# Compute smoothed prior probability for Spanish
p_s_smoothed = (N_y + alpha) / (N + alpha * V)
p_s_smoothed

0.3333333333333333

In [13]:
# Store as log probabilities
p_s_smoothed = np.log(p_s_smoothed)

In [8]:
# Compute smoothed prior probability for Japanese
p_j_smoothed = (N_y + alpha) / (N + alpha * V)
p_j_smoothed

0.3333333333333333

In [14]:
# Store as log probabilities
p_j_smoothed = np.log(p_j_smoothed)

Using the same training data, estimate the class conditional probability (multinomial parameter) for English $\theta_{i,e} := \hat p(c_i \mid y=e)$ where $c_i$ is the $i$-th character. That is, $c_1 = a, \ldots, c_{26} = z, c_{27} = space$. 

Again use additive smoothing with parameter $\frac{1}{2}$. Give the formula for additive smoothing with parameter $\frac{1}{2}$ in this case. 

Print $\theta_e$ and include in final report which is a vector with 27 elements.

In [16]:
from collections import defaultdict

# Function to count the frequency of each character in a given text
def char_frequencies(text):
    freq = defaultdict(int)
    for char in text:
        # Only consider lowercase alphabets and space
        if char in 'abcdefghijklmnopqrstuvwxyz ':
            freq[char] += 1
    return freq

# Assuming english_train contains the combined content of English training files
english_freq = char_frequencies(english_train)

# Compute the total number of characters in the English training data
total_english_chars = sum(english_freq.values())

# Vocabulary
vocabulary = 'abcdefghijklmnopqrstuvwxyz '

# Smoothing parameter
alpha = 0.5

# Compute the class conditional probabilities for English
theta_e = {}
for char in vocabulary:
    theta_e[char] = (english_freq[char] + alpha) / (total_english_chars + alpha * len(vocabulary))

print(theta_e)

{'a': 0.0601685114819098, 'b': 0.011134974392863043, 'c': 0.021509995043779945, 'd': 0.021972575582355856, 'e': 0.1053692383941847, 'f': 0.018932760614571286, 'g': 0.017478936064761277, 'h': 0.047216256401784236, 'i': 0.055410540227986124, 'j': 0.001420783082768875, 'k': 0.0037336857756484387, 'l': 0.028977366595076822, 'm': 0.020518751032545846, 'n': 0.057921691723112505, 'o': 0.06446390219725756, 'p': 0.01675202378985627, 'q': 0.0005617049396993227, 'r': 0.053824549810011564, 's': 0.06618205848339666, 't': 0.08012555757475633, 'u': 0.026664463902197257, 'v': 0.009284652238559392, 'w': 0.015496448042293078, 'x': 0.001156451346439782, 'y': 0.013844374690236246, 'z': 0.0006277878737815959, ' ': 0.1792499586981662}


In [17]:
# Store as log probabilities
for char in vocabulary:
    theta_e[char] = np.log(theta_e[char])

Print $\theta_j, \theta_s$ and include in final report the class conditional probabilities for Japanese and Spanish.

In [18]:
# Compute character frequencies for Japanese and Spanish
japanese_freq = char_frequencies(japanese_train)
spanish_freq = char_frequencies(spanish_train)

# Compute the total number of characters in the Japanese and Spanish training data
total_japanese_chars = sum(japanese_freq.values())
total_spanish_chars = sum(spanish_freq.values())

# Compute the class conditional probabilities for Japanese
theta_j = {}
for char in vocabulary:
    theta_j[char] = (japanese_freq[char] + alpha) / (total_japanese_chars + alpha * len(vocabulary))

# Compute the class conditional probabilities for Spanish
theta_s = {}
for char in vocabulary:
    theta_s[char] = (spanish_freq[char] + alpha) / (total_spanish_chars + alpha * len(vocabulary))

print("Theta_j:", theta_j)
print("Theta_s:", theta_s)

Theta_j: {'a': 0.1317656102589189, 'b': 0.010866906600510151, 'c': 0.005485866033054963, 'd': 0.01722631818022992, 'e': 0.06020475907613823, 'f': 0.003878542227191726, 'g': 0.014011670568503443, 'h': 0.03176211607673224, 'i': 0.09703343932352633, 'j': 0.0023411020650616725, 'k': 0.05740941332681086, 'l': 0.001432614696530277, 'm': 0.03979873510604843, 'n': 0.05671057688947902, 'o': 0.09116321324993885, 'p': 0.0008735455466648031, 'q': 0.00010482546559977637, 'r': 0.04280373178657535, 's': 0.0421747789929767, 't': 0.056990111464411755, 'u': 0.07061742199238269, 'v': 0.0002445927530661449, 'w': 0.01974212935462455, 'x': 3.4941821866592126e-05, 'y': 0.01415143785596981, 'z': 0.00772214263251686, ' ': 0.12344945665466997}
Theta_s: {'a': 0.10456045141993771, 'b': 0.008232863618143134, 'c': 0.03752582405722919, 'd': 0.039745922111559924, 'e': 0.1138108599796491, 'f': 0.00860287996053159, 'g': 0.0071844839813758445, 'h': 0.0045327001942585795, 'i': 0.049859702136844375, 'j': 0.006629459467793

In [19]:
# Store as log probabilities
for char in vocabulary:
    theta_j[char] = np.log(theta_j[char])
    theta_s[char] = np.log(theta_s[char])

Treat e10.txt as a test document $x$. Represent $x$ as a bag-of-words count vector (Hint: the vocabulary has size 27).

Print the bag-of-words vector $x$ and include in final report.

In [20]:
# Read the content of e10.txt
with open('languageID/e10.txt', 'r', errors='ignore') as f:
    test_content = f.read()

# Compute character frequencies for the test content
test_freq = char_frequencies(test_content)

# Create the bag-of-words vector
bow_vector = [test_freq[char] for char in vocabulary]

print("Bag-of-Words Vector:", bow_vector)

Bag-of-Words Vector: [164, 32, 53, 57, 311, 55, 51, 140, 140, 3, 6, 85, 64, 139, 182, 53, 3, 141, 186, 225, 65, 31, 47, 4, 38, 2, 498]


Compute $\hat p(x \mid y)$ for $y=e, j, s$ under the multinomial model assumption, respectively. Use the formula
$\hat p(x \mid y) = \prod_{i=1}^d \theta_{i, y}^{x_i}$ where $x=(x_1, \ldots, x_d)$.

Show the three values: $\hat p(x \mid y=e), \hat p(x \mid y=j), \hat p(x \mid y=s)$.

Hint: you may notice that we omitted the multinomial coefficient.  This is ok for classification because it is a constant w.r.t. $y$.

In [29]:
import math

# Function to compute the log-probability log(p(x|y)) for a given language and its log(theta) values
def compute_log_probability(bow_vector, log_theta_values):
    log_prob = 0  
    for xi, char in zip(bow_vector, vocabulary):
        log_prob += xi * log_theta_values[char]
    return log_prob

# Compute log(p(x|y)) for each language
log_p_x_given_e = compute_log_probability(bow_vector, theta_e)
log_p_x_given_j = compute_log_probability(bow_vector, theta_j)
log_p_x_given_s = compute_log_probability(bow_vector, theta_s)

print("log(p(x|y=e)):", log_p_x_given_e)
print("log(p(x|y=j)):", log_p_x_given_j)
print("log(p(x|y=s)):", log_p_x_given_s)

log(p(x|y=e)): -7841.865447060635
log(p(x|y=j)): -8771.433079075032
log(p(x|y=s)): -8467.282044010557


In [30]:
# Convert the log probabilities back to regular probabilities
p_x_given_e = math.exp(log_p_x_given_e)
p_x_given_j = math.exp(log_p_x_given_j)
p_x_given_s = math.exp(log_p_x_given_s)

print("p(x|y=e):", p_x_given_e)
print("p(x|y=j):", p_x_given_j)
print("p(x|y=s):", p_x_given_s)

p(x|y=e): 0.0
p(x|y=j): 0.0
p(x|y=s): 0.0


Use Bayes rule and your estimated prior and likelihood, compute the posterior $\hat p(y \mid x)$.

Show the three values: $\hat p(y=e \mid x), \hat p(y=j \mid x), \hat p(y=s \mid x)$.

Show the predicted class label of $x$.

In [35]:
# Compute log posterior for each language using the log priors
log_posterior_e = log_p_x_given_e + p_e_smoothed
log_posterior_j = log_p_x_given_j + p_j_smoothed
log_posterior_s = log_p_x_given_s + p_s_smoothed

# Normalize log posteriors by subtracting the max value among them
max_log_posterior = max(log_posterior_e, log_posterior_j, log_posterior_s)

# Convert normalized log posteriors back to regular probabilities
posterior_e = math.exp(log_posterior_e - max_log_posterior)
posterior_j = math.exp(log_posterior_j - max_log_posterior)
posterior_s = math.exp(log_posterior_s - max_log_posterior)

# Normalize the posteriors to sum to 1
normalizing_factor = posterior_e + posterior_j + posterior_s
normalized_posterior_e = posterior_e / normalizing_factor
normalized_posterior_j = posterior_j / normalizing_factor
normalized_posterior_s = posterior_s / normalizing_factor

# Determine the predicted class label
predicted_class_label = max([('e', normalized_posterior_e), ('j', normalized_posterior_j), ('s', normalized_posterior_s)], key=lambda x: x[1])[0]

print("p(y=e|x):", normalized_posterior_e)
print("p(y=j|x):", normalized_posterior_j)
print("p(y=s|x):", normalized_posterior_s)
print("Predicted class label:", predicted_class_label)

p(y=e|x): 1.0
p(y=j|x): 0.0
p(y=s|x): 2.4267389118368303e-272
Predicted class label: e


Evaluate the performance of your classifier on the test set (files 10.txt to 19.txt in three languages).

Present the performance using a confusion matrix. A confusion matrix summarizes the types of errors your classifier makes, as shown in the table below.   The columns are the true language a document is in, and the rows are the classified outcome of that document.  The cells are the number of test documents in that situation.  For example, the cell with row = English and column = Spanish contains the number of test documents that are really Spanish, but misclassified as English by your classifier.

In [38]:
def predict_language(document):
    bow_vector = [document.count(char) for char in vocabulary]
    log_p_x_given_e = compute_log_probability(bow_vector, theta_e)
    log_p_x_given_j = compute_log_probability(bow_vector, theta_j)
    log_p_x_given_s = compute_log_probability(bow_vector, theta_s)
    log_posterior_e = log_p_x_given_e + p_e_smoothed
    log_posterior_j = log_p_x_given_j + p_j_smoothed
    log_posterior_s = log_p_x_given_s + p_s_smoothed
    predicted_class_label = max([('e', log_posterior_e), ('j', log_posterior_j), ('s', log_posterior_s)], key=lambda x: x[1])[0]
    return predicted_class_label

# Confusion matrix evaluation
languages = ['e', 'j', 's']
confusion_matrix = {lang1: {lang2: 0 for lang2 in languages} for lang1 in languages}

base_path = "languageID/"  
for lang in languages:
    for i in range(10, 20):
        filename = os.path.join(base_path, f"{lang}{i}.txt")
        with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
            document = file.read().lower()
        predicted_label = predict_language(document)
        confusion_matrix[predicted_label][lang] += 1  # Rows: Predicted, Columns: True

print(confusion_matrix)

{'e': {'e': 10, 'j': 0, 's': 0}, 'j': {'e': 0, 'j': 10, 's': 0}, 's': {'e': 0, 'j': 0, 's': 10}}
