## Language Identification with Naive Bayes 

### 3.2 Estimate the class conditional probability (multinomial parameter) for English $$\theta_{i,e} := \hat p(c_i \mid y=e)$$ 

In [3]:
from collections import Counter, defaultdict
import string
import os
import tarfile

extracted_subdir_path = r"C:\\Users\\Nicor\\OneDrive\\Documents\\Sem 9\\ECE 760\\HW4\\languageID\\languageID"
extracted_files = os.listdir(extracted_subdir_path)
extracted_files.sort()

# Function to calculate the class conditional probabilities with additive smoothing
def calculate_class_conditional_probabilities(files, class_label, alpha, vocabulary_size, path):
    character_counts = Counter()
    total_characters = 0

    # Process each file
    for file in files:
        # Filter by class label and training set (0.txt to 9.txt)
        if file[0] == class_label and file[1] in '0123456789':
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                content = f.read().lower()
                # Count characters ignoring non-alphabetic and non-space characters
                filtered_content = ''.join(filter(lambda x: x in string.ascii_lowercase or x == ' ', content))
                character_counts.update(filtered_content)
                total_characters += len(filtered_content)

    # Calculate probabilities
    conditional_probabilities = {}
    for char in string.ascii_lowercase + ' ':
        conditional_probabilities[char] = (character_counts[char] + alpha) / (total_characters + alpha * vocabulary_size)

    return conditional_probabilities

# Vocabulary size (26 letters + space)
vocabulary_size = 26 + 1
alpha = 0.5  # smoothing parameter

# Calculate class conditional probabilities for English
theta_e = calculate_class_conditional_probabilities(extracted_files, 'e', alpha, vocabulary_size, extracted_subdir_path)
theta_j = calculate_class_conditional_probabilities(extracted_files, 'j', alpha, vocabulary_size, extracted_subdir_path)
theta_s = calculate_class_conditional_probabilities(extracted_files, 's', alpha, vocabulary_size, extracted_subdir_path)
print(theta_j)
print(theta_s)


{'a': 0.13212646167171935, 'b': 0.00992637505413599, 'c': 0.005214378518839324, 'd': 0.016370723256821134, 'e': 0.059991338241663056, 'f': 0.0034820268514508443, 'g': 0.014880900822867043, 'h': 0.031476829796448676, 'i': 0.09896925075790386, 'j': 0.0020961455175400605, 'k': 0.05718492854049372, 'l': 0.001195322650498051, 'm': 0.04083152880034647, 'n': 0.056942399307059334, 'o': 0.090376786487657, 'p': 0.0007102641836292768, 'q': 5.1970550021654394e-05, 'r': 0.04259852750108272, 's': 0.04259852750108272, 't': 0.05798181030749242, 'u': 0.07028150714595063, 'v': 0.00019055868341273278, 'w': 0.020285838025119098, 'x': 1.7323516673884798e-05, 'y': 0.013841489822433954, 'z': 0.007743611953226505, ' ': 0.12263317453443048}
{'a': 0.10653656355302873, 'b': 0.009598032483191465, 'c': 0.03674720922433205, 'd': 0.040484213552230225, 'e': 0.11234968139642589, 'f': 0.007362217928038712, 'g': 0.007202516888384944, 'h': 0.0047750610856476675, 'i': 0.05000239551559481, 'j': 0.006723413769423639, 'k': 0

### 3.4 Treat e10.txt as a test document $x$. Represent $x$ as a bag-of-words count vector. Print the bag-of-words vector $x$

In [4]:
# Function to create a bag-of-words vector for a given document
def create_bag_of_words_vector(file_path, vocabulary):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read().lower()
        # Count characters ignoring non-alphabetic and non-space characters
        filtered_content = ''.join(filter(lambda x: x in string.ascii_lowercase or x == ' ', content))
    
    character_counts = Counter(filtered_content)

    # Create vector based on the vocabulary
    vector = [character_counts[char] for char in vocabulary]
    return vector

# Vocabulary (a to z and space)
vocabulary = string.ascii_lowercase + ' '

# Create the bag-of-words vector for e10.txt
file_path = os.path.join(extracted_subdir_path, 'e10.txt')
bag_of_words_vector_e10 = create_bag_of_words_vector(file_path, vocabulary)
bag_of_words_vector_e10


[164,
 32,
 53,
 57,
 311,
 55,
 51,
 140,
 140,
 3,
 6,
 85,
 64,
 139,
 182,
 53,
 3,
 141,
 186,
 225,
 65,
 31,
 47,
 4,
 38,
 2,
 498]

### 3.5 Compute $\hat p(x \mid y)$ for $y=e, j, s$ under the multinomial model assumption, respectively. Use the formula $$\hat p(x \mid y) = \prod_{i=1}^d \theta_{i, y}^{x_i}$$

In [5]:
#Function to compute log probability of x given y
import math

def compute_log_probability_x_given_y(bag_of_words_vector, theta_y):
    log_prob = sum([count * math.log(theta_y[char]) for count, char in zip(bag_of_words_vector, string.ascii_lowercase + ' ')])
    return log_prob

# Now compute the log probabilities
log_p_x_given_e = compute_log_probability_x_given_y(bag_of_words_vector_e10, theta_e)
log_p_x_given_j = compute_log_probability_x_given_y(bag_of_words_vector_e10, theta_j)
log_p_x_given_s = compute_log_probability_x_given_y(bag_of_words_vector_e10, theta_s)

log_p_x_given_e, log_p_x_given_j, log_p_x_given_s


(-7841.662478537944, -8818.782947292133, -8421.593490399442)

### 3.6 Use Bayes rule and your estimated prior and likelihood, compute the posterior $\hat p(y \mid x)$. Show the three values: $\hat p(y=e \mid x), \hat p(y=j \mid x), \hat p(y=s \mid x)$. Show the predicted class label of $x$.

In [8]:
# Bayes rule: p(y|x) = p(x|y)*p(y)/p(x) 
import math
from collections import Counter, defaultdict

# Re-defining prior probabilities and log probabilities
prior_probabilities = {'e': 0.333, 'j': 0.333, 's': 0.333}

# Log prior probabilities
log_priors = {key: math.log(value) for key, value in prior_probabilities.items()}

# Log of the normalizing factor with log-sum-exp trick
# Find the maximum of the log probabilities (log_p_x_given_y + log_priors[y])
max_log_prob = max(log_p_x_given_e + log_priors['e'], log_p_x_given_j + log_priors['j'], log_p_x_given_s + log_priors['s'])

# Compute the log-sum-exp
log_normalizing_factor = max_log_prob + math.log(
    sum(math.exp(log_p_x_given_y + log_priors[y] - max_log_prob)
        for log_p_x_given_y, y in zip([log_p_x_given_e, log_p_x_given_j, log_p_x_given_s], ['e', 'j', 's']))
)

# Recompute the log posterior probabilities
log_p_y_e_given_x = log_p_x_given_e + log_priors['e'] - log_normalizing_factor
log_p_y_j_given_x = log_p_x_given_j + log_priors['j'] - log_normalizing_factor
log_p_y_s_given_x = log_p_x_given_s + log_priors['s'] - log_normalizing_factor

# Convert log posterior probabilities back to normal probabilities for easier interpretation
p_y_e_given_x = math.exp(log_p_y_e_given_x)
p_y_j_given_x = math.exp(log_p_y_j_given_x)
p_y_s_given_x = math.exp(log_p_y_s_given_x)

# Posterior probabilities and predicted class
posterior_probabilities = {
    'e': p_y_e_given_x,
    'j': p_y_j_given_x,
    's': p_y_s_given_x
}
predicted_class = max(posterior_probabilities, key=posterior_probabilities.get)

posterior_probabilities, predicted_class


({'e': 1.0, 'j': 0.0, 's': 1.3777222237663097e-252}, 'e')

### 3.7 Evaluate the performance of your classifier on the test set and create a confusion matrix. 

In [9]:
# Predict language and compute confusion matrix
# Function to predict the class of a given document
def predict_language(file_path, theta_e, theta_j, theta_s, priors, vocabulary):
    # Create bag-of-words vector for the document
    bag_of_words_vector = create_bag_of_words_vector(file_path, vocabulary)

    # Compute log probabilities for each class
    log_p_x_given_e = compute_log_probability_x_given_y(bag_of_words_vector, theta_e)
    log_p_x_given_j = compute_log_probability_x_given_y(bag_of_words_vector, theta_j)
    log_p_x_given_s = compute_log_probability_x_given_y(bag_of_words_vector, theta_s)

    # Compute the log posterior probabilities using log-sum-exp trick
    max_log_prob = max(log_p_x_given_e + log_priors['e'], log_p_x_given_j + log_priors['j'], log_p_x_given_s + log_priors['s'])
    log_normalizing_factor = max_log_prob + math.log(
        sum(math.exp(log_p_x_given_y + log_priors[y] - max_log_prob)
            for log_p_x_given_y, y in zip([log_p_x_given_e, log_p_x_given_j, log_p_x_given_s], ['e', 'j', 's']))
    )
    
    log_posterior_probabilities = {
        'e': log_p_x_given_e + log_priors['e'] - log_normalizing_factor,
        'j': log_p_x_given_j + log_priors['j'] - log_normalizing_factor,
        's': log_p_x_given_s + log_priors['s'] - log_normalizing_factor
    }
    predicted_class = max(log_posterior_probabilities, key=log_posterior_probabilities.get)

    return predicted_class

# Confusion matrix setup
confusion_matrix = {
    'e': {'e': 0, 'j': 0, 's': 0},
    'j': {'e': 0, 'j': 0, 's': 0},
    's': {'e': 0, 'j': 0, 's': 0}
}

for file in extracted_files:
    if file[1] in '10111213141516171819':
        true_label = file[0]
        file_path = os.path.join(extracted_subdir_path, file)
        predicted_label = predict_language(file_path, theta_e, theta_j, theta_s, prior_probabilities, vocabulary)
        confusion_matrix[predicted_label][true_label] += 1

confusion_matrix

{'e': {'e': 20, 'j': 0, 's': 0},
 'j': {'e': 0, 'j': 20, 's': 0},
 's': {'e': 0, 'j': 0, 's': 20}}