In [68]:
import os
import string
from math import log, exp
import pandas as pd

In [69]:
### 3.1 Obtaining priors for language classes

path_data_lang = "./data/languageID"
lang_classes = ['e', 'j', 's']
lang_num = len(lang_classes)
lang_str = ['english','japanese', 'spanish']
lang_classes_to_str = { lang_classes[i]: lang_str[i] for i in range(len(lang_classes))} 
lang_classes_obs = { c:0 for c in lang_classes }
obs_total = 0

for entry in sorted(os.listdir(path_data_lang)):
    entry_name = entry.split(sep=".")[0]
    entry_lang_classes = entry_name[0]
    entry_num = int(entry_name[1:])
    if entry_num >= 10:
        continue
    
    lang_classes_obs[entry_lang_classes] += 1
    obs_total += 1
    # print(f'entry: {entry}, entry_name: {entry_name}, entry_lang_classes: {entry_lang_classes}, entry_num: {entry_num}')


prob_prior = { c: o/obs_total for c, o in lang_classes_obs.items() }
smoothing_param = 1/2
prob_prior_smoothed = { c: (o + smoothing_param)/(obs_total + lang_num*smoothing_param) for c, o in lang_classes_obs.items() }
prob_prior_smoothed_log = { k: log(v) for k, v in prob_prior_smoothed.items() }
print(f'prob_prior: {prob_prior}')
print(f'prob_prior_smoothed: {prob_prior_smoothed}')
print(f'prob_prior_smoothed_log: {prob_prior_smoothed_log}')

prob_prior: {'e': 0.3333333333333333, 'j': 0.3333333333333333, 's': 0.3333333333333333}
prob_prior_smoothed: {'e': 0.3333333333333333, 'j': 0.3333333333333333, 's': 0.3333333333333333}
prob_prior_smoothed_log: {'e': -1.0986122886681098, 'j': -1.0986122886681098, 's': -1.0986122886681098}


In [70]:
### 3.2 Obtaining estimates for class conditional probability
char_classes = [ c for c in string.ascii_lowercase ]
char_classes.append(" ")
char_classes_set = { c for c in char_classes }
char_class_num = len(char_classes)

char_counts_by_lang = {}
for l in lang_classes:
    char_counts = { c:0 for c in char_classes }
    char_counts[" "] = 0
    char_counts_by_lang[l] = char_counts

# for l in char_counts_by_lang:
#     print(f'{l}: {char_counts_by_lang[l]}')
# print(char_counts_by_lang)
num_obs = 0

# Determine these counts for each training file
for entry in sorted(os.listdir(path_data_lang)):
    entry_name = entry.split(sep=".")[0]
    entry_lang_classes = entry_name[0]
    entry_num = int(entry_name[1:])
    if entry_num >= 10:
        continue

    # if not entry_lang_classes == 'e':
    #     continue

    # print(f'Collecting character class observations from {entry_name}')

    path_sample = path_to_data_lang + "/" + entry
    with open(path_sample) as f:
        file_text = f.read()
        # print(file_text)
        for c in file_text:
            if not c in char_classes_set:
                continue
            char_counts_by_lang[entry_lang_classes][c] += 1
            num_obs += 1
    # print(char_counts)
    # for l in char_counts_by_lang:
    #     print(f'{l}: {char_counts_by_lang[l]}')
    # print(num_obs)

In [71]:
(1 + 0.5) / (num_obs + 0.5*char_class_num)

3.2872749586351235e-05

In [72]:
### 3.2 Continued

# We have collected per-character estimated probs for each lang class
prob_char_est = { l: {c:((b + smoothing_param)/(num_obs + smoothing_param*char_class_num)) for c,b in char_counts_by_lang[l].items() } for l in lang_classes}

prob_char_est_log = { l: {k:log(v) for k,v in prob_char_est[l].items()} for l in lang_classes}

print(f'char_counts_by_lang:')
for l in lang_classes:
    print(f'{l}: {char_counts_by_lang[l]}')

print(f'prob_char_est:')
for l in lang_classes:
    print(f'{l}: {prob_char_est[l]}')

print(f'prob_char_est_log:')
for l in lang_classes:
    print(f'{l}: {prob_char_est_log[l]}')

char_counts_by_lang:
e: {'a': 910, 'b': 168, 'c': 325, 'd': 332, 'e': 1594, 'f': 286, 'g': 264, 'h': 714, 'i': 838, 'j': 21, 'k': 56, 'l': 438, 'm': 310, 'n': 876, 'o': 975, 'p': 253, 'q': 8, 'r': 814, 's': 1001, 't': 1212, 'u': 403, 'v': 140, 'w': 234, 'x': 17, 'y': 209, 'z': 9, ' ': 2712}
j: {'a': 1885, 'b': 155, 'c': 78, 'd': 246, 'e': 861, 'f': 55, 'g': 200, 'h': 454, 'i': 1388, 'j': 33, 'k': 821, 'l': 20, 'm': 569, 'n': 811, 'o': 1304, 'p': 12, 'q': 1, 'r': 612, 's': 603, 't': 815, 'u': 1010, 'v': 3, 'w': 282, 'x': 0, 'y': 202, 'z': 110, ' ': 1766}
s: {'a': 1695, 'b': 133, 'c': 608, 'd': 644, 'e': 1845, 'f': 139, 'g': 116, 'h': 73, 'i': 808, 'j': 107, 'k': 4, 'l': 858, 'm': 418, 'n': 878, 'o': 1175, 'p': 393, 'q': 124, 'r': 961, 's': 1066, 't': 577, 'u': 546, 'v': 95, 'w': 1, 'x': 40, 'y': 127, 'z': 43, ' ': 2728}
prob_char_est:
e: {'a': 0.0199537589989152, 'b': 0.0036927055368667886, 'c': 0.007133386660238218, 'd': 0.007286792824974524, 'e': 0.03494373281029136, 'f': 0.0062786951

In [73]:
df_3_3 = pd.DataFrame(prob_char_est)
df_3_3

Unnamed: 0,e,j,s
a,0.019954,0.041321,0.037157
b,0.003693,0.003408,0.002926
c,0.007133,0.00172,0.013335
d,0.007287,0.005402,0.014124
e,0.034944,0.01888,0.040444
f,0.006279,0.001216,0.003057
g,0.005797,0.004394,0.002553
h,0.015658,0.00996,0.001611
i,0.018376,0.030429,0.017718
j,0.000471,0.000734,0.002356


In [74]:
### 3.4 Representing e10.txt as a bag-of-words count vector x
bag_of_words = { c:0 for c in char_classes }
path_sample = path_to_data_lang + "/" + "e10.txt"
with open(path_sample) as f:
    file_text = f.read()
    # print(file_text)
    for c in file_text:
        if not c in char_classes_set:
            continue
        bag_of_words[c] += 1
        # num_obs += 1
print(bag_of_words)

print('<', end='')
print(bag_of_words[char_classes[0]], end='')
for i in range(1,len(char_classes)):
    c = char_classes[i]
    print(f', {bag_of_words[c]}', end='')
print('>')

# pd.DataFrame(bag_of_words.items())

{'a': 164, 'b': 32, 'c': 53, 'd': 57, 'e': 311, 'f': 55, 'g': 51, 'h': 140, 'i': 140, 'j': 3, 'k': 6, 'l': 85, 'm': 64, 'n': 139, 'o': 182, 'p': 53, 'q': 3, 'r': 141, 's': 186, 't': 225, 'u': 65, 'v': 31, 'w': 47, 'x': 4, 'y': 38, 'z': 2, ' ': 498}
<164, 32, 53, 57, 311, 55, 51, 140, 140, 3, 6, 85, 64, 139, 182, 53, 3, 141, 186, 225, 65, 31, 47, 4, 38, 2, 498>


In [75]:
### 3.5 Compute \hat p(x | y) for y=e,j,s under the multinomial assumption. Basically, compute the probability of the vector having arisen from each of the languages

x = bag_of_words
# We log the provided formula to avoid underflow
prob_document = [ sum(x[c]*prob_char_est_log[l][c] for c in char_classes) for l in lang_classes]
prob_document # <-- these values are logprobabilities, not normal probabilities
# print([exp(v) for v in prob_document]) # <-- this would make them normal probabilities again

[-10904.720647602408, -11989.469322201065, -11338.321401996694]

In [76]:
### 3.6 obtain estimates \hat p(y | x) by using Bayes' rule
# We must obtain the probabilities p(c) for individual characters so that we can calculate p(x)
char_counts_all_lang = { c: sum(char_counts_by_lang[l][c] for l in lang_classes) for c in char_classes}

print(f'char_counts_by_lang:')
for l in lang_classes:
    print(f'{l}: {char_counts_by_lang[l]}')

print(char_counts_all_lang)
num_char_all = sum(v for c,v in char_counts_all_lang.items())
# num_char_all
prob_char_prior = { c: (v + smoothing_param)/(num_char_all + smoothing_param * char_class_num) for c,v in char_counts_all_lang.items() }
print(prob_char_prior)

prob_char_prior_log = { c: log(v) for c,v in prob_char_prior.items() }
print(prob_char_prior_log)

char_counts_by_lang:
e: {'a': 910, 'b': 168, 'c': 325, 'd': 332, 'e': 1594, 'f': 286, 'g': 264, 'h': 714, 'i': 838, 'j': 21, 'k': 56, 'l': 438, 'm': 310, 'n': 876, 'o': 975, 'p': 253, 'q': 8, 'r': 814, 's': 1001, 't': 1212, 'u': 403, 'v': 140, 'w': 234, 'x': 17, 'y': 209, 'z': 9, ' ': 2712}
j: {'a': 1885, 'b': 155, 'c': 78, 'd': 246, 'e': 861, 'f': 55, 'g': 200, 'h': 454, 'i': 1388, 'j': 33, 'k': 821, 'l': 20, 'm': 569, 'n': 811, 'o': 1304, 'p': 12, 'q': 1, 'r': 612, 's': 603, 't': 815, 'u': 1010, 'v': 3, 'w': 282, 'x': 0, 'y': 202, 'z': 110, ' ': 1766}
s: {'a': 1695, 'b': 133, 'c': 608, 'd': 644, 'e': 1845, 'f': 139, 'g': 116, 'h': 73, 'i': 808, 'j': 107, 'k': 4, 'l': 858, 'm': 418, 'n': 878, 'o': 1175, 'p': 393, 'q': 124, 'r': 961, 's': 1066, 't': 577, 'u': 546, 'v': 95, 'w': 1, 'x': 40, 'y': 127, 'z': 43, ' ': 2728}
{'a': 4490, 'b': 456, 'c': 1011, 'd': 1222, 'e': 4300, 'f': 480, 'g': 580, 'h': 1241, 'i': 3034, 'j': 161, 'k': 881, 'l': 1316, 'm': 1297, 'n': 2565, 'o': 3454, 'p': 658

In [90]:
### 3.6 continued

path_sample = path_to_data_lang + "/" + "e10.txt"
file_text = ''
with open(path_sample) as f:
    file_text = f.read()

def pred_label(file_text):

    bag_of_words = { c:0 for c in char_classes }

    # for a string file_text, obtain probability of observing sequence of chars and obtain the bag of words
    p_x_log = 0
    for c in file_text:
        if not c in char_classes_set:
            continue
        p_x_log += prob_char_prior_log[c]
        bag_of_words[c] += 1

    # then, for each language, obtain the probability that a document is of a language class given the string
    p_y_probs = []
    for l in lang_classes:
        p_x_given_y_log = sum(bag_of_words[c]*prob_char_est_log[l][c] for c in char_classes)
        p_y_log = prob_prior_smoothed_log[l]
        p_y_given_x_log = p_x_given_y_log + p_y_log - p_x_log
        # print(f'{p_y_given_x_log} = {p_x_given_y_log} + {p_y_log} - {p_x_log}')
        p_y_probs.append(p_y_given_x_log)
        # print(p_y_given_x_log)
    
    # for p in p_y_probs:
    #     print(p)
    
    highest_prob = max(enumerate(p_y_probs), key = lambda x: x[1])
    return lang_classes[highest_prob[0]]

pred_label(file_text)

'e'

In [96]:
### 3.7 Evaluate the performance of our classifier on the test set (files 10-19.txt in the three languages) and present the performance using a confusion matrix

confusion_matrix = { l_true:{ l_pred:0 for l_pred in lang_classes} for l_true in lang_classes }

for entry in sorted(os.listdir(path_data_lang)):
    entry_name = entry.split(sep=".")[0]
    entry_lang_classes = entry_name[0]
    entry_num = int(entry_name[1:])
    if entry_num < 10:
        continue

    path_sample = path_to_data_lang + "/" + entry
    file_text = ''
    with open(path_sample) as f:
        file_text = f.read()
    
    prediction = pred_label(file_text)
    # print(f'entry_lang_classes: {entry_lang_classes}, prediction={prediction}')
    confusion_matrix[entry_lang_classes][prediction] += 1

In [97]:
df_confusion_matrix = pd.DataFrame(confusion_matrix)
df_confusion_matrix

Unnamed: 0,e,j,s
e,10,0,0
j,0,10,0
s,0,0,10
