In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [2]:
input_files = [
    'assets/edgar_allan_poe.txt',
    'assets/robert_frost.txt'
]

In [3]:
# each line is going to be treated as one data point

In [4]:
# collect data into lists
input_texts = []
labels = []
for label, f in enumerate(input_files):
    print(f'{f} correspnds to label {label}')
    for line in open(f):
        line = line.rstrip().lower()
        if line:
            # remove punctuation
            line = line.translate(str.maketrans('', '', string.punctuation))
            input_texts.append(line)
            labels.append(label)

assets/edgar_allan_poe.txt correspnds to label 0
assets/robert_frost.txt correspnds to label 1


In [5]:
train_text, test_text, Ytrain, Ytest = train_test_split(input_texts, labels)

In [6]:
len(Ytrain), len(Ytest)

(1618, 540)

In [7]:
train_text[:5]

['i struck the hand off brittle on the floor',
 'call her nausicaa and take a timber',
 'oh hasten oh let us not linger',
 'they eddy over it too toppling weak',
 'here he is now this box put it away']

In [8]:
Ytrain[:5]

[1, 1, 0, 1, 1]

In [9]:
idx = 1
word2idx = {'<unk>' : 0}

In [10]:
# populate word2idx
for text in train_text:
    tokens = text.split()
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = idx
            idx += 1

In [11]:
len(word2idx)

2512

In [12]:
# convert data into integer format
train_text_int = []
test_text_int = []
for text in train_text:
    tokens = text.split()
    line_as_int = [ word2idx[token] for token in tokens ]
    train_text_int.append(line_as_int)

for text in test_text:
    tokens = text.split()
    line_as_int = [ word2idx.get(token, 0) for token in tokens ]
    test_text_int.append(line_as_int)

In [13]:
train_text_int[100:105]

[[60, 110, 43, 296, 381, 382, 55, 12, 383],
 [12, 384, 97, 385, 43, 3, 386, 387],
 [388, 3, 200, 389, 183, 390, 391, 47, 93],
 [72, 392, 92, 393, 12, 394],
 [30, 395, 162, 396, 50, 3, 217, 22, 203]]

In [18]:
# initialize A and pi
# one for each category

V = len(word2idx)

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [19]:
# compute counts for A and pi
def compute_counts(text_as_int, A, pi):
    for tokens in text_as_int:
        last_idx = None
        for idx in tokens:
            if last_idx is None:
                # it's the first word in a sentence
                pi[idx] += 1
            else:
                # it's a transition
                A[last_idx, idx] += 1
            # update last idx
            last_idx = idx

In [20]:
compute_counts([ t for t, y in zip(train_text_int, Ytrain) if y == 0 ], A0, pi0)
compute_counts([ t for t, y in zip(train_text_int, Ytrain) if y == 1 ], A1, pi1)

In [21]:
# normalizing A and pi to represent probabilities
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [22]:
# log A and log pi
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [23]:
# compute priors
count0 = sum(y == 0 for y in Ytrain)
count1 = sum(y == 1 for y in Ytrain)
total = len(Ytrain)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.34239802224969096, 0.657601977750309)

In [24]:
# build a classifier
class Classifier:
    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors) # number of class

    def _compute_log_likelihood(self, input_, class_):
        logA = self.logAs[class_]
        logpi = self.logpis[class_]

        last_idx = None
        logprob = 0
        for idx in input_:
            if last_idx is None:
                # it's the first token
                logprob += logpi[idx]
            else:
                # it's a transition
                logprob += logA[last_idx, idx]
            last_idx = idx
        return logprob

    def predict(self, inputs):
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            posteriors = [ self._compute_log_likelihood(input_, c) + self.logpriors[c] \
                            for c in range(self.K) ]
            pred = np.argmax(posteriors)
            predictions[i] = pred
        return predictions

In [26]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [27]:
Ptrain = clf.predict(train_text_int)
print(f'Train acc: {np.mean(Ptrain == Ytrain)}')

Train acc: 0.9932014833127317


In [29]:
Ptest = clf.predict(test_text_int)
print(f'Test acc: {np.mean(Ptest == Ytest)}')

Test acc: 0.8333333333333334


In [30]:
from sklearn.metrics import confusion_matrix, f1_score

In [31]:
cm = confusion_matrix(Ytest, Ptest)
cm

array([[ 90,  78],
       [ 12, 360]], dtype=int64)

In [32]:
f1_score(Ytest, Ptest)

0.8888888888888888