In [1]:
# Data sources (save each file into the local `datasets/` directory before running):
# https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
# https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

## Importing libraries

In [2]:
import numpy as np
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

## Preparing data

In [3]:
# Each corpus file is one class; a file's position in this list is its label.
input_files = [
  'datasets/edgar_allan_poe.txt',
  'datasets/robert_frost.txt',
]

input_txts = []
labels = []

# Translation table that deletes every character in string.punctuation.
punctuation_remover = str.maketrans('', '', string.punctuation)

for label, data in enumerate(input_files):
    print(f'{data} belongs to the labels {label}')
    with open(data, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Trim and lowercase first, then drop punctuation.
            line = raw_line.strip().lower().translate(punctuation_remover)
            if not line:
                # Nothing left after cleaning: skip the line entirely.
                continue
            input_txts.append(line)
            labels.append(label)

datasets/edgar_allan_poe.txt belongs to the labels 0
datasets/robert_frost.txt belongs to the labels 1


In [4]:
input_txts[:5], labels[:5]

(['lo death hath reard himself a throne',
  'in a strange city all alone',
  'far down within the dim west',
  'where the good and the bad and the worst and the best',
  'have gone to their eternal rest'],
 [0, 0, 0, 0, 0])

In [5]:
input_txts[-5:], labels[-5:]

(['to say which buds are leaf and which are bloom',
  'a featherhammer gives a double knock',
  'this eden day is done at two oclock',
  'an hour of winter day might seem too short',
  'to make it worth lifes while to wake and sport'],
 [1, 1, 1, 1, 1])

In [6]:
len(input_txts)

2154

In [7]:
# Fix the seed so the split -- and everything derived from it (vocabulary,
# counts, scores) -- is identical on every Restart & Run All.
# NOTE: outputs recorded in this notebook came from an unseeded run and will
# change once this cell is re-executed with the seed.
X_train, X_test, y_train, y_test = train_test_split(
    input_txts, labels, test_size=0.2, shuffle=True, random_state=42
)

In [8]:
X_train[:5], y_train[:5]

(['ill find that fountain if it takes all summer',
  'in youth have i known one with whom the earth',
  'they made him keep on gnawing till he whined',
  'i wonder why he doesnt marry her',
  'nor the ghoulhaunted woodland of weir'],
 [1, 0, 1, 1, 0])

## Word-to-Index

In [9]:
# Index 0 is reserved for unknown words; real tokens are numbered from 1
# in order of first appearance in the training lines.
word2idx = {'<unk>': 0}
idx = 1

for text in X_train:
    for token in text.split():
        if token in word2idx:
            # Already assigned an index.
            continue
        word2idx[token] = idx
        idx += 1

In [10]:
word2idx

{'<unk>': 0,
 'ill': 1,
 'find': 2,
 'that': 3,
 'fountain': 4,
 'if': 5,
 'it': 6,
 'takes': 7,
 'all': 8,
 'summer': 9,
 'in': 10,
 'youth': 11,
 'have': 12,
 'i': 13,
 'known': 14,
 'one': 15,
 'with': 16,
 'whom': 17,
 'the': 18,
 'earth': 19,
 'they': 20,
 'made': 21,
 'him': 22,
 'keep': 23,
 'on': 24,
 'gnawing': 25,
 'till': 26,
 'he': 27,
 'whined': 28,
 'wonder': 29,
 'why': 30,
 'doesnt': 31,
 'marry': 32,
 'her': 33,
 'nor': 34,
 'ghoulhaunted': 35,
 'woodland': 36,
 'of': 37,
 'weir': 38,
 'pearly': 39,
 'lustre': 40,
 'moon': 41,
 'went': 42,
 'out': 43,
 'hes': 44,
 'sold': 45,
 'his': 46,
 'farm': 47,
 'or': 48,
 'creaking': 49,
 'a': 50,
 'buggy': 51,
 'load': 52,
 'grain': 53,
 'son': 54,
 'we': 55,
 'think': 56,
 'had': 57,
 'grave': 58,
 'down': 59,
 'cellar': 60,
 'their': 61,
 'office': 62,
 'is': 63,
 'to': 64,
 'illumine': 65,
 'and': 66,
 'enkindle': 67,
 'last': 68,
 'night': 69,
 'was': 70,
 'nights': 71,
 'shes': 72,
 'kiting': 73,
 'know': 74,
 'some': 75,


## Converting data to integer format

In [11]:
# Training lines: every token is in the vocabulary (it was built from X_train),
# so a plain lookup is used.
X_train_int = [
    [word2idx[token] for token in text.split()]
    for text in X_train
]

# Test lines: tokens missing from the vocabulary map to 0, the '<unk>' index.
X_test_int = [
    [word2idx.get(token, 0) for token in text.split()]
    for text in X_test
]

In [12]:
X_train_int[:5]

[[1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24, 25, 26, 27, 28],
 [13, 29, 30, 27, 31, 32, 33],
 [34, 18, 35, 36, 37, 38]]

In [13]:
X_test_int[:5]

[[20, 313, 50, 0, 46, 579, 0, 121, 89],
 [128, 5, 27, 313, 153, 13, 379, 1325],
 [140, 13, 379, 158, 30, 6, 459, 12, 64, 1823],
 [13, 379, 157, 158, 22, 1099, 650, 178],
 [427, 0, 580, 0, 6, 10, 18, 69]]

## Calculating A and pi

In [14]:
# Per-class count tables, all starting at 1 (add-one smoothing) rather than 0.
N = len(word2idx)  # vocabulary size, including '<unk>'

# class 0: transition counts (N x N) and first-word counts (N)
A0, pi0 = np.ones((N, N)), np.ones(N)

# class 1: same shapes
A1, pi1 = np.ones((N, N)), np.ones(N)

In [15]:
def compute_counts(text_as_int, A, pi):
    """Accumulate first-word and word-to-word transition counts in place.

    Args:
        text_as_int: iterable of sequences of integer word indices.
        A: 2-D array; ``A[i, j]`` is incremented for each adjacent pair (i, j).
        pi: 1-D array; ``pi[i]`` is incremented for each sequence starting with i.

    Returns:
        None. ``A`` and ``pi`` are modified in place.
    """
    for tokens in text_as_int:
        token_iter = iter(tokens)
        try:
            previous = next(token_iter)
        except StopIteration:
            # Empty sequence: nothing to count.
            continue

        # The first word of the sequence feeds the initial-state counts.
        pi[previous] += 1

        # Every following word counts one transition from its predecessor.
        for current in token_iter:
            A[previous, current] += 1
            previous = current


# Split the encoded training lines by class label, then count each class
# into its own (A, pi) pair.
train_class0 = [tokens for tokens, label in zip(X_train_int, y_train) if label == 0]
train_class1 = [tokens for tokens, label in zip(X_train_int, y_train) if label == 1]

compute_counts(train_class0, A0, pi0)
compute_counts(train_class1, A1, pi1)

In [16]:
# Turn counts into probabilities in place: each row of A and each pi
# is divided by its own sum so that it adds up to 1.
for A, pi in ((A0, pi0), (A1, pi1)):
    A /= A.sum(axis=1, keepdims=True)
    pi /= pi.sum()

In [17]:
# Keep only the logarithms; the classifier adds log-probabilities
# instead of multiplying probabilities.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)

## Compute priors

In [18]:
# Class priors are the share of training lines carrying each label.
total = len(y_train)
count0 = sum(1 for y in y_train if y == 0)
count1 = sum(1 for y in y_train if y == 1)

p0, p1 = count0 / total, count1 / total
logp0, logp1 = np.log(p0), np.log(p1)

p0, p1

(0.3441671503192107, 0.6558328496807894)

## Build a Classifier

In [19]:
class Classifier:
    """Chooses the class whose model gives a sequence the highest log-posterior.

    ``logAs``, ``logpis`` and ``logpriors`` are parallel collections indexed
    by class: entry ``c`` holds the log transition matrix, the log
    initial-word vector and the log prior of class ``c``.
    """

    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors)  # number of classes

    def _compute_log_likelihood(self, input_, class_):
        """Return the log-likelihood of one index sequence under class ``class_``.

        The first index is scored with the class's ``logpi``; every later
        index is scored with ``logA[previous, current]``. An empty sequence
        scores 0.
        """
        logA = self.logAs[class_]
        logpi = self.logpis[class_]

        logprob = 0
        previous = None
        for current in input_:
            # No predecessor yet means this is the first token.
            logprob += logpi[current] if previous is None else logA[previous, current]
            previous = current

        return logprob

    def predict(self, inputs):
        """Return a float array holding the predicted class index of each input."""
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            # Log-posterior (up to a constant) = log-likelihood + log-prior.
            scores = [
                self._compute_log_likelihood(input_, c) + self.logpriors[c]
                for c in range(self.K)
            ]
            predictions[i] = np.argmax(scores)
        return predictions

In [20]:
# The position in each list is the class label (0, then 1), so all three
# lists must follow the same class order.
clf = Classifier(
    logAs=[logA0, logA1],
    logpis=[logpi0, logpi1],
    logpriors=[logp0, logp1],
)

## Evaluation

**Accuracy**

In [21]:
# Share of training lines whose predicted class equals the true label.
train_output = clf.predict(X_train_int)
train_acc = np.mean(train_output == y_train)
print(f"Train acc: {train_acc}")

Train acc: 0.995937318630296


In [22]:
# Share of held-out lines whose predicted class equals the true label.
test_output = clf.predict(X_test_int)
test_acc = np.mean(test_output == y_test)
print(f"Test acc: {test_acc}")

Test acc: 0.8538283062645011


**Confusion Matrix**

In [23]:
# Rows are true labels, columns are predicted labels.
train_cm = confusion_matrix(y_true=y_train, y_pred=train_output)
train_cm

array([[ 586,    7],
       [   0, 1130]], dtype=int64)

In [24]:
# Rows are true labels, columns are predicted labels.
test_cm = confusion_matrix(y_true=y_test, y_pred=test_output)
test_cm

array([[ 68,  57],
       [  6, 300]], dtype=int64)

**F1-score**

In [25]:
# With f1_score's defaults this is the F1 of label 1 only.
f1_score(y_true=y_train, y_pred=train_output)

0.9969122187913542

In [26]:
# With f1_score's defaults this is the F1 of label 1 only.
f1_score(y_true=y_test, y_pred=test_output)

0.9049773755656109