In [27]:
# Download the two poetry corpora used below (one text file per author).
# The -nc flag is passed to wget for both downloads.
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt


--2024-07-05 15:57:53--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: ‘edgar_allan_poe.txt’


2024-07-05 15:57:53 (117 MB/s) - ‘edgar_allan_poe.txt’ saved [26622/26622]

--2024-07-05 15:57:53--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving t

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

In [29]:
# Order matters: the position of a file in this list becomes its class label
# when the files are enumerated in the loading cell (Poe -> 0, Frost -> 1).
input_files = ['edgar_allan_poe.txt', 'robert_frost.txt']
# Peek at the first lines of the Poe corpus.
!head 'edgar_allan_poe.txt'

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [30]:

# Peek at the first lines of the Frost corpus.
!head 'robert_frost.txt'

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [31]:
# Collect every usable line of each corpus into `input_texts`, with the index of
# its source file (its position in `input_files`) as the class label in `labels`.
input_texts = []
labels = []
# Build the punctuation-deletion table once instead of once per line.
punctuation_table = str.maketrans('', '', string.punctuation)
for label, files in enumerate(input_files):
  print(f'Reading {files} corresponding to label {label}')
  # `with` closes the file handle as soon as the file has been read.
  with open(files) as corpus:
    for line in corpus:
      # Iterating a file yields lines that still end in '\n'; rstrip() drops it
      # (and any other trailing whitespace) before lower-casing.
      line = line.rstrip().lower()
      line = line.translate(punctuation_table)  # delete all punctuation characters
      # Test for emptiness AFTER stripping punctuation so that blank lines and
      # punctuation-only lines do not become samples with zero tokens.
      if line.strip():
        input_texts.append(line)
        labels.append(label)


Reading edgar_allan_poe.txt corresponding to label 0
Reading robert_frost.txt corresponding to label 1


In [32]:
# Hold out 20% of the lines for testing. A fixed random_state makes the split,
# and therefore the vocabulary and every metric below, reproducible across
# Restart & Run All.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    input_texts, labels, test_size=0.2, random_state=42
)

In [33]:
# Show the first five training lines as a sanity check.
train_texts[:5]

['that high tone of the spirit which hath striven',
 'well take me there',
 'how daring an ambition yet how deep',
 'with furs to sell',
 'with a swish in the grass what if the others']

In [34]:
# Show the labels that go with the five training lines previewed above.
train_labels[:5]

[0, 1, 0, 1, 1]

In [35]:
# Vocabulary: map every word seen in the training lines to an integer id.
# Id 0 is reserved for '<unk>', the stand-in for words not seen during training.
word2idx = {'<unk>' : 0}
for text in train_texts:
  for tok in text.split():
    # A known word keeps its id; a new word receives the next unused id,
    # which is always the current size of the vocabulary.
    word2idx.setdefault(tok, len(word2idx))
idx = len(word2idx)  # next free id
print(word2idx)

{'<unk>': 0, 'that': 0, 'high': 1, 'tone': 2, 'of': 3, 'the': 4, 'spirit': 5, 'which': 6, 'hath': 7, 'striven': 8, 'well': 9, 'take': 10, 'me': 11, 'there': 12, 'how': 13, 'daring': 14, 'an': 15, 'ambition': 16, 'yet': 17, 'deep': 18, 'with': 19, 'furs': 20, 'to': 21, 'sell': 22, 'a': 23, 'swish': 24, 'in': 25, 'grass': 26, 'what': 27, 'if': 28, 'others': 29, 'and': 30, 'i': 31, 'dread': 32, 'ominous': 33, 'stain': 34, 'tar': 35, 'ill': 36, 'hardly': 37, 'breathe': 38, 'he': 39, 'want': 40, 'but': 41, 'fifteen': 42, 'at': 43, 'time': 44, 'they': 45, 'say': 46, 'drawn': 47, 'into': 48, 'town': 49, 'about': 50, 'this': 51, 'cellar': 52, 'hole': 53, 'from': 54, 'north': 55, 'south': 56, 'across': 57, 'blue': 58, 'knowing': 59, 'way': 60, 'leads': 61, 'on': 62, 'see': 63, 'were': 64, 'beside': 65, 'track': 66, 'quickening': 67, 'spell': 68, 'doth': 69, 'oer': 70, 'us': 71, 'pass': 72, 'far': 73, 'down': 74, 'within': 75, 'dim': 76, 'west': 77, 'barn': 78, 'smells': 79, 'you': 80, 'cant': 8

In [36]:
# Encode each line as a list of word ids.
# Every training word is in the vocabulary by construction, so a plain lookup is used.
train_text_int = [[word2idx[tok] for tok in text.split()] for text in train_texts]

# Test lines may contain unseen words; those fall back to id 0 ('<unk>').
test_text_int = [[word2idx.get(tok,0) for tok in text.split()] for text in test_texts]

In [37]:
# Inspect a slice of the integer-encoded training lines.
train_text_int[100:115]

[[31, 384, 21, 385, 137, 19, 4, 386],
 [21, 23, 387, 313, 4, 388, 0, 389, 70],
 [390, 391, 143, 54, 392],
 [393, 165, 103, 394, 30, 384, 395],
 [41, 114, 142, 15, 111],
 [25, 396, 357, 367, 131, 397, 51, 23, 398],
 [320, 132, 399, 400, 30, 401, 359, 402],
 [25, 403, 31, 132, 404, 185, 19, 405, 4, 406],
 [4, 407, 408, 3, 181, 319, 409, 410, 411],
 [25, 184, 60, 80, 412, 413, 414],
 [224, 415, 4, 416, 30, 167],
 [251, 417, 64, 418, 30, 419],
 [30, 120, 145, 420, 4, 421, 3, 4, 422],
 [4, 423, 3, 165, 181, 220, 83, 3, 80],
 [19, 304, 30, 25, 424, 425]]

In [38]:
# Allocate the count tables for both classes:
#   A  - V x V table indexed [previous word id, current word id]
#   PI - length-V table indexed by the id of a line's first word
# Everything starts at 1 rather than 0 (add-one smoothing), so no transition or
# initial word ends up with zero probability after normalisation.
V = len(word2idx)
A0, A1 = np.ones((V, V)), np.ones((V, V))
PI0, PI1 = np.ones(V), np.ones(V)

In [39]:
# Counting is wrapped in a function so the same code fills the tables of either class.
def compute_counts(train_text_int, A,PI):
  """Accumulate first-word and word-to-word counts into PI and A, in place.

  train_text_int: iterable of token-id sequences, one per line.
  A: 2-D array; A[i, j] is incremented for every adjacent pair (i, j).
  PI: 1-D array; PI[i] is incremented when i is the first token of a sequence.
  Returns None; an empty sequence changes nothing.
  """
  for sentence in train_text_int:
    previous = None
    for position, current in enumerate(sentence):
      if position == 0:
        PI[current] += 1  # first token of the line
      else:
        A[previous, current] += 1  # transition previous -> current
      previous = current


In [40]:
# Split the encoded training lines by class label, then count each class into
# its own tables. compute_counts adds to A/PI in place, so re-running this cell
# without re-creating the tables first would count everything twice.
class0_sequences = [seq for seq, label_ in zip(train_text_int, train_labels) if label_ == 0]
class1_sequences = [seq for seq, label_ in zip(train_text_int, train_labels) if label_ == 1]
compute_counts(class0_sequences, A0,PI0)
compute_counts(class1_sequences, A1,PI1)

In [41]:
# Turn counts into probabilities: each row of A is divided by its row sum and
# PI by its total, so every row of A and PI itself sums to 1.
A0, A1 = (counts / counts.sum(axis=1, keepdims=True) for counts in (A0, A1))
PI0, PI1 = (counts / counts.sum() for counts in (PI0, PI1))

# The classifier adds log-probabilities, so store the logs once here.
logA0, logA1 = np.log(A0), np.log(A1)
logPI0, logPI1 = np.log(PI0), np.log(PI1)

In [42]:
# Class priors: the fraction of training lines carrying each label, kept in log form.
total = len(train_texts)
count0 = sum(1 for y in train_labels if y == 0)
count1 = sum(1 for y in train_labels if y == 1)
prior0, prior1 = count0/total, count1/total

logprior0, logprior1 = np.log(prior0), np.log(prior1)

logprior0, logprior1

(-1.0853517887964919, -0.41216188666585396)

In [43]:
# NOTE(review): neither `pread` nor `log` is referenced in the cells visible here.
# `os.pread` is documented as Unix-only, so this import presumably fails on
# Windows -- confirm both are unused and drop them.
from os import pread
from math import log
# Bayes classifier built from one Markov model per class.
class Clssifier:
  """Assigns each token-id sequence to the class with the highest log-posterior.

  logA, logPI and logprior are indexed by class: entry k holds the
  log-transition table, log-initial table and log-prior of class k.
  The number of classes K is taken from len(logprior).
  """

  def __init__(self, logA, logPI, logprior):
    self.logA = logA
    self.logPI = logPI
    self.logprior = logprior
    self.K = len(logprior)

  def _compute_log_likelyhood(self, input_, class_):
    """Return the log-likelihood of the sequence input_ under class class_.

    The first token contributes logPI[token]; each later token contributes
    logA[previous token, token]. An empty sequence yields 0.
    """
    transitions = self.logA[class_]
    initial = self.logPI[class_]

    total = 0
    previous = None
    for position, token in enumerate(input_):
      if position == 0:
        total += initial[token]  # start of the sequence
      else:
        total += transitions[previous, token]
      previous = token
    return total

  def predict(self, inputs):
    """Return a float array with the arg-max class index for every sequence in inputs."""
    predictions = np.zeros(len(inputs))
    for row, input_ in enumerate(inputs):
      # Log-posterior up to a constant: log-likelihood plus log-prior, per class.
      scores = []
      for class_ in range(self.K):
        scores.append(self._compute_log_likelyhood(input_, class_) + self.logprior[class_])
      predictions[row] = np.argmax(scores)

    return predictions

In [44]:
# The classifier uses the class index to look into each of these lists, so all
# three must list their entries in the same class order (class 0 first, then class 1).
clf = Clssifier(
    logA=[logA0, logA1],
    logPI=[logPI0, logPI1],
    logprior=[logprior0, logprior1],
)

In [45]:
# Predict on both splits, then report the fraction of lines classified correctly.
P_train = clf.predict(train_text_int)
P_test = clf.predict(test_text_int)

train_accuracy = np.mean(P_train == train_labels)
test_accuracy = np.mean(P_test == test_labels)
print(f'Train accuracy: {train_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Train accuracy: 0.9953569355774812
Test accuracy: 0.8306264501160093


In [46]:
# Confusion matrix of true vs. predicted labels on the test split.
cm_test = confusion_matrix(y_true=test_labels, y_pred=P_test)
cm_test

array([[ 70,  66],
       [  7, 288]])

In [47]:
# Confusion matrix of true vs. predicted labels on the training split.
cm = confusion_matrix(y_true=train_labels, y_pred=P_train)
cm

array([[ 574,    8],
       [   0, 1141]])

In [48]:
# F1 score of the predictions on the test split.
f1_score(y_true=test_labels, y_pred=P_test)

0.8875192604006163

In [49]:
# F1 score of the predictions on the training split.
f1_score(y_true=train_labels, y_pred=P_train)

0.9965065502183407