## Project: Markov Model Classifier / Poetry generator

In [None]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2023-06-30 11:41:52--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: ‘edgar_allan_poe.txt’


2023-06-30 11:41:52 (73.8 MB/s) - ‘edgar_allan_poe.txt’ saved [26622/26622]

--2023-06-30 11:41:52--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving 

In [None]:
#Lets go!

In [None]:
#import libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string  # string.punctuation (a constant, not a method) lists the ASCII punctuation to strip

In [None]:
# assign a variable name to files
input_files = ['edgar_allan_poe.txt','robert_frost.txt']

In [None]:
input_files

['edgar_allan_poe.txt', 'robert_frost.txt']

In [None]:
#Note: pd.read_csv would have also been fine.

In [None]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [None]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


**The Punctuation Marks:**
Comma,
Period/Full Stop,
Colon,
Ellipsis,
Semicolon,
Apostrophe,
Hyphen,
Dash,
Quotation Marks,
Question Mark,
Exclamation Point,
Slash,
Parentheses and Brackets.

In [None]:
for label,f in enumerate(input_files):
  print(f"{f} corresponds to label{label}")

edgar_allan_poe.txt corresponds to label0
robert_frost.txt corresponds to label1


In [None]:
# Read the whole Poe file into one string; `f` ends up holding the text.
# A separate handle name keeps the file object and its contents from sharing one
# variable, and the explicit encoding avoids depending on the platform default.
with open('edgar_allan_poe.txt', encoding='utf-8') as poem_file:
  f = poem_file.read()

In [None]:
f

'LO! Death hath rear\'d himself a throne\nIn a strange city, all alone,\nFar down within the dim west\nWhere the good, and the bad, and the worst, and the best,\nHave gone to their eternal rest.\n\u2009\nThere shrines, and palaces, and towers\nAre not like any thing of ours\nOh no! O no! ours never loom\nTo heaven with that ungodly gloom!\nTime-eaten towers that tremble not!\nResemble nothing that is ours.\nAround, by lifting winds forgot,\nResignedly beneath the sky\nThe melancholy waters lie.\n\u2009\nNo holy rays from heaven come down\nOn the long night-time of that town,\nBut light from out the lurid sea\nStreams up the turrets silently\nUp thrones up long-forgotten bowers\nOf scultur\'d ivy and stone flowers\nUp domes up spires up kingly halls\nUp fanes up Babylon-like walls\nUp many a melancholy shrine\nWhose entablatures intertwine\nThe mask the viol and the vine.\n\u2009\nThere open temples open graves\nAre on a level with the waves\nBut not the riches there that lie\nIn each i

In [None]:
# Collect every non-empty line of each poem file together with its class label
# (the label is the file's position in input_files).

input_text = []
labels = []

# The punctuation-stripping table is the same for every line, so build it once.
punctuation_table = str.maketrans('','',string.punctuation)

for label,f in enumerate(input_files):
  print(f"{f} corresponds to label{label}")

  # `with` closes the file once it has been read; the encoding is explicit
  # because the poems contain non-ASCII characters.
  with open(f, encoding='utf-8') as poem_file:
    for line in poem_file:
      line = line.rstrip().lower()  # rstrip drops the trailing newline / whitespace
      if line:      # skip blank lines
        # remove punctuation
        line = line.translate(punctuation_table)
        input_text.append(line)
        labels.append(label)

edgar_allan_poe.txt corresponds to label0
robert_frost.txt corresponds to label1


In [None]:
# Train/test split.
# General form: X_train, X_test, y_train, y_test = train_test_split(X, y)
# Here X = input_text (the cleaned lines) and y = labels.
# random_state pins the shuffle so the split (and everything derived from it)
# is the same on every Restart & Run All.
text_train,text_test ,y_train,y_test = train_test_split(input_text,labels,random_state=42)

In [None]:
#Our y (label)
len(y_train), len(y_test)

(1615, 539)

In [None]:
#Our X (feature/text)
len(text_train),len(text_test)

(1615, 539)

In [None]:
text_train[:5]  #random line of text w/o punctuation

['will start which lately slept in apathy',
 'to drag down man',
 'where loves a grownup god',
 'to seek for treasure in the jewelled skies',
 'thats what i sit up in the dark to say']

In [None]:
y_train[:5]

[0, 1, 0, 0, 1]

In [None]:
#convert text to integers (word2idx)
idx = 1 #index start at 1
word2idx = {'<unk>': 0}  #give index 0 for any unkn words in test set

In [None]:
# Grow the vocabulary from the training lines only:
# every word not seen before receives the next free index.
for text in text_train:
  tokens = text.split()   # whitespace tokenisation

  for token in tokens:
    if token in word2idx:
      continue            # already has an index
    word2idx[token] = idx
    idx += 1

In [None]:
word2idx  # word to index mapping

{'<unk>': 0,
 'will': 1,
 'start': 2,
 'which': 3,
 'lately': 4,
 'slept': 5,
 'in': 6,
 'apathy': 7,
 'to': 8,
 'drag': 9,
 'down': 10,
 'man': 11,
 'where': 12,
 'loves': 13,
 'a': 14,
 'grownup': 15,
 'god': 16,
 'seek': 17,
 'for': 18,
 'treasure': 19,
 'the': 20,
 'jewelled': 21,
 'skies': 22,
 'thats': 23,
 'what': 24,
 'i': 25,
 'sit': 26,
 'up': 27,
 'dark': 28,
 'say': 29,
 'deep': 30,
 'heart': 31,
 'whose': 32,
 'hope': 33,
 'has': 34,
 'died': 35,
 'its': 36,
 'twothousandmile': 37,
 'coast': 38,
 'he': 39,
 'steered': 40,
 'was': 41,
 'really': 42,
 'straight': 43,
 'away': 44,
 'it': 45,
 'looks': 46,
 'as': 47,
 'if': 48,
 'some': 49,
 'magic': 50,
 'of': 51,
 'sun': 52,
 'and': 53,
 'we': 54,
 'marked': 55,
 'not': 56,
 'night': 57,
 'year': 58,
 'have': 59,
 'passed': 60,
 'by': 61,
 'watchman': 62,
 'on': 63,
 'his': 64,
 'beat': 65,
 'save': 66,
 'only': 67,
 'thee': 68,
 'me': 69,
 'paused': 70,
 'looked': 71,
 'then': 72,
 'cane': 73,
 'one': 74,
 'knock': 75,
 'or

In [None]:
word2idx['yawning']

999

In [None]:
len(word2idx)

2498

In [None]:
text_train[0]

'will start which lately slept in apathy'

In [None]:
#Practice: encode each training line as a list of word indices
for text in text_train:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
# NOTE: print sits outside the loop, so only the encoding of the last training line is shown
print(line_as_int)

[2497, 101, 1462, 47, 416, 91, 47, 54]


In [None]:
t = ['words 1','game 2']
[x.split() for x in t]

[['words', '1'], ['game', '2']]

In [None]:
# NOTE(review): text_test[0] is a single string, so the comprehension below walks it one
# character at a time (see the output); pass a list such as text_test[:1] to split whole lines.
t = text_test[0]
print([x.split() for x in t])

[['y'], ['o'], ['u'], ['r'], [], ['d'], ['e'], ['s'], ['t'], ['i'], ['n'], ['a'], ['t'], ['i'], ['o'], ['n'], [], ['a'], ['n'], ['d'], [], ['y'], ['o'], ['u'], ['r'], [], ['d'], ['e'], ['s'], ['t'], ['i'], ['n'], ['y'], ['s']]


In [None]:
# Encode every line as a list of vocabulary indices (one inner list per line).

# Training lines (X_train): plain lookup; the vocabulary was built from these same lines.
text_train_int = [[word2idx[token] for token in text.split()] for text in text_train]

# Test lines (X_test): dict.get(key, default) sends any word missing from word2idx
# to 0, the index reserved for '<unk>'.
text_test_int = [[word2idx.get(token,0) for token in text.split()] for text in text_test]

.get(__key: str, /) -> (int | None)

Return the value for key if key is in the dictionary, else default

In [None]:
#Lets see if it worked
text_train_int[100:108]  #random indexing

[[45, 378, 45, 100, 20, 379, 194, 237, 115, 380],
 [381, 256, 194, 93, 146, 382, 383, 384, 206],
 [20, 385, 386, 133, 81, 81, 54, 153, 387],
 [388, 47, 14, 389, 47, 390, 89, 320, 36, 391],
 [212, 392, 137, 104, 100, 20, 393],
 [220, 394, 49, 395, 53, 396, 366, 51, 397],
 [61, 146, 267, 47, 39, 398, 341, 399, 8, 255],
 [244, 25, 400, 25, 400, 10, 401]]

In [None]:
text_test_int[10:20]

[[0, 244, 8, 0, 0],
 [12, 81, 321],
 [14, 1312, 134, 557, 18, 64, 1009, 41, 1313],
 [20, 202, 51, 165, 0, 557, 14, 0],
 [267, 1907, 56, 1823, 146, 6, 64, 0],
 [36, 20, 292, 0, 28, 274],
 [194, 1, 115, 0, 8, 0, 47, 779],
 [165, 1440, 426],
 [729, 45, 1139, 39, 299, 25, 951, 8, 266],
 [89, 320, 8, 2474, 100, 0, 0]]

In [None]:
# Build the A (transition) and pi (first-word) arrays for both classes.
# Number of classes = number of Markov models.
V = len(word2idx)  # V = vocabulary size

# Every entry starts at 1 instead of 0, so word pairs never seen in training
# still end up with a non-zero probability after normalisation.
A0, pi0 = np.ones((V,V)), np.ones(V)
A1, pi1 = np.ones((V,V)), np.ones(V)

In [None]:
# compute counts for A and pi
# Populate A's & pi's
def compute_counts(text_as_int, A, pi):
  """Add first-word and transition counts from integer-encoded lines.

  text_as_int: iterable of sequences of word indices (one sequence per line).
  A:  2-D array, updated in place; A[i, j] gains 1 for every adjacent pair (i, j).
  pi: 1-D array, updated in place; pi[i] gains 1 for every line that starts with i.

  Nothing is returned; empty sequences contribute nothing.
  """
  for tokens in text_as_int:
    prev = None  # no predecessor until the first word has been seen

    for cur in tokens:
      if prev is not None:
        # a predecessor exists, so this is a transition prev -> cur
        A[prev, cur] += 1
      else:
        # first word of the line
        pi[cur] += 1

      prev = cur

# class 0: count over the training lines labelled 0; A0 and pi0 are updated in place
# (compute_counts uses +=, so re-running this cell adds the same counts again)
compute_counts([t for t,y in zip(text_train_int,y_train) if y == 0],A0,pi0)
# class 1: same, over the training lines labelled 1, into A1 and pi1
compute_counts([t for t,y in zip(text_train_int,y_train) if y == 1],A1,pi1)

In [None]:
# Turn the counts into probabilities.
# Each row of A is divided by its own total, so every row sums to 1.
# keepdims=True keeps the row totals as a column, which lets the division
# broadcast across each row of the 2-D array.
# pi is divided by its grand total.
A0 = A0 / np.sum(A0, axis=1, keepdims=True)
pi0 = pi0 / np.sum(pi0)

A1 = A1 / np.sum(A1, axis=1, keepdims=True)
pi1 = pi1 / np.sum(pi1)


In [None]:
A0

array([[0.00040032, 0.00040032, 0.00040032, ..., 0.00040032, 0.00040032,
        0.00040032],
       [0.00039936, 0.00039936, 0.00119808, ..., 0.00039936, 0.00039936,
        0.00039936],
       [0.0004    , 0.0004    , 0.0004    , ..., 0.0004    , 0.0004    ,
        0.0004    ],
       ...,
       [0.00040032, 0.00040032, 0.00040032, ..., 0.00040032, 0.00040032,
        0.00040032],
       [0.00040032, 0.00040032, 0.00040032, ..., 0.00040032, 0.00040032,
        0.00040032],
       [0.00040032, 0.00040032, 0.00040032, ..., 0.00040032, 0.00040032,
        0.00040032]])

In [None]:
pi0

array([0.00032895, 0.00098684, 0.00032895, ..., 0.00032895, 0.00032895,
       0.00032895])

In [None]:
# The actual probabilities are not needed from here on, so keep their logs instead.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)

In [None]:
logA0

array([[-7.82324569, -7.82324569, -7.82324569, ..., -7.82324569,
        -7.82324569, -7.82324569],
       [-7.82564473, -7.82564473, -6.72703244, ..., -7.82564473,
        -7.82564473, -7.82564473],
       [-7.82404601, -7.82404601, -7.82404601, ..., -7.82404601,
        -7.82404601, -7.82404601],
       ...,
       [-7.82324569, -7.82324569, -7.82324569, ..., -7.82324569,
        -7.82324569, -7.82324569],
       [-7.82324569, -7.82324569, -7.82324569, ..., -7.82324569,
        -7.82324569, -7.82324569],
       [-7.82324569, -7.82324569, -7.82324569, ..., -7.82324569,
        -7.82324569, -7.82324569]])

In [None]:
logpi0

array([-8.01961279, -6.92100051, -8.01961279, ..., -8.01961279,
       -8.01961279, -8.01961279])

In [None]:
# Class priors: the share of training lines that belongs to each class.
total = len(y_train)

# number of training samples labelled 0 and 1
count0 = len([y for y in y_train if y == 0])
count1 = len([y for y in y_train if y == 1])

# prior probabilities
p0, p1 = count0 / total, count1 / total
# and their logs
logp0, logp1 = np.log(p0), np.log(p1)

p0,p1

(0.33560371517027865, 0.6643962848297214)

In [None]:
# It seems classes are imbalanced.

In [None]:
#finally lets build a classifier
class classifier:
  """Bayes classifier built from one first-order Markov model per class.

  logAs, logpis and logpriors are sequences indexed by class number:
  logAs[c][i, j] is the log-probability of word j following word i,
  logpis[c][i] the log-probability of a line starting with word i,
  and logpriors[c] the log prior of class c.
  """

  def __init__(self,logAs, logpis,logpriors):
    self.logAs, self.logpis, self.logpriors = logAs, logpis, logpriors
    self.K = len(logpriors)  # number of classes

  def _compute_log_likelyhood(self,input_,class_):
    """Return the log-likelihood of the index sequence input_ under class_.

    An empty sequence scores 0.
    """
    log_trans = self.logAs[class_]
    log_init = self.logpis[class_]

    total = 0
    prev = None
    for cur in input_:
      # the first token is scored by the initial distribution,
      # every later token by the transition from its predecessor
      total += log_init[cur] if prev is None else log_trans[prev, cur]
      prev = cur

    return total

  def predict(self,inputs):
    """Return a float array holding the predicted class index for each sequence.

    The score of a class is its log-likelihood plus its log prior; the class
    with the highest score wins (np.argmax picks the first one on ties).
    """
    predictions = np.zeros(len(inputs))
    for i, seq in enumerate(inputs):
      scores = [
        self._compute_log_likelyhood(seq, c) + self.logpriors[c]
        for c in range(self.K)
      ]
      predictions[i] = np.argmax(scores)
    return predictions



In [None]:
#each list must be ordered by class (index 0 = class 0, index 1 = class 1), because the class number is used to index these lists
clf = classifier([logA0, logA1], [logpi0, logpi1],[logp0,logp1])

In [None]:
Ptrain = clf.predict(text_train_int)
print(f"Train accuracy: {np.mean(Ptrain == y_train)}")

Train accuracy: 0.9956656346749226


In [None]:
Ptest = clf.predict(text_test_int)
print(f"Test accuracy: {np.mean(Ptest == y_test)}")


Test accuracy: 0.8163265306122449


In [None]:
#import metrics
from sklearn.metrics import confusion_matrix,f1_score

In [None]:
cm_train = confusion_matrix(y_train,Ptrain)

In [None]:
cm_train

array([[ 535,    7],
       [   0, 1073]])

In [None]:
cm_test = confusion_matrix(y_test,Ptest)
cm_test

array([[ 89,  87],
       [ 12, 351]])

In [None]:
f1_score(y_train,Ptrain)

0.9967487227124942

In [None]:
f1_score(y_test,Ptest)

0.8764044943820225

In [None]:
#Thank you!