In [1]:
import re
import numpy as np

In [30]:
from sklearn.metrics import confusion_matrix, f1_score

# read about F-score: https://en.wikipedia.org/wiki/F-score

In [7]:
!curl  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt -O edgar_allan_poe.txt
!curl  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt -O robert_frost.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 26622  100 26622    0     0   209k      0 --:--:-- --:--:-- --:--:--  209k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: edgar_allan_poe.txt
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 49 56286   49 27868    0     0   217k      0 --:--:-- --:--:-- --:--:--  215k
100 56286  100 56286    0     0   401k      0 --:--:-- --:--:-- --:--:--  3

In [2]:
def read_lines(filename):
    file = open(filename, 'r')
    return file.readlines()

#### Reading Files

In [3]:
lines_edgar = read_lines('edgar_allan_poe.txt')
lines_robert = read_lines('robert_frost.txt')

#### Labeling 

In [4]:
cont = 0

lines_labeled = []

# Adding labels to rows
# Label 0 to Edgar
# Label 1 to Robert
for label,file in enumerate([lines_edgar,lines_robert]):
                            
    for cont in range(len(file)):
        if(file[cont] not in ['â€‰\n','\n']):
            lines_labeled.append([re.sub('[^a-zA-Z0-9 \n]', '', file[cont].strip().lower()),label])
            

In [5]:
lines_labeled

[['lo death hath reard himself a throne', 0],
 ['in a strange city all alone', 0],
 ['far down within the dim west', 0],
 ['where the good and the bad and the worst and the best', 0],
 ['have gone to their eternal rest', 0],
 ['there shrines and palaces and towers', 0],
 ['are not like any thing of ours', 0],
 ['oh no o no ours never loom', 0],
 ['to heaven with that ungodly gloom', 0],
 ['timeeaten towers that tremble not', 0],
 ['resemble nothing that is ours', 0],
 ['around by lifting winds forgot', 0],
 ['resignedly beneath the sky', 0],
 ['the melancholy waters lie', 0],
 ['no holy rays from heaven come down', 0],
 ['on the long nighttime of that town', 0],
 ['but light from out the lurid sea', 0],
 ['streams up the turrets silently', 0],
 ['up thrones up longforgotten bowers', 0],
 ['of sculturd ivy and stone flowers', 0],
 ['up domes up spires up kingly halls', 0],
 ['up fanes up babylonlike walls', 0],
 ['up many a melancholy shrine', 0],
 ['whose entablatures intertwine', 0],


#### Special index

In [6]:
selected_index = np.random.choice([0, 1], size=(len(lines_labeled),), p=[2./3, 1./3])

In [7]:
train_text = []
train_y = []
test_text = []
test_y = []

for ind,lin in zip(selected_index,lines_labeled):
    if(ind == 0):
        train_text.append(lin[0])
        train_y.append(lin[1])
    else:
        test_text.append(lin[0])
        test_y.append(lin[1])


#### Split string into words

In [8]:
words = []

for line in train_text:
    words = words + re.sub('[^a-zA-Z0-9 \n]', '', line.strip().lower()).split()

In [9]:
words

['lo',
 'death',
 'hath',
 'reard',
 'himself',
 'a',
 'throne',
 'in',
 'a',
 'strange',
 'city',
 'all',
 'alone',
 'far',
 'down',
 'within',
 'the',
 'dim',
 'west',
 'where',
 'the',
 'good',
 'and',
 'the',
 'bad',
 'and',
 'the',
 'worst',
 'and',
 'the',
 'best',
 'have',
 'gone',
 'to',
 'their',
 'eternal',
 'rest',
 'there',
 'shrines',
 'and',
 'palaces',
 'and',
 'towers',
 'oh',
 'no',
 'o',
 'no',
 'ours',
 'never',
 'loom',
 'to',
 'heaven',
 'with',
 'that',
 'ungodly',
 'gloom',
 'timeeaten',
 'towers',
 'that',
 'tremble',
 'not',
 'resemble',
 'nothing',
 'that',
 'is',
 'ours',
 'up',
 'thrones',
 'up',
 'longforgotten',
 'bowers',
 'up',
 'domes',
 'up',
 'spires',
 'up',
 'kingly',
 'halls',
 'up',
 'many',
 'a',
 'melancholy',
 'shrine',
 'whose',
 'entablatures',
 'intertwine',
 'the',
 'mask',
 'the',
 'viol',
 'and',
 'the',
 'vine',
 'are',
 'on',
 'a',
 'level',
 'with',
 'the',
 'waves',
 'in',
 'each',
 'idols',
 'diamond',
 'eye',
 'tempt',
 'the',
 'wat

#### Counting tokens

In [10]:
tokens = {'<unknow>':0}

for word in words:
    if(word not in tokens):
        tokens[word] = 1
    else:
        tokens[word] = tokens[word] + 1

#### Mapping tokens to index

In [11]:
mapping = {}
index = 0
for token in tokens.keys():
    mapping[token] = index
    index += 1

#### Converting text to indexes

In [12]:
train_text_int = []
test_text_int = []

for line in train_text:
    train_text_int.append([mapping[token] for token in line.split()])
    
for line in test_text:
    test_text_int.append([mapping.get(token, 0) for token in line.split()])

In [13]:
test_text_int[40:50]

[[21, 93, 885, 130, 516, 736, 243, 275],
 [179, 886, 92, 887, 27, 6, 888],
 [641, 152, 889, 156, 21, 890, 235],
 [21, 179, 891, 27, 16, 195, 87, 16, 621],
 [100, 127, 1074, 139, 16, 280, 87, 6, 282],
 [70, 16, 280, 87, 238, 281, 282],
 [103, 16, 294, 43, 127, 892, 21, 178],
 [238, 135, 136, 137, 87, 138],
 [238, 143, 144, 87, 138],
 [317, 549, 0, 397, 21, 0]]

#### Initialize A and pi matrices - for both classes

In [16]:
V = len(mapping)

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

#### Compute counts for A and pi

In [17]:
def compute_counts(text_as_int, A, pi):
    for tokens in text_as_int:
        last_idx = None
        for idx in tokens:
            if last_idx is None:
                # it's the first word in a sentence
                pi[idx] += 1
            else:
                # the last word exists, so count a transition
                A[last_idx, idx] += 1

            # update last idx
            last_idx = idx


compute_counts([t for t, y in zip(train_text_int, train_y) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, train_y) if y == 1], A1, pi1)

In [18]:
# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown before
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [19]:
# log A and pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [24]:
# compute priors
count0 = sum(y == 0 for y in train_y)
count1 = sum(y == 1 for y in train_y)
total = len(train_y)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1, logp0,logp1

(0.3442737430167598,
 0.6557262569832403,
 -1.0663181734779008,
 -0.42201186831232795)

In [26]:
# build a classifier
class Classifier:
    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors) # number of classes

    def _compute_log_likelihood(self, input_, class_):
        logA = self.logAs[class_]
        logpi = self.logpis[class_]

        last_idx = None
        logprob = 0
        
        for idx in input_:
            if last_idx is None:
                # it's the first token
                logprob += logpi[idx]
            else:
                logprob += logA[last_idx, idx]

            # update last_idx
            last_idx = idx

        return logprob
  
    def predict(self, inputs):
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] for c in range(self.K)]
            pred = np.argmax(posteriors)
            predictions[i] = pred
        return predictions

In [27]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [28]:
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == train_y)}")

Train acc: 0.9951117318435754


In [29]:
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == test_y)}")

Test acc: 0.8213296398891967


In [32]:
cm = confusion_matrix(train_y, Ptrain)
cm

array([[486,   7],
       [  0, 939]], dtype=int64)

In [34]:
cm_test = confusion_matrix(test_y, Ptest)
cm_test

array([[109, 116],
       [ 13, 484]], dtype=int64)

In [35]:
f1_score(train_y, Ptrain)

0.9962864721485412

In [37]:
f1_score(test_y, Ptest)

0.8824065633546034

In [39]:
logpi1

array([-8.08610254, -8.08610254, -8.08610254, ..., -8.08610254,
       -8.08610254, -8.08610254])