### Objective: Given a setence, classify if it's Robert Frost or Edgar Allen Poe poem  

steps:
1. get the data
2. process data
3. save labels

In [57]:
# !python  -m wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
# !python -m wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

In [65]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

### Process data

In [59]:
def clean(line):
    line = line.strip()
    line = line.lower()
    line = re.sub("[^a-zA-Z ]", '', line)
    return line

def tolkenizer(sentences):
    wordMap = {'<unknown>': 0}
    count =1 
    for sentence in sentences:
        words = sentence.split(" ")
        for word in words:
            if(word in wordMap):
                continue
            else:
                wordMap[word] = count
                count = count + 1
    return wordMap

In [60]:
features = list()
labels = list()
class0 = 0

with open('edgar_allan_poe.txt') as file:
    lines = file.readlines()
    count0 = len(lines)
    for line in lines:
        line = clean(line)
        features.append(line)
        labels.append(0)
        class0 = class0 + 1

class1 = 0
with open('robert_frost.txt') as file:
    lines = file.readlines()
    for line in lines:
        line = clean(line)
        features.append(line)
        labels.append(1)
        class1 = class1 + 1

print('Dataset samples : ' + str(len(features)))
print('class robert frost (0) ' + str(class0))
print('class allan poe (1) ' + str(class1))

Dataset samples : 2378
class robert frost (0) 797
class allan poe (1) 1581


In [61]:
x_train, x_test, y_train, y_test = train_test_split(features, labels)
print(x_train[100: 105])
print(y_train[100: 105])

wordMap = tolkenizer(x_train)

print("word map lenght :", len(wordMap))
# Convert sentence to idx
for i in range(len(x_train)):
    x_train[i] = [wordMap[word] for word in x_train[i].split(" ")]

# Convert test 
for i in range(len(x_test)):
    x_test[i] = [wordMap.get(word, 0) for word in x_test[i].split(" ")]

print(x_train[100: 105])
print(y_train[100: 105])

print(x_test[100: 105])
print(y_test[100: 105])

['but everybody took it for proof', 'it is not that my founts of bliss', '', 'to where it bent in the undergrowth', 'this was no playhouse but a house in earnest']
[1, 0, 1, 1, 1]
word map lenght : 2534
[[83, 362, 341, 1, 104, 363], [1, 40, 72, 143, 254, 364, 8, 365], [102], [20, 170, 1, 366, 12, 5, 367], [263, 2, 159, 368, 83, 46, 172, 12, 369]]
[1, 0, 1, 1, 1]
[[17, 21, 154, 49, 433, 10, 5, 258], [28, 125, 127, 336, 5, 588, 8, 5, 980], [20, 0, 5, 661, 28, 478, 5, 691], [204, 137, 915, 5, 330, 351, 682, 683], [28, 17, 62, 61, 40, 1778, 52, 1779]]
[1, 0, 1, 1, 0]


In [62]:
M = len(wordMap)
A0 = np.ones((M, M))
pi0 = np.ones(M)

A1 = np.ones((M,M))
pi1 = np.ones(M)

def count_occurences(x_train, A, pi):
    for x in x_train:
        prev = None
        for idx in x:
            if prev == None:
                pi[idx] += 1
            else:
                A[prev, idx] += 1
            prev = idx

count_occurences([t for t,y in zip(x_train, y_train) if y == 0 ] , A0, pi0)
count_occurences([t for t,y in zip(x_train, y_train) if y == 1 ] , A1, pi1)

min = np.min(A0)

max = np.max(A0)
print(max)

# Normalize A, pi
A0 /= A0.sum(axis = 1, keepdims=True)
pi0 /= pi0.sum()
print('Max = ', np.max(A0))
print('Min = ', np.min(A0))

A1 /= A1.sum(axis = 1, keepdims= True)
pi1 /= pi1.sum()

print('Max = ', np.max(A1))
print('Min = ', np.min(A1))

logA0 = np.log(A0)
logA1 = np.log(A1)

log_pi0= np.log(pi0)
log_pi1 = np.log(pi1)

prior0 = sum(y == 0 for y in y_train) / len(y_train)
prior1 = sum(y == 1 for y in y_train) / len(y_train)

log_prior0 = np.log(prior0)
log_prior1 = np.log(prior1)

print("Prior 0 = ", prior0, ", prior 1=" ,prior1)
print("log prior 0 = ", log_prior0, ", log prior 1 = ", log_prior1)

27.0
Max =  0.010093457943925233
Min =  0.0003620564808110065
Max =  0.011290929619872036
Min =  0.0003399048266485384
Prior 0 =  0.3370723499719574 , prior 1= 0.6629276500280427
log prior 0 =  -1.0874576833279328 , log prior 1 =  -0.41108941989709685


In [63]:
class Classifier:
    def __init__(self, logAs, logPis, logPs):
        self.logAs = logAs
        self.logPis = logPis
        self.logPs = logPs
        self.K = len(logPs)

    def compute_probability(self, input, class_):
        logA = self.logAs[class_]
        logPi = self.logPis[class_]

        last_idx = None
        logProb = 0

        for idx in input:
            if(last_idx == None):
                logProb += logPi[idx]
            else:
                logProb += logA[last_idx, idx]
            last_idx = idx

        return logProb
    
    def predict(self, inputs):

        preds = np.zeros(len(inputs))

        for i, input in enumerate(inputs):
            posteriors = [self.compute_probability(input, c) + self.logPs[c] for c in range(self.K)]
        
            pred = np.argmax(posteriors)
            preds[i] = pred

        return preds


In [64]:
clf = Classifier([logA0, logA1], [log_pi0, log_pi1], [log_prior0, log_prior1])

ptrain = clf.predict(x_train)
print("Train acc: ", np.mean(ptrain == y_train))

# ptest = clf.predict(x_test)
ptest = clf.predict(x_test)
print("Train acc: ", np.mean(ptest == y_test))



Train acc:  0.9607403252944475
Train acc:  0.8084033613445378


In [68]:
cm_train= confusion_matrix(y_train, ptrain)
print(cm_train)

cm_test= confusion_matrix(y_test, ptest)
print(cm_test)

[[ 531   70]
 [   0 1182]]
[[ 98  98]
 [ 16 383]]


In [69]:
print( 'f1 training ', f1_score(y_train, ptrain))
print( 'f1 testing ', f1_score(y_test, ptest))

f1 training  0.9712407559572719
f1 testing  0.8704545454545454
