In [29]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [30]:
# One text file per author; a file's position in this list is its class label.
input_files = ['edgar_allan_poe.txt', 'robert_frost.txt']

In [31]:
# Parallel lists: input_texts[i] is a cleaned line, labels[i] is its class id.
input_texts = []
labels = []

# Build the punctuation-stripping table once instead of once per line.
punctuation_table = str.maketrans('', '', string.punctuation)

for label, f in enumerate(input_files):
    print(f"{f} corresponds to label {label}")

    # Context manager so every file handle is closed as soon as it is read.
    with open(f) as fh:
        for line in fh:
            line = line.rstrip().lower()
            # Skip blank lines; they carry no tokens.
            if line:
                # remove punctuation
                # NOTE: a line made only of punctuation becomes '' here and is
                # still appended (as an empty sample).
                line = line.translate(punctuation_table)

                input_texts.append(line)
                labels.append(label)
        

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [32]:
# Fix the seed so the split (and every number computed downstream) is
# reproducible under Restart Kernel -> Run All.
train_text, test_text, Ytrain, Ytest = train_test_split(
    input_texts, labels, random_state=42
)

In [33]:
# Sanity check: number of training and test labels produced by the split.
len(Ytrain), len(Ytest)

(1615, 539)

In [34]:
# Peek at the first few cleaned training lines.
train_text[:5]

['as often as he had in the tail of the night',
 'and now as the night was senescent',
 'and birds that joined in the excited fun',
 'where did i see one of those pieces lately',
 'and theres a story in a book about it']

In [35]:
# Labels aligned with the training lines shown above.
Ytrain[:5]

[1, 0, 1, 1, 1]

In [36]:
# Index 0 is reserved for unknown words.
# NOTE(review): the key lacks a closing '>' -- presumably '<unk>' was intended;
# confirm before renaming it.
word2idx = {'<unk': 0}

# Next free vocabulary index (0 is already taken by the unknown-word entry).
idx = len(word2idx)

In [49]:
# Grow the vocabulary from the training lines only: each previously unseen
# token receives the next free integer index.
for text in train_text:
    for token in text.split():
        if token in word2idx:
            continue
        word2idx[token] = idx
        idx += 1

word2idx

{'<unk': 0,
 'i': 1,
 'looked': 2,
 'at': 3,
 'nine': 4,
 'the': 5,
 'swarm': 6,
 'was': 7,
 'turned': 8,
 'to': 9,
 'rock': 10,
 'that': 11,
 'isnt': 12,
 'it': 13,
 'folks': 14,
 'arent': 15,
 'afraid': 16,
 'of': 17,
 'us': 18,
 'having': 19,
 'interfered': 20,
 'in': 21,
 'huse': 22,
 'business': 23,
 'never': 24,
 'let': 25,
 'them': 26,
 'stay': 27,
 'attic': 28,
 'when': 29,
 'they': 30,
 'sometimes': 31,
 'left': 32,
 'named': 33,
 'thrown': 34,
 'away': 35,
 'takes': 36,
 'moon': 37,
 'for': 38,
 'this': 39,
 'suns': 40,
 'a': 41,
 'wizard': 42,
 'he': 43,
 'can': 44,
 'eat': 45,
 'off': 46,
 'barrel': 47,
 'from': 48,
 'ground': 49,
 'heard': 50,
 'toffile': 51,
 'upstairs': 52,
 'bedroom': 53,
 'crosslegged': 54,
 'and': 55,
 'come': 56,
 'make': 57,
 'your': 58,
 'summer': 59,
 'dwelling': 60,
 'here': 61,
 'well': 62,
 'then': 63,
 'right': 64,
 'is': 65,
 'where': 66,
 'show': 67,
 'you': 68,
 'books': 69,
 'end': 70,
 'not': 71,
 'far': 72,
 'my': 73,
 'going': 74,
 'for

In [50]:
# Training lines: every token is in the vocabulary by construction, so a plain
# lookup is used.
train_text_int = [
    [word2idx[token] for token in text.split()]
    for text in train_text
]

# Test lines: tokens never seen during training fall back to index 0.
test_text_int = [
    [word2idx.get(token, 0) for token in text.split()]
    for text in test_text
]





In [51]:
train_text_int[100:105]

[[182, 722, 159, 109, 207, 149, 81],
 [55, 179, 677, 9, 5, 70, 17, 5, 1937],
 [856, 68, 93, 94, 857, 229, 858, 113, 219],
 [248, 1554, 5, 1555, 1343],
 [55, 550, 5, 1938, 48, 5, 1736]]

In [52]:
# Vocabulary size, including the unknown-word slot at index 0.
V = len(word2idx)

# Every count starts at one (add-one smoothing), so no transition or initial
# token ends up with zero probability after normalisation.
A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [53]:
def compute_counts(text_as_int, A, pi):
    """Accumulate first-token and token-to-token counts in place.

    Args:
        text_as_int: iterable of token-index sequences, one per line.
        A: 2-D array; A[i, j] is incremented for each adjacent pair (i, j).
        pi: 1-D array; pi[i] is incremented when a sequence starts with i.

    Returns:
        None. ``A`` and ``pi`` are modified in place.
    """
    for sequence in text_as_int:
        # An empty sequence contributes nothing.
        if len(sequence) == 0:
            continue
        pi[sequence[0]] += 1
        # Walk over consecutive (previous, current) token pairs.
        for previous, current in zip(sequence, sequence[1:]):
            A[previous, current] += 1

In [54]:
# Split the encoded training lines by class label, then add each class's
# counts to its own arrays. compute_counts increments in place, so running
# this cell twice counts every line twice.
class0_lines = [t for t, y in zip(train_text_int, Ytrain) if y == 0]
class1_lines = [t for t, y in zip(train_text_int, Ytrain) if y == 1]

compute_counts(class0_lines, A0, pi0)
compute_counts(class1_lines, A1, pi1)


In [55]:
# Turn counts into probabilities in place: each row of a transition matrix
# and each initial-token vector is scaled to sum to one.
for transition, initial in ((A0, pi0), (A1, pi1)):
    transition /= transition.sum(axis=1, keepdims=True)
    initial /= initial.sum()

In [56]:
# Move to log space so sequence scores can be summed rather than multiplied.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)


In [65]:
# Class priors: the fraction of training lines carrying each label.
total = len(Ytrain)
count0 = len([y for y in Ytrain if y == 0])
count1 = len([y for y in Ytrain if y == 1])

p0, p1 = count0 / total, count1 / total
logp0, logp1 = np.log(p0), np.log(p1)
p0, p1


(0.3442724458204334, 0.6557275541795665)

In [59]:
class Classifier:
  """Picks the class whose Markov model plus prior scores a sequence highest."""

  def __init__(self, logAs, logpis, logpriors):
    """Store the per-class log parameters.

    logAs, logpis and logpriors are indexed by class id; the number of
    classes is taken from the length of logpriors.
    """
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    """Return the log-likelihood of the index sequence input_ under class_.

    The first token is scored with the class's initial log-probabilities and
    every later token with the transition log-probability from its
    predecessor. An empty sequence scores 0.
    """
    log_transition = self.logAs[class_]
    log_initial = self.logpis[class_]

    total = 0
    previous = None
    for current in input_:
      if previous is not None:
        total += log_transition[previous, current]
      else:
        # first token of the sequence
        total += log_initial[current]
      previous = current

    return total

  def predict(self, inputs):
    """Return an array holding, for each sequence, the best-scoring class id."""
    predictions = np.zeros(len(inputs))
    for i, sequence in enumerate(inputs):
      # log-likelihood + log-prior for every class
      scores = []
      for c in range(self.K):
        scores.append(self._compute_log_likelihood(sequence, c) + self.logpriors[c])
      predictions[i] = np.argmax(scores)
    return predictions

In [60]:
# Per-class parameters are passed as lists indexed by class label (0, then 1).
clf = Classifier(
    logAs=[logA0, logA1],
    logpis=[logpi0, logpi1],
    logpriors=[logp0, logp1],
)


In [61]:
# Accuracy on the same lines the model was fitted on.
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == Ytrain)}")

Train acc: 0.9938080495356038


In [62]:
# Accuracy on held-out lines; tokens unseen in training were encoded as index 0.
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == Ytest)}")

Test acc: 0.849721706864564


In [63]:
from sklearn.metrics import confusion_matrix, f1_score


In [64]:
# Confusion matrix on the training set: true labels against predictions.
cm = confusion_matrix(y_true=Ytrain, y_pred=Ptrain)
cm

array([[ 546,   10],
       [   0, 1059]], dtype=int64)