In [1]:
import numpy as np
import string

# Seed NumPy's global RNG so the np.random draws used when sampling text
# later in this notebook give the same result on every fresh run.
np.random.seed(1234)

In [2]:
from sklearn.metrics import confusion_matrix, f1_score

In [3]:
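# Uncomment to download the corpus into the working directory.
# (-O takes no argument: it saves the file under its remote name, robert_frost.txt.)
# !curl -O https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt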

In [4]:
def remove_punctuation(s):
    """Return `s` with every character listed in `string.punctuation` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(deletion_table)

In [5]:
# Tables of the second-order Markov model. They are filled with counts by the
# training loop and later normalized into probabilities.
initial = dict()       # first word of a line -> count
first_order = dict()   # first word -> {second word: count}
second_order = dict()  # (word two back, previous word) -> {next word or 'END': count}

In [6]:
def add2dict(d, k, v):
    """Append `v` to the list stored at `d[k]`, creating the list on first use."""
    d.setdefault(k, []).append(v)

# [cat, cat, dog, dog, dog, dog, dog, mouse, ...]

In [7]:
def _increment(table, context, token):
    """Add one to the count of `token` following `context` in `table`."""
    followers = table.setdefault(context, {})
    followers[token] = followers.get(token, 0) + 1


# Accumulate raw counts for the three tables, one corpus line at a time.
# The `with` block closes the file as soon as the loop finishes (or fails)
# instead of leaving the handle open for the life of the kernel.
with open('robert_frost.txt') as corpus:
    for line in corpus:
        tokens = remove_punctuation(line.rstrip().lower()).split()
        T = len(tokens)

        for i in range(T):
            t = tokens[i]

            if i == 0:
                # distribution of the first word of a line
                # (a one-word line contributes only this count)
                initial[t] = initial.get(t, 0.) + 1
                continue

            t_1 = tokens[i-1]

            if i == T - 1:
                # last word: record that the pair (t_1, t) ends the line
                _increment(second_order, (t_1, t), 'END')

            if i == 1:
                # distribution of the second word given only the first word
                _increment(first_order, t_1, t)
            else:
                # distribution of the current word given the two previous words
                t_2 = tokens[i-2]
                _increment(second_order, (t_2, t_1), t)

In [8]:
# Turn the first-word counts into probabilities that sum to 1.
initial_total = sum(initial.values())
for word in list(initial):
    initial[word] = initial[word] / initial_total

In [9]:
# Legacy helper for the list-based representation, e.g.
# [cat, cat, dog, mouse] -> {cat: 0.5, dog: 0.25, mouse: 0.25}

def list2pdict(ts):
    """Map each distinct item of the list `ts` to its relative frequency."""
    size = len(ts)
    tallies = {}
    for item in ts:
        if item in tallies:
            tallies[item] += 1
        else:
            tallies[item] = 1.0  # float tally, first occurrence
    return {item: tally / size for item, tally in tallies.items()}

In [10]:
# convert a dictionary of counts, e.g. {cat: 2, dog: 1, mouse: 1},
# into a dictionary of probabilities {cat: 0.5, dog: 0.25, mouse: 0.25}

def list2prob_dict(ts):
    """Return a new dict mapping each key of `ts` to its share of the total count.

    `ts` maps tokens to counts; despite the name it is a dict, not a list.
    """
    total = sum(ts.values())
    return {token: count / total for token, count in ts.items()}

In [11]:
# Replace each first word's dictionary of counts with a dictionary of probabilities.
for t_1 in list(first_order):
    first_order[t_1] = list2prob_dict(first_order[t_1])

In [12]:
# Same normalization for every (word, word) context of the second-order table.
second_order.update(
    {context: list2prob_dict(counts) for context, counts in second_order.items()}
)

In [13]:
def sample_word(d):
    """Draw one key from `d` with probability proportional to its value.

    Parameters
    ----------
    d : dict
        Maps each candidate token to its probability; the values are
        expected to sum to (approximately) 1.

    Returns
    -------
    The sampled key. Keys are visited in the dict's iteration order, so for
    a fixed RNG state the result depends on that order.

    Raises
    ------
    ValueError
        If `d` is empty.
    """
    if not d:
        raise ValueError("cannot sample from an empty distribution")

    # A single uniform draw in [0, 1), located on the cumulative distribution.
    p0 = np.random.random()
    cumulative = 0

    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t

    # Floating-point rounding can leave the running sum a hair below 1, so a
    # draw very close to 1 may fall past the final bucket. Assign it to the
    # last token instead of failing.
    return t

In [14]:
def generate(n_lines=4):
    """Sample and print `n_lines` lines of text from the fitted Markov model.

    Each line starts with a word drawn from `initial`, takes its second word
    from `first_order`, and is then extended with `second_order` transitions
    until the 'END' token is drawn.

    Parameters
    ----------
    n_lines : int, optional
        Number of lines to print (default 4).
    """
    for _ in range(n_lines):
        # initial word
        w0 = sample_word(initial)
        sentence = [w0]

        # A first word with no first-order entry has no recorded continuation
        # (this happens when it only ever started one-word lines), so the
        # generated line ends right after it instead of raising KeyError.
        second_word_dist = first_order.get(w0)
        if second_word_dist is not None:
            # sample second word
            w1 = sample_word(second_word_dist)
            sentence.append(w1)

            # second-order transitions until END
            while True:
                w2 = sample_word(second_order[(w0, w1)])
                if w2 == 'END':
                    break
                sentence.append(w2)
                w0, w1 = w1, w2

        print(' '.join(sentence))

In [15]:
# Sample four lines from the fitted model and print them.
generate()

i went to bed alone and left me
might just as empty
but it isnt as if and thats not all the money goes so fast
you couldnt call it living for it aint


In [16]:
# Exercise 2:
# We can skip the step where we accumulate all the possible next words in a list
# E.g. [cat, cat, dog, dog, dog, ...]
#
# Instead, like we do with the initial state distribution, create the dictionary
# of counts directly as you loop through the data.
#
# You'll no longer need list2pdict()
# Done