In [1]:
import nltk
import string
from HMM import supervised_HMM, unsupervised_HMM, HiddenMarkovModel
import re
import numpy as np

In [2]:
def load_poems(filename):
    """Read a poem file and tokenize each verse line.

    Every line is stripped of surrounding whitespace; lines shorter than
    10 characters after stripping are skipped as too short to be a verse
    line. Remaining lines have all ``string.punctuation`` characters
    removed, are lower-cased and split on whitespace.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(lines, words)`` where ``lines`` is a list of token
        lists (one per kept line) and ``words`` maps each token to the
        number of times it occurs across all kept lines.
    """
    lines = []  # list of token lists, one per cleaned line
    words = {}  # token -> frequency

    # The context manager guarantees the handle is closed, even if
    # parsing raises part-way through the file.
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if len(line) < 10:
                # Too short to be a valid line
                continue
            line = "".join(ch for ch in line if ch not in string.punctuation)
            tokens = line.lower().split()

            lines.append(tokens)

            for word in tokens:
                words[word] = words.get(word, 0) + 1
    return lines, words

In [3]:
# Parse the sonnet corpus: `lines` is the list of token lists and `words`
# the token -> frequency dict returned by load_poems.
file = "data/shakespeare.txt"
lines, words = load_poems(file)

In [4]:
# Sanity check: display the first tokenized line.
lines[0]

['from', 'fairest', 'creatures', 'we', 'desire', 'increase']

In [5]:
def unsupervised_learning(lines, n_states, n_iters):
    '''
    Encode the tokenized lines and train an HMM on them.

    lines: List of token lists, as produced by load_poems.
    n_iters: Number of iterations we should go through.
    n_states: Number of hidden states our HMM should have.

    Returns a tuple (model, obs, obs_map): the object returned by
    unsupervised_HMM, the integer-encoded lines, and the word -> id map
    produced by parse_observations.
    '''
    # Encode words as integer ids; the HMM is trained on these sequences.
    obs, obs_map = parse_observations(lines)
    # Train the HMM.
    leHMM = unsupervised_HMM(obs, n_states, n_iters)
    return leHMM, obs, obs_map
    

In [6]:
def obs_map_reverser(obs_map):
    """Invert a word -> id mapping into an id -> word mapping.

    If several keys share the same value, the key that appears last in
    ``obs_map`` wins.
    """
    return {index: word for word, index in obs_map.items()}

In [7]:
def parse_observations(lines):
    """Encode tokenized lines as sequences of integer ids.

    Each word has every non-word character removed and is lower-cased;
    the first time a cleaned word is seen it receives the next unused id
    (0, 1, 2, ...).

    Returns:
        (obs, obs_map): ``obs`` is a list with one list of ids per input
        line; ``obs_map`` maps each cleaned word to its id.
    """
    vocabulary = {}
    encoded = []

    for tokens in lines:
        sequence = []
        for raw in tokens:
            token = re.sub(r'[^\w]', '', raw).lower()
            # The default is only stored for unseen tokens, and the
            # current vocabulary size is exactly the next free id.
            sequence.append(vocabulary.setdefault(token, len(vocabulary)))
        encoded.append(sequence)

    return encoded, vocabulary

In [8]:
# Integer-encode the corpus; obs_map (word -> id) is reused below for decoding.
obs, obs_map = parse_observations(lines)

In [9]:
# get syllable info from syllable_dictionary.txt
def load_syllables(filename):
    """Load a whitespace-separated syllable dictionary file.

    Each non-blank line is split on whitespace: the first field is used
    as the key and the remaining fields (kept as strings) as the value.
    Blank lines are skipped. If a word appears more than once, the last
    occurrence wins.

    Args:
        filename: Path of the dictionary file.

    Returns:
        dict mapping each word to the list of remaining fields on its
        line (presumably its possible syllable counts; confirm against
        the data file).
    """
    syllable = {}
    # Context manager so the handle is closed even if parsing fails.
    with open(filename, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line: there is no word to index.
                continue
            syllable[fields[0]] = fields[1:]
    return syllable

In [10]:
# Build the word -> syllable-fields lookup used by the syllable-aware generators.
filename = "data/Syllable_dictionary.txt"
syllable = load_syllables(filename)

In [11]:
def save_HMM(hmmmmmm, filename):
    """Write a model's parameters to ``filename + ".txt"``.

    File layout: ``L`` on the first line, ``D`` on the second, then one
    comma-separated line per row of ``A`` followed by one per row of
    ``O``. Values are written with ``str``.
    """
    with open(filename+".txt", "w+") as out:
        out.write(str(hmmmmmm.L)+"\n")
        out.write(str(hmmmmmm.D)+"\n")
        # Same serialisation for both matrices, A first and then O.
        for attribute in ("A", "O"):
            for row in getattr(hmmmmmm, attribute):
                out.write(",".join(str(value) for value in row)+"\n")
        

def read_HMM(filename):
    """Rebuild a HiddenMarkovModel from a file written by save_HMM.

    Reads ``filename + ".txt"``: the first line is L, the second is D,
    followed by L comma-separated rows for A and L rows for O.

    Returns:
        ``HiddenMarkovModel(A, O)`` with both matrices as lists of float lists.
    """
    def _next_row(handle):
        # One comma-separated line -> list of floats.
        return [float(x) for x in handle.readline().split(",")]

    with open(filename+".txt", "r") as filept:
        n_states = int(filept.readline())
        # The D line is parsed (so a malformed header still raises) but its
        # value is not needed to read the rows.
        int(filept.readline())
        transitions = [_next_row(filept) for _ in range(n_states)]
        emissions = [_next_row(filept) for _ in range(n_states)]
    return HiddenMarkovModel(transitions, emissions)

In [12]:
# Load previously saved model parameters from "40-iter-8-hidden-hmm.txt"
# (read_HMM appends the ".txt" extension).
testHMM40 = read_HMM("40-iter-8-hidden-hmm")

In [13]:
# id -> word lookup, used to turn generated observation ids back into text.
obs_map_r = obs_map_reverser(obs_map)

In [14]:
# Baseline generation: 14 independent lines from the plain emission sampler.
for i in range(14): # each poem is 14 lines long
    emission = testHMM40.generate_emission(10) # each line is 10 words long
    sentence = [obs_map_r[i] for i in emission[0]] # emission[0] is decoded as observation ids; this `i` is local to the comprehension

    print(' '.join(sentence).capitalize())

Summers as be oer love thine not lilys doth lie
Better composition prove write self have best in potions into
Light and on thee longing live will physic good works
Centre if thee sepulchres and truths make i for now
Oer see and blunter give of thine correct to eager
So was eithers thine youth most a like can wrong
Thy beguiled show be than still thee makes all where
I eye i durst thing this in perfect others calls
Not times all on delight lack to out the hate
Tomb and to the summers who strong and to on
Intents may this greater but time bars seeting by by
Mens most in is sense defendant more as flesh do
Good were illused how like go is key true foot
Discased still where every the distance invoked silent shake loves


In [15]:
# Syllable-aware generation using the syllable dictionary loaded above.
for i in range(14): # each poem is 14 lines long
    emission = testHMM40.generate_emission_syllables(10, obs_map_r, syllable) # NOTE(review): 10 is presumably a per-line syllable target, not a word count (recorded lines have fewer than 10 words) - confirm in HMM
    sentence = [obs_map_r[i] for i in emission[0]] # emission[0] is decoded as observation ids; this `i` is local to the comprehension

    print(' '.join(sentence).capitalize())

Belied so by after the and more lords
Be please fair knew worst thou show is to with
Worthy of then fearing these thy verse muse
That heaven of kindness you of breast thy have
But authority this man i part so
Hath pluck let but dost a is to and and
World friends word continual tyrant that hide
Heart shallowest so and the by watchman so
The a say so at pride or policy
When nor me i for by they my my he
Writ for burn of thy have with make of thy
War remover to my time show and your
Than spoil so of advantage every but
Must thy you palate she thy yet you she


In [16]:
# Alternative syllable-aware sampler (generate_emission_syllables_other).
for i in range(14): # each poem is 14 lines long
    emission = testHMM40.generate_emission_syllables_other(10, obs_map_r, syllable) # NOTE(review): 10 is presumably a per-line syllable target, not a word count (recorded lines have fewer than 10 words) - confirm in HMM
    sentence = [obs_map_r[i] for i in emission[0]] # emission[0] is decoded as observation ids; this `i` is local to the comprehension

    print(' '.join(sentence).capitalize())

These and wit speed love so they ride due praise
Do the world not one a in lovers is
My love and i use through report my it
These thee her this abundance as and large
Action thee love vows rhymers long worth can
Thee woos down speak evermore sauces went
So found i sun crossed is of as odours
Folly away mine their thy heart thy all
Merchandized call any saw issue chide
Stay bosoms bright take is changing judgement
Another figure my in thy at to
Or where sinful these unlearned rest on
The upon my catch thee pity name shall
To now live taste heavy that the watchman


In [17]:
# Third syllable-aware sampler (generate_emission_syllables_correct).
for i in range(14): # each poem is 14 lines long
    emission = testHMM40.generate_emission_syllables_correct(10, obs_map_r, syllable) # NOTE(review): 10 is presumably a per-line syllable target, not a word count (recorded lines have fewer than 10 words) - confirm in HMM
    sentence = [obs_map_r[i] for i in emission[0]] # emission[0] is decoded as observation ids; this `i` is local to the comprehension

    print(' '.join(sentence).capitalize())

Those in i those are hindmost on stay seem
Stores my blame thy bloody thy is my it
To they is stones did him is give of flower
Self registers missed rhyme my birth as when
Hate your vassal doth siren thy thine earth
Art where rest your wound my home better of
Feathered them a like truest power their are
At thy thou and the mountain palate thy
On but unless that gives reeleth quite thy
To my impute towers for i appetite
O hours so then idle feeds way and bare
Boast and do me old tired the low so east
Beated woeful breast and thine her of thy
Graces and i should then those you shine me


In [18]:
def script():
    """Collect rhyming line-ending words from the sonnets and build a rhyme lookup.

    Reads "data/shakespeare.txt". Lines shorter than 10 characters after
    stripping act as separators between sonnets. Within a sonnet, the
    last word of each of the first 14 lines is assigned to one of seven
    rhyme groups following the pattern abab cdcd efef gg. Sonnets whose
    0-based index is in ``throwaway`` are skipped (presumably because
    they do not follow that pattern; confirm against the data).

    Returns:
        (all_pairs, all_pairs_dict): ``all_pairs`` is the list of rhyme
        groups (lists of words) over all kept sonnets; ``all_pairs_dict``
        maps each word to the list of words it rhymes with, extended
        across groups that share a word.

    Raises:
        ValueError: if a kept sonnet yields a rhyme group that does not
            contain exactly two words.
    """
    throwaway = [98, 125, 144]
    # Rhyme group (index into `temp`) for each line position in a sonnet.
    group_of_line = [0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 6]

    sonnet_counter = 0
    position = 0  # index of the next verse line within the current sonnet
    all_pairs = []
    temp = [[] for _ in range(7)]

    # Context manager so the corpus file is always closed.
    with open("data/shakespeare.txt", 'r') as file:
        for line in file:
            line = line.strip()
            if len(line) < 10:
                # Too short to be a valid line: this ends the current sonnet,
                # if any verse lines have been collected for it.
                if position != 0:
                    if sonnet_counter not in throwaway:
                        all_pairs.extend(temp)
                    sonnet_counter += 1
                    position = 0
                    temp = [[] for _ in range(7)]
                continue
            line = "".join(ch for ch in line if ch not in string.punctuation)
            tokens = line.lower().split()

            last = tokens[-1]

            # Lines beyond the 14th belong to no rhyme group and are ignored.
            if position < len(group_of_line):
                temp[group_of_line[position]].append(last)

            position += 1
            # Note: the tokenized line is deliberately NOT appended to the
            # notebook-level `lines` list; doing so would grow the corpus
            # every time this function runs.

    # The file may end without a trailing separator line; without this
    # flush the final sonnet's rhymes would be silently dropped.
    if position != 0 and sonnet_counter not in throwaway:
        all_pairs.extend(temp)

    all_pairs_dict = {}
    for first, second in all_pairs:
        if first not in all_pairs_dict:
            all_pairs_dict[first] = [second]

        if second not in all_pairs_dict:
            all_pairs_dict[second] = [first]

        # checking all against all other pairs
        for other in all_pairs:
            # If either word is in `other`, its members rhyme with both words
            if first in other or second in other:
                for candidate in other:
                    if candidate not in all_pairs_dict[first] and candidate != first:
                        all_pairs_dict[first].append(candidate)
                    if candidate not in all_pairs_dict[second] and candidate != second:
                        all_pairs_dict[second].append(candidate)

    # Completing the graph: every word listed under a key also gets the key
    # and the key's other rhymes.
    for key, val in all_pairs_dict.items():
        for member in val:
            if key not in all_pairs_dict[member]:
                all_pairs_dict[member].append(key)
            for sibling in val:
                if sibling != member and sibling not in all_pairs_dict[member]:
                    all_pairs_dict[member].append(sibling)

    return all_pairs, all_pairs_dict

In [19]:
# Rhyme groups and the word -> rhyming-words lookup built from the corpus.
all_pairs, all_pairs_dict  = script()

In [20]:
# NOTE(review): the recorded run of this cell ended with
# "IndexError: list index out of range". The failing frame is not visible in
# this notebook - presumably inside generate_emission_rhyme; verify in HMM.
for i in range(7): # each poem is 14 lines long, with 7 rhymes
    word = np.random.choice(list(all_pairs_dict.keys()))
    # Use a dedicated name here: binding this list to `words` would overwrite
    # the word-frequency dict returned by load_poems earlier in the notebook.
    rhyme_candidates = all_pairs_dict[word]
    word2 = np.random.choice(rhyme_candidates)
    # Each of the two lines is generated around one word of the rhyming pair
    # (passed as its observation id).
    emission1 = testHMM40.generate_emission_rhyme(10, obs_map_r, obs_map[word]) # each line is 10 words long
    emission2 = testHMM40.generate_emission_rhyme(10, obs_map_r, obs_map[word2])
    sentence1 = [obs_map_r[idx] for idx in emission1[0]]
    sentence2 = [obs_map_r[idx] for idx in emission2[0]]

    print(' '.join(sentence1).capitalize())
    print(' '.join(sentence2).capitalize())

8 3175 8 8
208


IndexError: list index out of range

In [None]:
# Scratch cell: draw one random key from the rhyme lookup.
word = np.random.choice(list(all_pairs_dict.keys()))

In [None]:
# Scratch cell: show the observation id assigned to that word.
obs_map[word]