#  Preprocessing

In [44]:
import numpy as np
import matplotlib.pyplot as plt
import random

import nltk
import os
import string
from HMM import supervised_HMM, unsupervised_HMM, HiddenMarkovModel
import re # regular expression

import keras.preprocessing.text

In [46]:
def parse_map(lines):
    """Encode tokenised lines as sequences of integer ids.

    Every word has its non-word characters removed and is lower-cased before
    being looked up. Ids are assigned in order of first appearance, starting
    at 0.

    lines: iterable of lines, each an iterable of word strings.

    Returns (obs, obs_map): ``obs`` holds one list of ids per input line and
    ``obs_map`` maps each cleaned word to its id.
    """
    non_word = re.compile(r'[^\w]')
    obs_map = {}
    obs = []

    for tokens in lines:
        encoded = []
        for raw in tokens:
            cleaned = non_word.sub('', raw).lower()
            # setdefault inserts only unseen words; len(obs_map) is evaluated
            # before the insert, so it is exactly the next free id.
            encoded.append(obs_map.setdefault(cleaned, len(obs_map)))
        obs.append(encoded)

    return obs, obs_map

In [47]:
def unsupervised_learning(lines, n_states, n_iters):
    '''
    Train an HMM on tokenised lines without hidden-state labels.

    lines: Iterable of lines, each an iterable of word strings.
    n_states: Number of hidden states our HMM should have.
    n_iters: Number of iterations we should go through.

    Returns a tuple (model, obs, obs_map): the object returned by
    unsupervised_HMM, followed by the integer-encoded sequences and the
    word -> id map produced by parse_map.
    '''
    # Encode the words as integer ids, then train on the encoded sequences.
    obs, obs_map = parse_map(lines)
    leHMM = unsupervised_HMM(obs, n_states, n_iters)
    return leHMM, obs, obs_map

In [48]:
def load_poems(files):
    """Read poem files into cleaned, tokenised lines plus word counts.

    For every line of every file: surrounding whitespace is stripped, lines
    shorter than 10 characters are skipped as too short to be a valid line,
    ASCII punctuation is removed, and the text is lower-cased and split on
    whitespace.

    files: iterable of paths to plain-text files.

    Returns (lines, words): ``lines`` is a list of token lists, one per kept
    line in reading order; ``words`` maps each token to the number of times
    it occurs across all files.
    """
    # A translation table deletes all punctuation characters in one pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    lines = []  # list of token lists, one per kept line
    words = {}  # token -> frequency

    for path in files:
        # The context manager closes the handle even if reading fails.
        with open(path, 'r') as poem_file:
            for raw_line in poem_file:
                text = raw_line.strip()
                if len(text) < 10:
                    # Too short to be a valid line
                    continue

                tokens = text.translate(strip_punctuation).lower().split()
                lines.append(tokens)

                for token in tokens:
                    words[token] = words.get(token, 0) + 1

    return lines, words

In [49]:
def save_HMM(hmmmmmm, filename):
    """Write an HMM's parameters to ``filename + ".txt"``.

    File layout: the first line is ``hmmmmmm.L``, the second is
    ``hmmmmmm.D``, then one comma-separated line per row of ``hmmmmmm.A``
    followed by one comma-separated line per row of ``hmmmmmm.O``.
    """
    def as_csv_line(row):
        # str() of each entry, comma-separated, newline-terminated.
        return ",".join(str(value) for value in row) + "\n"

    with open(filename + ".txt", "w+") as out:
        out.write(str(hmmmmmm.L) + "\n")
        out.write(str(hmmmmmm.D) + "\n")
        out.writelines(map(as_csv_line, hmmmmmm.A))
        out.writelines(map(as_csv_line, hmmmmmm.O))
        

def read_HMM(filename):
    """Load HMM parameters from ``filename + ".txt"`` and build a model.

    Expected layout (the one save_HMM writes): an integer L, an integer D,
    then L comma-separated rows of floats for A followed by L comma-separated
    rows of floats for O. D is parsed but not otherwise used here.

    Returns ``HiddenMarkovModel(A, O)``.
    """
    with open(filename + ".txt", "r") as src:
        n_rows = int(src.readline())
        int(src.readline())  # D: the line must be consumed; value is unused

        def next_row():
            return [float(cell) for cell in src.readline().split(",")]

        A = [next_row() for _ in range(n_rows)]
        O = [next_row() for _ in range(n_rows)]
    return HiddenMarkovModel(A, O)

In [50]:
# Corpus files used for training.
file, file2 = "data/shakespeare.txt", "data/spenser.txt"
lines, words = load_poems([file, file2])

# NOTE: despite its name, HMM holds the whole (model, obs, obs_map) tuple
# returned by unsupervised_learning; the trained model itself is HMM[0].
HMM = unsupervised_learning(lines, n_states=12, n_iters=40)

merp 0
merp 1
merp 2
merp 3
merp 4
merp 5
merp 6
merp 7
merp 8
merp 9
merp 10
merp 11
merp 12
merp 13
merp 14
merp 15
merp 16
merp 17
merp 18
merp 19
merp 20
merp 21
merp 22
merp 23
merp 24
merp 25
merp 26
merp 27
merp 28
merp 29
merp 30
merp 31
merp 32
merp 33
merp 34
merp 35
merp 36
merp 37
merp 38
merp 39


In [52]:
def load_stress(all_words):
    """Look up a per-syllable stress pattern for each word in CMUdict.

    Only the first pronunciation listed for a word is used. Each phoneme that
    carries a stress digit contributes one entry to the pattern: ``0`` for an
    unstressed vowel and ``1`` for a stressed one. Secondary stress (digit
    ``2``) is counted as stressed so that the pattern has one entry per
    syllable; dropping those phonemes would make the pattern shorter than the
    word's syllable count.

    all_words: iterable of words, looked up exactly as given.

    Returns (stress_dict, unclear): ``stress_dict`` maps each word found in
    CMUdict to its list of 0/1 stress values; ``unclear`` lists the words
    that were not found, in input order.
    """
    pronunciations = nltk.corpus.cmudict.dict()
    stress_dict = {}
    unclear = []

    for word in all_words:
        if word not in pronunciations:
            unclear.append(word)
            continue

        # Just use the first pronunciation from the list.
        phonemes = pronunciations[word][0]
        stress_dict[word] = [
            0 if '0' in phon else 1
            for phon in phonemes
            # Phonemes without a stress digit are skipped.
            if any(mark in phon for mark in '012')
        ]

    return stress_dict, unclear

In [53]:
# Vocabulary: every distinct token counted in `words`.
all_words = list(words)
# for the stress, just use the first thing from the array;
# words that have no entry end up in `unclear`.
stress_dict, unclear = load_stress(all_words)

In [54]:
def obs_map_reverser(obs_map):
    """Return the inverse of ``obs_map``: a dict from each value to its key.

    If several keys share the same value, the key iterated last wins.
    """
    return {index: word for word, index in obs_map.items()}

# get syllable info from syllable_dictionary.txt
def load_syllables(filename):
    """Parse a whitespace-separated syllable dictionary file.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are stored, as unparsed strings, as its entry.
    Blank lines are skipped instead of raising IndexError.

    filename: path to the dictionary file.

    Returns a dict mapping word -> list of the remaining fields on its line.
    If a word appears on several lines, the last one wins.
    """
    syllable = {}
    # The context manager guarantees the handle is closed.
    with open(filename, 'r') as dict_file:
        for raw_line in dict_file:
            fields = raw_line.split()
            if not fields:
                # Blank or whitespace-only line: nothing to record.
                continue
            word, *rest = fields
            syllable[word] = rest
    return syllable

In [57]:
# Re-encode the corpus to get the word -> id map, then invert it so that
# generated ids can be turned back into words.
obs, obs_map = parse_map(lines)
obs_map_r = obs_map_reverser(obs_map)

filename = "data/Syllable_dictionary.txt"
syllable = load_syllables(filename)

for i in range(14):  # each poem is 14 lines long
    # HMM[0] is the trained model; each line is 10 syllables long
    emission = HMM[0].generate_emission_syllables_correct(10, obs_map_r, syllable)
    # emission[0] is the sequence of ids that gets mapped back to words.
    sentence = [obs_map_r[word_id] for word_id in emission[0]]

    print(' '.join(sentence).capitalize())

Rare are too mine on tongue and love thee we
The gilded thee in alike so a name
There may as smother the remedy joy
Though wait with her by love favour to for
Costly didst another clouds depart true
Thy love supposed am like me writ thy a
Me from shall he best like that due not which
And then travels spoils was out solemn which
And from a name grievances his heart show
May sits in the lack is countenance thy
Be so in to up now this my love fly
Never was needs love for sang thereof blood
As any earth her with full arts so proud
O goodly do grace such from men burn thus
