In [1]:
import HMM
import itertools

In [4]:
with open("./data/shakespeare.txt", "r") as f:
    data = f.read()

# Poems are separated by two blank lines. For every poem: drop its first line,
# trim leading/trailing whitespace from the remaining lines (the last two
# lines carry extra spaces), and split each line on single spaces.
poems = [
    [raw_line.strip().split(" ") for raw_line in chunk.split("\n")[1:]]
    for chunk in data.split("\n\n\n")
]

# Optional clean-up: trim surrounding punctuation from each word and lower-case it.
# Shape: poem -> line -> word.
poems_by_lines = [
    [[token.strip(",.:;?!()").lower() for token in tokens] for tokens in sonnet]
    for sonnet in poems
]

# Flatten the lines of each poem so that every poem is one flat list of words.
poems_by_words = [list(itertools.chain(*sonnet)) for sonnet in poems_by_lines]

In [5]:
# Build the vocabulary from the syllable dictionary file: the first
# space-separated field of every line is taken as the word.
# `words_dict` maps word -> unique index; `inverse_words_dict` maps index -> word.
with open("./data/Syllable_dictionary.txt", "r") as f:
    data = f.readlines()

words = [entry.strip().split(" ")[0] for entry in data]
words_dict = {vocab_word: position for position, vocab_word in enumerate(words)}
inverse_words_dict = dict(enumerate(words))

In [6]:
def word_idx(word):
    """Look up the dictionary index of `word` (case-insensitive).

    An apostrophe at the edge of a token is sometimes part of the word itself
    and sometimes just a quotation mark, so two cases are handled: the
    lower-cased token is looked up as-is first, and only if that fails is it
    looked up again with surrounding punctuation and apostrophes removed.

    Raises:
        KeyError: if neither form is present in `words_dict`.
    """
    lowered = word.lower()
    if lowered in words_dict:
        return words_dict[lowered]
    return words_dict[lowered.strip(",.:;?!()'")]

# Encode every poem as the list of dictionary indices of its words.
poems_idx = [list(map(word_idx, poem_words)) for poem_words in poems_by_words]

In [7]:
# Build the model from the index-encoded poems via HMM.unsupervised_HMM.
# NOTE(review): 10 is presumably the number of hidden states and 50 the number
# of training iterations (the progress log printed by this cell ends at
# "Iteration: 50") — confirm against the HMM module.
shake = HMM.unsupervised_HMM(poems_idx, 10, 50)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50


In [8]:
# Sample from the trained model and print the result as text.
# NOTE(review): 140 is presumably the number of words to emit — confirm
# against shake.generate_emission.
emission_idx = shake.generate_emission(140)

# Element 0 of the result holds word indices; map them back to words.
emitted_indices = emission_idx[0]
emission_words = [inverse_words_dict[word_id] for word_id in emitted_indices]
print(*emission_words)

sufficed to which for dead thou there mortal their you loves thee for gentle is wail were but beauteous at heart my their just the name bring and sail with the will vanished so long holds love thee to twilight cold for foot of lifts bright curse true that thee all flowers thee then writ a envy so reasons thine your shadows as eye bore woo wear my which shall thoughts that on babe love there sweet at the for for oblivion general find the proud mistaking whence dost 'gainst be space fair i love's both then behold thou gold gentle mine of bootless must make catch physic dost is or with our charactered holds all not the watery the rose and for for clouds beauty's lest sings servant add my your mouthed without this beloved dost to breath if


In [9]:
# Collect pairs of line-ending words from every poem.
#   word_pairs  - end words of consecutive lines (1-2, 3-4, ..., 11-12);
#                 the final pair of lines is excluded.
#   rhyme_pairs - end words of the lines this analysis treats as rhyming:
#                 alternating lines inside each group of four (ABAB CDCD EFEF)
#                 plus the closing couplet (lines 13-14) when the poem has one.
# Line numbers below are 0-based indices into a poem's list of lines.
ADJACENT_LINE_PAIRS = [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11)]
QUATRAIN_RHYME_PAIRS = [(0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11)]
COUPLET_PAIR = (12, 13)


def _end_word_pair(poem, first, second):
    """Return the last words of lines `first` and `second` of `poem` as a tuple."""
    return (poem[first][-1], poem[second][-1])


word_pairs = []
rhyme_pairs = []
for poem in poems_by_lines:
    word_pairs.extend(_end_word_pair(poem, i, j) for i, j in ADJACENT_LINE_PAIRS)
    rhyme_pairs.extend(_end_word_pair(poem, i, j) for i, j in QUATRAIN_RHYME_PAIRS)
    # Some poems are shorter than 14 lines and have no closing couplet; test
    # the length explicitly instead of swallowing IndexError.
    if len(poem) > COUPLET_PAIR[1]:
        rhyme_pairs.append(_end_word_pair(poem, *COUPLET_PAIR))

print("Number of word pairs (excluding last pair): ", len(word_pairs))
print("Number of unique word pairs (excluding last pair): ", len(set(word_pairs)))
print("Number of rhyme pairs: ", len(rhyme_pairs))
# This line reports the de-duplicated count, so it is labelled "unique".
print("Number of unique rhyme pairs: ", len(set(rhyme_pairs)))

Number of word pairs (excluding last pair):  924
Number of unique word pairs (excluding last pair):  921
Number of rhyme pairs:  1077
Number of rhyme pairs:  864
