In [1]:
import HMM
import itertools

In [2]:
with open("./data/shakespeare.txt", "r") as f:
    data = f.read()
    
# Split by poems
poems = data.split("\n\n\n") 
# Split poem by line, remove 1st line
poems = [poem.split("\n")[1:] for poem in poems] 
# Remove trailing/leading spaces for certain lines (last 2 lines)
poems = [[line.strip() for line in poem] for poem in poems] 
# Split each line into a list of words
poems = [[line.split(" ") for line in poem] for poem in poems] 
# Strip punctuation : Optional
poems_by_lines = [[[word.strip(",.:;?!()").lower() for word in line] for line in poem ] for poem in poems] 
# Combine all the lines in a single poem so that each pome is just a list of words
poems_by_words = [list(itertools.chain.from_iterable(poem)) for poem in poems_by_lines] 

In [3]:
# Create the dictionary of words. We associate each word with a unique index
# and the inverse dictionary associates the index with the word
with open("./data/Syllable_dictionary.txt", "r") as f:
    data = f.readlines()
    
words = [word.strip() for word in data]
words = [word.split(" ")[0] for word in words]
words_dict = dict(zip(words, range(len(words))))
inverse_words_dict = dict(zip(range(len(words)), words))

In [4]:
# Convert each word into an index by searching in the dictionary
# We need the function because sometimes there are words that have ' at the start (as part of the word itself)
# but sometimes it's just used as a normal quotation mark. So we need to split into 2 cases.
def word_idx(word):
    try:
        return words_dict[word.lower()]
    except KeyError:
        return words_dict[word.lower().strip(",.:;?!()'")]

poems_idx = [[word_idx(word) for word in poem] for poem in poems_by_words]

In [5]:
# Create the dictionary of words. We associate each word with a unique index
# and the inverse dictionary associates the index with the word

with open("./data/Syllable_dictionary.txt", "r") as f:
    data = f.readlines()
    
lines = [word.strip() for word in data]

words_list = []
syllables_list = []
end_syllables_list = []

for line in lines:
    word_syllables_list = []
    word_end_syllables_list = []
    
    # Split into the word itself and everything else
    word = line.split(" ")[0]
    syllables = line.split(" ")[1:]
    # Iterate over everything else
    for syllable in syllables:
        # Check that it does not represent an end syllable
        if syllable[0] != "E":
            # Add it to the list syllable count for the current word
            word_syllables_list.append(int(syllable))
        else:
            # Slice off the "E" and directly append to the curent word's end_syllables_list
            word_end_syllables_list.append(int(syllable[1:]))
            
    words_list.append(word)
    syllables_list.append(tuple(word_syllables_list))
    end_syllables_list.append(tuple(word_end_syllables_list))
    
words_dict = dict(zip(words_list, range(len(words_list))))
inverse_words_dict = dict(zip(range(len(words_list)), words_list))
syllables_dict = dict(zip(words_list, syllables_list))
end_syllables_dict = dict(zip(words_list, end_syllables_list))

# Convert each word into an index by searching in the dictionary
# We need the function because sometimes there are words that have ' at the start (as part of the word itself)
# but sometimes it's just used as a normal quotation mark. So we need to split into 2 cases.
def word_idx(word):
    try:
        return words_dict[word.lower()]
    except KeyError:
        return words_dict[word.lower().strip(",.:;?!()'")]

In [56]:
shake = HMM.unsupervised_HMM(poems_idx, n_states=10, n_words=len(words_list), N_iters=200)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200


In [65]:
emission_idx = shake.generate_emission(140)
# print(emission_idx)
emission_words = [inverse_words_dict[idx] for idx in emission_idx[0]]
print(" ".join(emission_words))



In [66]:
# Split each line into syllables
sonnet = []
word_idx = 0

for line_number in range(14):
    prev_idx = word_idx
    exactly_10_syllables = False
    
    # Create an empty line
    line = []
    syllable_count = set([0])
    end_syllable_count = set([0])

    while True:
        word_idx += 1
        prev_best = min([abs(x-10) for x in syllable_count])

        # Add syllable counts for this current word 
        syllable_count_new = {count+syl for count in syllable_count for syl in syllables_dict[emission_words[word_idx]]}
        # Add syllable counts for this current words's end-of-line syllables
        syllable_count_end = {count+syl for count in syllable_count for syl in end_syllables_dict[emission_words[word_idx]]}
        syllable_count_end = syllable_count_end.union(syllable_count_new)
        # Update the syllable count set
        syllable_count = syllable_count_new

        if (10 in syllable_count) or (10 in end_syllable_count):
#             print(syllable_count)
#             print('yes')
            break

        curr_best = min([abs(x-10) for x in syllable_count.union(end_syllable_count)])                
        if prev_best <= curr_best:
            word_idx -= 1
#             print(syllable_count)
#             print('maybe')
            break
        else:
            pass
    # Once we are out of that while loop, we have exactly 10 syllables        
    line = emission_words[prev_idx+1: word_idx+1]
    sonnet.append(line)

# print(sonnet)
# Print the sonnet itself
for idx, line in enumerate(sonnet):
    # Upper case "I" and "O" and "I'll"
    line = [word[0].upper()+word[1:] if word in ("i", "o", "i'll") else word for word in line]
    # Upper case the 1st word of each line
    line[0] = line[0][0].upper() + line[0][1:]
    
    # Add comma to each line except the last
    if idx == len(sonnet)-1:
        line[-1] += "."
    else:
        line[-1] += ","
    print(" ".join(line))

My grace this their hied her a when had I,
Self thy do me under I frailties,
Second hath sweet whether that for pleasure,
Thou or as time read wish such blooms winter's,
Altered case thence mine beauty's wind gracious,
Evermore whit kind your speak yet make far,
My swerving I dost pent in song quicker,
Whose widow of prisoner bare an pen,
Beauties me instinct eyes seen yours from a,
Comfort mine surety-like dwell bequest two,
Willing hungry up that were all you from,
Her he consum'st all something me better,
Of thou or thy leaves which and much world should,
Bars am thy scythe to I often most.


In [9]:
# word_pairs = []
# rhyme_pairs = []
# for poem in poems_by_lines:
#     word_pairs.append((poem[0][-1], poem[1][-1]))
#     word_pairs.append((poem[2][-1], poem[3][-1]))
#     word_pairs.append((poem[4][-1], poem[5][-1]))
#     word_pairs.append((poem[6][-1], poem[7][-1]))
#     word_pairs.append((poem[8][-1], poem[9][-1]))
#     word_pairs.append((poem[10][-1], poem[11][-1]))
    
# for poem in poems_by_lines:
#     rhyme_pairs.append((poem[0][-1], poem[2][-1]))
#     rhyme_pairs.append((poem[1][-1], poem[3][-1]))
#     rhyme_pairs.append((poem[4][-1], poem[6][-1]))
#     rhyme_pairs.append((poem[5][-1], poem[7][-1]))
#     rhyme_pairs.append((poem[8][-1], poem[10][-1]))
#     rhyme_pairs.append((poem[9][-1], poem[11][-1]))
#     try:
#         rhyme_pairs.append((poem[12][-1], poem[13][-1]))
#     except IndexError:
#         pass
    
# print("Number of word pairs (excluding last pair): ", len(word_pairs)) 
# print("Number of unique word pairs (excluding last pair): ", len(set(word_pairs)))
# print("Number of rhyme pairs: ", len(rhyme_pairs))
# print("Number of rhyme pairs: ", len(set(rhyme_pairs)))

Number of word pairs (excluding last pair):  924
Number of unique word pairs (excluding last pair):  921
Number of rhyme pairs:  1077
Number of rhyme pairs:  864
