In [111]:
import nltk
from nltk.corpus import cmudict
import itertools
from hmmlearn import hmm
import pickle
import json

Load the sonnets from the file provided. `sonnets` is indexed by sonnet number (0-indexed, so shifted -1 from sonnet number), then by line number (again, 0-indexed).

`sonnets[0][3]` for example returns the 4th line of the first sonnet.

In [2]:
# Read the corpus. `strip('\n ,.:')` removes newlines, spaces and the
# characters , . : from BOTH ends of every line, so heading lines reduce to
# their bare number and blank lines reduce to ''.
with open('data/shakespeare.txt') as f:
    lines = [line.strip('\n ,.:') for line in f]

# Split `lines` into sonnets (each sonnet is a list of its text lines).
#   ln_start: index of the first text line of the sonnet being read
#   ln_end:   index of the most recent text line seen
sonnets = []
ln_start = 0
ln_end = 0
for ln, content in enumerate(lines):
    if content.isdigit():
        # A purely numeric line is a sonnet heading; the text starts after it.
        ln_start = ln + 1
    elif not content:
        # Only a blank line that directly follows a text line closes a sonnet;
        # additional consecutive blank lines are ignored.
        if ln - 1 == ln_end:
            sonnets.append(lines[ln_start:ln_end + 1])
    else:
        ln_end = ln
        # The file may end on a text line with no trailing blank line. Close
        # the last sonnet here, AFTER advancing ln_end, so that its final line
        # is part of the slice (previously that line was silently dropped).
        if ln + 1 == len(lines):
            sonnets.append(lines[ln_start:ln_end + 1])

In [4]:
# Flatten the corpus into one list of whitespace-separated tokens, in reading
# order (sonnet by sonnet, line by line).
[token for sonnet in sonnets for verse in sonnet for token in verse.split()]

['From',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase',
 'That',
 'thereby',
 "beauty's",
 'rose',
 'might',
 'never',
 'die',
 'But',
 'as',
 'the',
 'riper',
 'should',
 'by',
 'time',
 'decease',
 'His',
 'tender',
 'heir',
 'might',
 'bear',
 'his',
 'memory',
 'But',
 'thou',
 'contracted',
 'to',
 'thine',
 'own',
 'bright',
 'eyes',
 "Feed'st",
 'thy',
 "light's",
 'flame',
 'with',
 'self-substantial',
 'fuel',
 'Making',
 'a',
 'famine',
 'where',
 'abundance',
 'lies',
 'Thy',
 'self',
 'thy',
 'foe,',
 'to',
 'thy',
 'sweet',
 'self',
 'too',
 'cruel',
 'Thou',
 'that',
 'art',
 'now',
 'the',
 "world's",
 'fresh',
 'ornament',
 'And',
 'only',
 'herald',
 'to',
 'the',
 'gaudy',
 'spring',
 'Within',
 'thine',
 'own',
 'bud',
 'buriest',
 'thy',
 'content',
 'And',
 'tender',
 'churl',
 "mak'st",
 'waste',
 'in',
 'niggarding',
 'Pity',
 'the',
 'world,',
 'or',
 'else',
 'this',
 'glutton',
 'be',
 'To',
 'eat',
 'the',
 "world's",
 'due,',
 'by',
 'the',
 'grave

Punctuation signs that appear at the end of the lines in the sonnets.

In [52]:
# Seed list of punctuation marks, followed by the final character of every
# non-empty line whose last character is not alphanumeric. Repeats are kept:
# one entry is appended per matching line.
end_toks = ['!', "'", ')', ',', '.', ':', ';', '?']
end_toks.extend(line[-1] for line in lines if line and not line[-1].isalnum())

In [97]:
from collections import defaultdict
# Lookup tables filled from the syllable dictionary file. Every line of the
# file is split on single spaces: the first token is the word, the remaining
# tokens are its syllable entries. Words are identified by their line index.
words = []                                 # words[i] -> first token of line i
syllable_dict = {}                         # i -> list of raw syllable tokens (strings)
rev_syllable_dict = defaultdict(list)      # int(token) -> indices, for tokens without 'E'
rev_end_syllable_dict = defaultdict(list)  # int(token[1:]) -> indices, for 'E'-prefixed tokens

with open('data/Syllable_dictionary.txt') as f:
    for i, line in enumerate(f):
        word, *counts = line.strip().split(' ')
        words.append(word)
        syllable_dict[i] = counts
        for syl in counts:
            # NOTE(review): an 'E' prefix presumably marks the syllable count
            # that applies when the word ends a line -- confirm against the
            # data file's documentation.
            has_end_marker = syl[0] == 'E'
            index_by_count = rev_end_syllable_dict if has_end_marker else rev_syllable_dict
            index_by_count[int(syl[1:] if has_end_marker else syl)].append(i)

Preprocessing step: ignoring punctuation signs

In [99]:
def strip_word(word, words):
    """Return ``word`` with as little punctuation removed as needed to match ``words``.

    The token is tried in three forms, in order:

    1. unchanged;
    2. with trailing ``!'(),.:;?`` characters removed;
    3. with those characters removed from both ends.

    The first form found in ``words`` is returned. If none matches, the
    fully stripped form (3) is returned anyway.

    Args:
        word: Raw token taken from a sonnet line.
        words: Container of known words, queried with ``in``.

    Returns:
        The selected form of ``word`` as a string.
    """
    punctuation = "!'(),.:;?"
    if word in words:
        return word
    without_trailing = word.rstrip(punctuation)
    if without_trailing in words:
        return without_trailing
    # Last resort: also drop leading punctuation, whether or not it matches.
    return without_trailing.strip(punctuation)

Creating pairs of rhymes from the sonnets, treating the quatrains and couplets separately.

In [105]:
def get_rhymes_pairs(sonnets, words):
    """Collect pairs of line-ending words from the quatrains and couplets of sonnets.

    The first twelve lines of a sonnet are read as three groups of four
    lines. Within each group the end words of lines 1 and 3 form one pair
    and the end words of lines 2 and 4 form another. Lines 13 and 14 form the
    couplet pair.

    Irregular sonnets are recognised by their length rather than by
    hard-coded corpus positions:

    * a group of four is skipped when the sonnet does not contain all four
      of its lines;
    * the couplet pair is produced only for sonnets of exactly 14 lines.

    NOTE(review): a sonnet that is longer than 14 lines still contributes
    pairs from its first twelve lines, which may not actually rhyme if its
    scheme differs -- confirm whether such sonnets should be excluded.

    Args:
        sonnets: Sequence of sonnets, each a sequence of text lines (strings
            containing at least one whitespace-separated token).
        words: Known words, forwarded to ``strip_word`` to clean each end word.

    Returns:
        A tuple ``(quatrains_rhymes, couplets_rhymes)``; each element is a
        list of 2-tuples of words, ordered by sonnet.
    """
    lines_per_quatrain = 4
    num_quatrains = 3
    quatrain_lines = num_quatrains * lines_per_quatrain
    regular_length = quatrain_lines + 2

    def end_word(line):
        # Last whitespace-separated token of the line, cleaned of punctuation.
        return strip_word(line.split()[-1], words)

    quatrains_rhymes = []
    couplets_rhymes = []
    for sonnet in sonnets:
        for start in range(0, quatrain_lines, lines_per_quatrain):
            if start + lines_per_quatrain > len(sonnet):
                break  # incomplete group: nothing left to pair up
            quatrains_rhymes.append((end_word(sonnet[start]), end_word(sonnet[start + 2])))
            quatrains_rhymes.append((end_word(sonnet[start + 1]), end_word(sonnet[start + 3])))
        if len(sonnet) == regular_length:
            couplets_rhymes.append((end_word(sonnet[quatrain_lines]),
                                    end_word(sonnet[quatrain_lines + 1])))
    return quatrains_rhymes, couplets_rhymes

In [106]:
# Extract the rhyme pairs from the loaded sonnets; quatrain pairs and couplet
# pairs come back as two separate lists.
quatrains_rhymes, couplets_rhymes = get_rhymes_pairs(sonnets, words)

In [114]:
# Persist both rhyme-pair lists as JSON files in the working directory
# (tuples are serialised as JSON arrays).
for filename, pairs in (("quatrains_rhymes.json", quatrains_rhymes),
                        ("couplets_rhymes.json", couplets_rhymes)):
    with open(filename, "w") as f:
        json.dump(pairs, f)