In [57]:
import numpy as np
import pandas as pd
import os
import re
import pickle

In [84]:
def split_punc(sentence):
    """Tokenize a sentence into words and standalone punctuation marks.

    A run of word characters, apostrophes and hyphens is kept together as a
    single token, so "I'm" and "st-ring" are each one token. Each of the
    characters . , ! ? ; : ( ) becomes a token of its own. Anything else
    (whitespace included) only separates tokens and is dropped.

    Approach taken from
    https://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation

    Example:
        "Hello, I'm a string!" -> ['Hello', ',', "I'm", 'a', 'string', '!']

    Returns:
        list of str: the tokens in the order they appear in `sentence`.
    """
    token_pattern = re.compile(r"[\w'-]+|[.,!?;:()]")
    return token_pattern.findall(sentence)

# Smoke test: contractions and hyphenated words must survive as single tokens,
# while punctuation is split off into its own token.
assert split_punc("Hello, I'm a st-ring!") == [
    "Hello", ",", "I'm", "a", "st-ring", "!",
]

In [134]:
# Parse the sonnet text into nested lists of word ids:
#   data[sonnet_index][line_index] -> list of word ids
# Ids are handed out in order of first appearance, starting at 0.
data = []
sonnet = []  # lines of the sonnet currently being read
word_to_id = {}
# Words whose leading / trailing apostrophe is kept as part of the word.
# For every other token a single leading or trailing apostrophe is stripped.
apostrophe_start_words = ["'gainst", "'greeing", "'scaped", "'tis",
                    "'twixt"]
apostrophe_end_words = ["th'", "t'"]

# The context manager guarantees the file handle is closed once parsing is
# done (or if parsing raises).
with open("../data/shakespeare.txt") as shakespeare_file:
    for line in shakespeare_file:
        strip_line = line.strip()
        if len(strip_line) <= 3:
            # Blank or very short lines close the current sonnet.
            # NOTE(review): presumably these are the sonnet-number headers
            # and separators -- confirm against the data file.
            if len(sonnet) > 0:
                data.append(sonnet)
                sonnet = []
        else:
            # lowercase the words, and split punctuation into new words
            line_words = split_punc(strip_line.lower())
            line_ids = []
            for word in line_words:
                if word[-1] == "'" and word not in apostrophe_end_words:
                    word = word[:-1]
                if (len(word) > 0 and word[0] == "'"
                        and word not in apostrophe_start_words):
                    word = word[1:]
                # Skip tokens that were nothing but apostrophes. Checking
                # after BOTH strips keeps the empty string out of the
                # vocabulary (a token like "''" used to slip through).
                if len(word) == 0:
                    continue

                if word not in word_to_id:
                    word_id = len(word_to_id)
                    word_to_id[word] = word_id
                else:
                    word_id = word_to_id[word]
                line_ids.append(word_id)
            sonnet.append(line_ids)

# The file may not end with a separator line; flush the last sonnet.
if len(sonnet) > 0:
    data.append(sonnet)

In [135]:
# Reverse lookup table: word id -> word.
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

In [136]:
# Shape check: number of sonnets, number of lines in the first sonnet,
# and number of tokens in that sonnet's first line.
first_sonnet = data[0]
print(len(data), len(first_sonnet), len(first_sonnet[0]))
# Both vocabulary mappings are expected to report the same size.
for vocab_mapping in (word_to_id, id_to_word):
    print(len(vocab_mapping))

154 14 7
3212
3212


In [137]:
# Show the first sonnet (still wrapped in the outer list) as nested lists of
# word ids, one inner list per line.
print(data[:1])

[[[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12, 13, 6], [14, 15, 16, 17, 18, 19, 20, 21, 6], [22, 23, 24, 11, 25, 22, 26, 27], [14, 28, 29, 30, 31, 32, 33, 34, 6], [35, 36, 37, 38, 39, 40, 41, 6], [42, 43, 44, 45, 46, 47, 6], [36, 48, 36, 49, 6, 30, 36, 50, 48, 51, 52, 27], [28, 7, 53, 54, 16, 55, 56, 57, 6], [58, 59, 60, 30, 16, 61, 62, 6], [63, 31, 32, 64, 65, 36, 66, 6], [58, 23, 67, 68, 69, 70, 71, 27], [72, 16, 73, 6, 74, 75, 76, 77, 78, 6], [30, 79, 16, 55, 80, 6, 19, 16, 81, 58, 82, 83]]]


In [138]:
# Build lookups between words and their syllable counts from the syllable
# dictionary. Each useful line has the form `word count [count ...]`; a count
# written with an "E" prefix (e.g. "E1") is stored in the *end* mappings.
# NOTE(review): presumably the "E" counts apply when the word ends a line --
# confirm against the data file's documentation.
end_syllable_to_words = {}   # end-syllable count -> set of words
word_to_end_syllables = {}   # word -> set of end-syllable counts
syllable_to_words = {}       # syllable count -> set of words
word_to_syllables = {}       # word -> set of syllable counts

# The context manager guarantees the file handle is closed once parsing is
# done (or if parsing raises).
with open("../data/syllable_dict.txt") as syllable_file:
    for line in syllable_file:
        split_line = line.strip().split()
        if len(split_line) < 2:
            # Blank line, or a word with no counts: nothing to record.
            continue
        word = split_line[0]
        num_syllables = set()
        num_end_syllables = set()
        for count_token in split_line[1:]:
            if "E" in count_token:
                # Drop the leading "E" marker before converting.
                num_end_syllables.add(int(count_token[1:]))
            else:
                num_syllables.add(int(count_token))

        word_to_syllables[word] = num_syllables
        word_to_end_syllables[word] = num_end_syllables

        # Inverse mappings: count -> every word that can have that count.
        for num_syllable in num_syllables:
            syllable_to_words.setdefault(num_syllable, set()).add(word)

        for num_syllable in num_end_syllables:
            end_syllable_to_words.setdefault(num_syllable, set()).add(word)
    


In [142]:
# Bundle everything later stages need into one dict and persist it.
preprocessed_data = {
    "data": data,
    "word_to_id": word_to_id,
    "id_to_word": id_to_word,
    "end_syllable_to_words": end_syllable_to_words,
    "word_to_end_syllables": word_to_end_syllables,
    "syllable_to_words": syllable_to_words,
    "word_to_syllables": word_to_syllables
}
# Write through a context manager so the file is flushed and closed
# deterministically instead of whenever the anonymous handle is collected.
with open("../data/preprocessed_data.pkl", "wb") as pickle_file:
    pickle.dump(preprocessed_data, pickle_file)

In [140]:
# Sanity check: every vocabulary word must have an entry in the syllable
# dictionary. All missing words are collected so the error names each of
# them, instead of a bare lookup that stops at the first missing word.
missing_syllable_words = [word for word in word_to_id
                          if word not in word_to_syllables]
if missing_syllable_words:
    raise KeyError(
        "words missing from syllable dictionary: %r" % missing_syllable_words)

In [77]:
# Display the syllable-count -> set-of-words mapping built from the syllable
# dictionary (the output is large: one entry per word per count).
syllable_to_words

{0: {"t'", "th'"},
 1: {'do',
  'cloak',
  'ear',
  'hems',
  'rent',
  'each',
  'growth',
  'proof',
  'boast',
  'gift',
  'breathed',
  'tombed',
  'struck',
  'way',
  'fast',
  'queen',
  'terms',
  'lifts',
  'while',
  'wards',
  'kiss',
  'crossed',
  'dye',
  'sound',
  'friends',
  'nor',
  'veins',
  'writes',
  'dark',
  'boot',
  'day',
  'frame',
  'knew',
  'loud',
  'on',
  'bring',
  'crave',
  'hell',
  'like',
  'short',
  'by',
  'stamp',
  'tall',
  "sea's",
  'borne',
  'stands',
  'bide',
  'flies',
  'thorns',
  'thou',
  'glass',
  'strife',
  'foe',
  'hard',
  'dyed',
  'glance',
  'sings',
  'sweets',
  'wont',
  'woo',
  'phrase',
  'clear',
  'old',
  'crests',
  'fore',
  "stol'n",
  'twain',
  'wink',
  'goes',
  'nine',
  'sleep',
  'slight',
  'minds',
  'known',
  'weep',
  'ears',
  'throw',
  'sell',
  "eye's",
  'heats',
  'prize',
  'set',
  'kept',
  "is't",
  'theft',
  'good',
  'toward',
  "'twixt",
  'since',
  'warm',
  'slide',
  'plants',

In [73]:
# Peek at the first 100 (word, id) pairs of the vocabulary.
list(word_to_id.items())[:100]

[('do', 331),
 ('am', 881),
 ('ear', 414),
 ('boast', 955),
 ('tombed', 245),
 ('marigold', 962),
 ('return', 1049),
 ('struck', 2219),
 ('fast', 507),
 ('pity', 72),
 ('livery', 96),
 ('fiend', 3065),
 ('harmful', 2567),
 ('glazed', 944),
 ("offender's", 1252),
 ('veins', 1919),
 ('overturn', 1681),
 ('noted', 2056),
 ("bosom's", 941),
 ('on', 99),
 ('external', 1655),
 ('through', 191),
 ('dispatch', 3043),
 ('civil', 1293),
 ('sheaves', 566),
 ('thorns', 1266),
 ("travel's", 1579),
 ('stands', 1794),
 ('bide', 1743),
 ('corrupting', 1278),
 ('contented', 1097),
 ('strife', 2035),
 ('fairing', 2851),
 ('straying', 1410),
 ('credit', 2987),
 ('sings', 434),
 ('ransom', 1262),
 ('old', 140),
 ('metre', 751),
 ('crests', 2522),
 ('evident', 485),
 ('twain', 1301),
 ('darkening', 2419),
 ('adore', 366),
 ('slight', 1357),
 ('uttering', 1942),
 ('beggared', 1916),
 ('thriftless', 122),
 ('longer', 589),
 ('heats', 3200),
 ('prize', 1557),
 ('theft', 2412),
 ('legions', 3191),
 ('fearfully