In [47]:
import numpy as np
import pandas as pd
import os
import re
import pickle
import nltk

In [84]:
def split_punc(sentence):
    # taken from https://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    split_sentence = re.findall(r"[\w'-]+|[.,!?;:()]", sentence)
    # "Hello, I'm a string!"
    # 'Hello', ',', 'I\'m', 'a', 'string', '!'
    return split_sentence

assert(split_punc("Hello, I'm a st-ring!") ==
       ['Hello', ',', 'I\'m', 'a', 'st-ring', '!'])

In [81]:
nltk.download('cmudict')
from nltk.corpus import cmudict 
d = cmudict.dict()

def count_syllables(word):
    n_syllables = 0
    for letter in d[word][0]:
        if len(letter) > 1:
            n_syllables += 1
    return n_syllables

# Copied from https://github.com/hyperreality/Poetry-Tools/blob/master/poetrytools/countsyl.py
# Count syllables in a word.
#
# Doesn't use any fancy knowledge, just a few super simple rules:
# a vowel starts each syllable;
# a doubled vowel doesn't add an extra syllable;
# two or more different vowels together are a diphthong,
# and probably don't start a new syllable but might;
# y is considered a vowel when it follows a consonant.
#
# Even with these simple rules, it gets results far better
# than python-hyphenate with the libreoffice hyphenation dictionary.
#
# Copyright 2013 by Akkana Peck http://shallowsky.com.
# Share and enjoy under the terms of the GPLv2 or later.

import sys

verbose = False

def count_syllables_any(word):
    vowels = ['a', 'e', 'i', 'o', 'u']

    on_vowel = False
    in_diphthong = False
    minsyl = 0
    maxsyl = 0
    lastchar = None

    word = word.lower()
    for c in word:
        is_vowel = c in vowels

        if on_vowel == None:
            on_vowel = is_vowel

        # y is a special case
        if c == 'y':
            is_vowel = not on_vowel

        if is_vowel:
            if not on_vowel:
                # We weren't on a vowel before.
                # Seeing a new vowel bumps the syllable count.
                minsyl += 1
                maxsyl += 1
            elif on_vowel and not in_diphthong and c != lastchar:
                # We were already in a vowel.
                # Don't increment anything except the max count,
                # and only do that once per diphthong.
                in_diphthong = True
                maxsyl += 1

        on_vowel = is_vowel
        lastchar = c

    # Some special cases:
    if word[-1] == 'e':
        minsyl -= 1
    # if it ended with a consonant followed by y, count that as a syllable.
    if word[-1] == 'y' and not on_vowel:
        maxsyl += 1

    return minsyl

print(count_syllables('hyperinflation'))
print(count_syllables_any('hyperinflation'))
print(count_syllables_any('.'))

[nltk_data] Downloading package cmudict to /home/ubuntu/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
7
5
0


In [77]:
# limericks from http://pun.me/pages/funny-limericks.php
limerick_file = open("../data/funny_limericks.txt")
data = []
sonnet = []
word_to_id = {}
apostrophe_start_words = ["'gainst", "'greeing", "'scaped", "'tis",
                    "'twixt"]
apostrophe_end_words = ["th'", "t'"]

for i, line in enumerate(limerick_file):
    strip_line = line.strip()
    if i % 5 == 0:
        if len(sonnet) > 0:
            data.append(sonnet)
            sonnet = []

    # lowercase the words, and split puncuation into new words
    line_words = split_punc(strip_line.lower())
    line_ids = []
    for word in line_words:
        if word[-1] == "'" and word not in apostrophe_end_words:
            word = word[:-1]
        if len(word) == 0:
            continue
        if word[0] == "'" and word not in apostrophe_start_words:
            word = word[1:]

        if word not in word_to_id:
            word_id = len(word_to_id)
            word_to_id[word] = word_id
        else:
            word_id = word_to_id[word]
        line_ids.append(word_id)
    sonnet.append(line_ids)
if len(sonnet) > 0:
    data.append(sonnet)

In [78]:
id_to_word = {word_id: word for word, word_id in word_to_id.items()}

In [79]:
print(len(data), len(data[0]), len(data[0][0]))
print(len(word_to_id))
print(len(id_to_word))

75 5 8
834
834


In [80]:
print(data[:1])

[[[0, 1, 2, 3, 0, 4, 5, 6], [7, 8, 0, 9, 10, 11, 12], [13, 14, 15, 16, 17, 6], [18, 0, 19, 20, 21, 22, 6], [23, 24, 25, 26, 27, 3, 0, 5, 12]]]


In [92]:
end_syllable_to_words = {}
word_to_end_syllables = {}
syllable_to_words = {}
word_to_syllables = {}

for word in word_to_id:
    try:
        n_syllables = count_syllables(word)
    except KeyError:
        n_syllables = count_syllables_any(word)
    finally:
        word_to_syllables[word] = {n_syllables}
        if n_syllables not in syllable_to_words:
            syllable_to_words[n_syllables] = set()
        syllable_to_words[n_syllables].add(word)

end_syllable_to_words = syllable_to_words
word_to_end_syllables = word_to_syllables

In [93]:
word_id_to_rhyme_id = {}
rhyme_id_to_word_ids = {}

# data: sonnet: line: word id

def get_rhyme_id(word_id):
    # Either find where the word is already stored,
    # or create a new rhyme_id for it
    if word_id in word_id_to_rhyme_id:
        return word_id_to_rhyme_id[word_id]
    else:
        rhyme_id = len(rhyme_id_to_word_ids)
        word_id_to_rhyme_id[word_id] = rhyme_id
        rhyme_id_to_word_ids[rhyme_id] = {word_id}
        return rhyme_id

def add_rhyming_word(word_id, rhyme_id):
    word_id_to_rhyme_id[word_id] = rhyme_id
    rhyme_id_to_word_ids[rhyme_id].add(word_id)    
        
punct_set = set(",.?!();:'")
        
for sonnet in data:
    for quadruplet_ind in range(1):
        end_words = []
        for line in range(5):
            for word_id in reversed(sonnet[line + quadruplet_ind * 4]):
                if id_to_word[word_id] not in punct_set:
                    end_words.append(word_id)
                    break
        rhyme_0 = get_rhyme_id(end_words[0])
        add_rhyming_word(end_words[1], rhyme_0)
        add_rhyming_word(end_words[4], rhyme_0)
        rhyme_1 = get_rhyme_id(end_words[2])
        add_rhyming_word(end_words[3], rhyme_1)

for i in range(0, 100, 10):
    print([id_to_word[word_id] for word_id in rhyme_id_to_word_ids[i]])

['fall', 'wall', 'small']
['canny', 'he', 'granny']
['today', 'way', 'play', 'say']
['doubt', 'out']
['it']
['perkins', "working's", 'gherkins']
['suppose', 'nose', 'clothes', 'toes', 'rose']
['knew', 'few']
['joey', 'zoe', 'snowy']
['sing', 'king', 'wing']


In [94]:
preprocessed_data = {
    "data": data,
    "word_to_id": word_to_id,
    "id_to_word": id_to_word,
    "end_syllable_to_words": end_syllable_to_words,
    "word_to_end_syllables": word_to_end_syllables,
    "syllable_to_words": syllable_to_words,
    "word_to_syllables": word_to_syllables,
    "rhyme_id_to_word_ids": rhyme_id_to_word_ids,
    "word_id_to_rhyme_id": word_id_to_rhyme_id
}
pickle.dump(preprocessed_data, open("../data/limerick_preprocessed_data.pkl", "wb"))

In [89]:
for word in word_to_id:
    word_to_syllables[word]

In [44]:
shakespeare_file = open("../data/shakespeare.txt")
min_line_length = 10000
for line in shakespeare_file:
    strip_line = line.strip()
    if len(strip_line) > 3:
        min_line_length = min(min_line_length, len(strip_line))
print(min_line_length)

29


In [90]:
[item for item in word_to_id.items()][:100]

[('a', 0),
 ('fellow', 1),
 ('jumped', 2),
 ('off', 3),
 ('high', 4),
 ('wall', 5),
 (',', 6),
 ('and', 7),
 ('had', 8),
 ('most', 9),
 ('terrible', 10),
 ('fall', 11),
 ('.', 12),
 ('he', 13),
 ('went', 14),
 ('back', 15),
 ('to', 16),
 ('bed', 17),
 ('with', 18),
 ('bump', 19),
 ('on', 20),
 ('his', 21),
 ('head', 22),
 ("that's", 23),
 ('why', 24),
 ('you', 25),
 ("don't", 26),
 ('jump', 27),
 ('limericks', 28),
 ('i', 29),
 ('cannot', 30),
 ('compose', 31),
 ('noxious', 32),
 ('smells', 33),
 ('in', 34),
 ('my', 35),
 ('nose', 36),
 ('but', 37),
 ('this', 38),
 ('one', 39),
 ('was', 40),
 ('easy', 41),
 ('only', 42),
 ('felt', 43),
 ('queasy', 44),
 ('because', 45),
 ('sniffing', 46),
 ('toes', 47),
 ('there', 48),
 ('once', 49),
 ('man', 50),
 ('from', 51),
 ('peru', 52),
 ('who', 53),
 ('lot', 54),
 ('of', 55),
 ('growing', 56),
 ('up', 57),
 ('do', 58),
 ("he'd", 59),
 ('ring', 60),
 ('doorbell', 61),
 ('then', 62),
 ('run', 63),
 ('like', 64),
 ('hell', 65),
 ('until', 66),
 ('