In [1]:
"""Find words in haiku corpus missing from cmudict & build exceptions dict."""
import sys
from string import punctuation
import pprint
import json
from nltk.corpus import cmudict

In [2]:
cmudict = cmudict.dict()  # Carnegie Mellon University Pronouncing Dictionary


In [3]:
def main():
    haiku = load_haiku('train.txt')
    exceptions = cmudict_missing(haiku)
    build_dict = input("\nManually build an exceptions dictionary (y/n)? \n")
    if build_dict.lower() == 'n':
        sys.exit()
    else:
        missing_words_dict = make_exceptions_dict(exceptions)
        save_exceptions(missing_words_dict)

In [4]:
def load_haiku(filename):
    """Open and return training corpus of haiku as a set."""
    with open(filename) as in_file:
        haiku = set(in_file.read().replace('-', ' ').split())
        return haiku

In [5]:
def cmudict_missing(word_set):
    """Find and return words in word set missing from cmudict."""
    exceptions = set()
    for word in word_set:
        word = word.lower().strip(punctuation)
        if word.endswith("'s") or word.endswith("’s"):
            word = word[:-2]
        if word not in cmudict:
            exceptions.add(word)
    print("\nexceptions:")
    print(*exceptions, sep='\n')
    print("\nNumber of unique words in haiku corpus = {}".format(len(word_set)))
    print("Number of words in corpus not in cmudict = {}"
          .format(len(exceptions)))
    membership = (1 - (len(exceptions) / len(word_set))) * 100
    print("cmudict membership = {:.1f}{}".format(membership, '%'))
    return exceptions

In [6]:
def make_exceptions_dict(exceptions_set):
    """Return dictionary of words and syllable counts from set of words."""
    missing_words = {}
    print("Input # syllables in word. Mistakes can be corrected at end. \n")
    for word in exceptions_set:
        while True:
            num_sylls = input("Enter number syllables in {}: ".format(word))
            if num_sylls.isdigit():
                break
            else:
                print("                   Not a valid answer!", file=sys.stderr)                    
        missing_words[word] = int(num_sylls)              
    print()
    pprint.pprint(missing_words, width=1)

    print("\nMake Changes to Dictionary Before Saving?")
    print("""
    0 - Exit & Save
    1 - Add a Word or Change a Syllable Count 
    2 - Remove a Word
    """)

    while True:
        choice = input("\nEnter choice: ")   
        if choice == '0':
            break
        elif choice == '1':
            word = input("\nWord to add or change: ")
            missing_words[word] = int(input("Enter number syllables in {}: "
                                            .format(word)))
        elif choice == '2':
            word = input("\nEnter word to delete: ")
            missing_words.pop(word, None)
            
    print("\nNew words or syllable changes:")
    pprint.pprint(missing_words, width=1)

    return missing_words

In [7]:
def save_exceptions(missing_words):
    """Save exceptions dictionary as json file."""
    json_string = json.dumps(missing_words)
    f = open('missing_words.json', 'w')
    f.write(json_string)
    f.close()
    print("\nFile saved as missing_words.json")

In [8]:
if __name__ == '__main__':
    main()


exceptions:
cloudbank
colour
priestling
dragonfly
samisen
fie
pattering
cumulus
ridgelines
evenfall
tendrilled
morningglory
paperweights
battlers
furue
oranged
nightingales
stretchings
yowl
swordhand
skims
froglings
moonrise
inuyasha
samuri
beholders
archways
shadeless
asakura
foregather
whippoorwill
creepers
tendrils
spiritless
storks
nursemaid
camellia
cloudbanks
windblown
deepener
scatters
bathwater
hibiscus
woodcutter
mooing
treeline
watersplash
petaled
windless
atsuta
persimmons
wintery
treehouse
dusky
lichened
dewdrop
wisteria
carven

Number of unique words in haiku corpus = 1523
Number of words in corpus not in cmudict = 58
cmudict membership = 96.2%
Input # syllables in word. Mistakes can be corrected at end. 


{'archways': 2,
 'asakura': 4,
 'atsuta': 3,
 'bathwater': 3,
 'battlers': 3,
 'beholders': 3,
 'camellia': 3,
 'carven': 2,
 'cloudbank': 2,
 'cloudbanks': 2,
 'colour': 2,
 'creepers': 2,
 'cumulus': 3,
 'deepener': 3,
 'dewdrop': 2,
 'dragonfly': 3,
 'dusky': 2,
 'e