In [1]:
"""Find words in haiku corpus missing from cmudict & build exceptions dict."""
import sys
from string import punctuation
import pprint
import json
import pandas as pd
from nltk.corpus import cmudict

In [2]:
cmudict = cmudict.dict()  # Carnegie Mellon University Pronouncing Dictionary

In [8]:
heads = pd.read_csv("headlines.csv")

In [9]:
def txt_dump(heads):
    """Dump headlines to text file."""
    headlines = pd.read_csv('../headlines.csv')
    heads = headlines['headline']
    heads = heads.str.replace("  ", "")
    heads = heads.to_list()
    lesson = lesson = " ".join(heads)
    lesson = lesson.replace(" ...", "").lower()
    lesson = lesson.replace("  ", " ")
    with open('data/haiku_corpus.txt', 'w') as f:
        f.write(lesson)

In [10]:
txt_dump(heads)

In [11]:
def main():
    haiku = load_haiku('data/lesson.txt')
    exceptions = cmudict_missing(haiku)
    build_dict = input("\nManually build an exceptions dictionary (y/n)? \n")
    if build_dict.lower() == 'n':
        sys.exit()
    else:
        missing_words_dict = make_exceptions_dict(exceptions)
        save_exceptions(missing_words_dict)

In [12]:
def load_haiku(filename):
    """Open and return training corpus of haiku as a set."""
    with open(filename) as in_file:
        return set(in_file.read().replace('-', ' ').split())

In [13]:
def cmudict_missing(word_set):
    """Find and return words in word set missing from cmudict."""
    exceptions = set()
    for word in word_set:
        word = word.lower().strip(punctuation)
        if word.endswith("'s") or word.endswith("’s"):
            word = word[:-2]
        if word not in cmudict:
            exceptions.add(word)
    print("\nexceptions:")
    print(*exceptions, sep='\n')
    print(f"\nNumber of unique words in haiku corpus = {len(word_set)}")
    print(f"Number of words in corpus not in cmudict = {len(exceptions)}")
    membership = (1 - len(exceptions) / len(word_set)) * 100
    print("cmudict membership = {:.1f}{}".format(membership, '%'))
    return exceptions

In [14]:
def make_exceptions_dict(exceptions_set):
    """Return dictionary of words and syllable counts from set of words."""
    missing_words = {}
    print("Input # syllables in word. Mistakes can be corrected at end. \n")
    for word in exceptions_set:
        while True:
            num_sylls = input(f"Enter number syllables in {word}: ")
            if num_sylls.isdigit():
                break
            else:
                print("                   Not a valid answer!", file=sys.stderr)
        missing_words[word] = int(num_sylls)
    print()
    pprint.pprint(missing_words, width=1)
    print("\nMake Changes to Dictionary Before Saving?")
    print("""
    0 - Exit & Save
    1 - Add a Word or Change a Syllable Count 
    2 - Remove a Word
    """)

    while True:
        choice = input("\nEnter choice: ")
        if choice == '0':
            break
        elif choice == '1':
            word = input("\nWord to add or change: ")
            missing_words[word] = int(input(f"Enter number syllables in {word}: "))
        elif choice == '2':
            word = input("\nEnter word to delete: ")
            missing_words.pop(word, None)
    print("\nNew words or syllable changes:")
    pprint.pprint(missing_words, width=1)
    return missing_words

In [15]:
def save_exceptions(missing_words):
    """Save exceptions dictionary as json file."""
    json_string = json.dumps(missing_words)
    with open('data/missing_words.json', 'w') as f:
        f.write(json_string)
    print("\nFile saved as ../data/missing_words.json")

In [16]:
if __name__ == '__main__':
    main()


exceptions:

l.a
sayin
tiktok
shakira
1
flyin
tokes
ableist
tro
memba
don’t
19
88
kelis
mgk
blueface
nba
mcmuffins
s/o
elon
rhony
mystikal
75
chainz
5
luci
90
wifey
wandavision
gop
rhoa
50
100
britney
ig
pics
bs
bf
12
bure
nfler
wwe
facebook
f***ing
700
1.28
shahs
aniston
95
beyoncé
jojo
motherf***ing
25
chrisean
b’day
durk
jolie
‘renaissance’
moakler
udub
dt
tichina
suge
highjacked
atl
she/her
hasim
r&beef
bueller
paraglider
badass
tyga
911
1.2
89
vanderpump
kuwtk
zelensky
docs
hornswoggle
bezos
minaj
hochul
dua
yeezy
singin
dv
nfl
d.c
h.s
bridgerton
11
maybach…
mccrackdown
beyonce
44
a$ap
1st
binki
elgort
mlb
hangin
huskies
2021
hittin
brittney
rockin
8
1.8k
75th
ny
ovo
2.7
dj
2022
nypd
46
6
selfie
covid
f***ed
gigandet
kylie
whatchu
vacay
‘nuisance’
kiddo
volodymyr
bday
cordae
monkeypox
nyc
scotus
77
frigin
lottos
fl
paltrow
siwa
34m
2
airbnb
prez
gf
nichelle
faints
800
they/them
deshaun
netflix
beyhive
osundairo
shanna
orgs
kanye
gwenyth
94
jaydayoungan

Number of unique words in 

                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!
                   Not a valid answer!


KeyboardInterrupt: Interrupted by user