In [1]:
import nltk
import numpy as np
from nltk.corpus import cmudict
from nltk.corpus import stopwords
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import string
from newsapi import NewsApiClient
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

In [15]:
#pulling an example text from npr plain text site
def NPR_text(id_number):
    html_page = requests.get('https://text.npr.org/s.php?sId={}'.format(id_number))
    soup = BeautifulSoup(html_page.content, 'html.parser')
    text = soup.text.split('\n')[15:-10]
    text = ' '.join(text)
    return text

text = NPR_text(737628786)
text[:50]

'NPR.org, July 8, 2019 · Before Jarret Stopforth ta'

Now, from this we're going to create a nonsense limerick. Find rhymes, assemble a rhyme scheme, fill in lines with words that roughly scan.

In [2]:
#the pronunciations dictionary from NLTK - put into a more search friendly form as a dictionary with the
#word in English as the key and its pronunciations as the value
entries = nltk.corpus.cmudict.entries()
pron_dict = {}
for entry in entries:
    pron_dict[entry[0]]=entry[1]
    

def find_rhymes(text):
    text = nltk.word_tokenize(text)
    text = [word.lower() for word in text]
    text_set = list(set(text)) #to avoid duplicate words being rhymed with themselves
    #The way NLTK tokenizes, it leaves these 's around, so need to remove them along with the normal stop words
    stop_words = list(set(stopwords.words('english')))+["'s"]  
    text_set = [word for word in text_set if word not in stop_words] #removing stop words as well
    pronunciations = []
    #build a master list of all pronunciations that is easier to iterate over than the dictionary
    for word in text_set:
        try:
            pronunciations.append((word,pron_dict[word]))
        except:
            pass
    
    #here is the actual matching
    match_dict = {}
    for n in range(0,len(pronunciations)): #given one word in the list
        for i in range(n+1,len(pronunciations)): #we'll check it against all words that follow
            if (pronunciations[n][1][1:] == pronunciations[i][1][-len(pronunciations[n][1][1:]):] or
                    pronunciations[i][1][1:] == pronunciations[n][1][-len(pronunciations[i][1][1:]):]):
                # need to cutt out super short rhymes - else you get things like 'is' rhyming
                # with every word that ends with s
                if ((len(pronunciations[n][1][1:]) > 2 or len(pronunciations[n][1]) ==3) and 
                    (len(pronunciations[i][1][1:]) > 2 or len(pronunciations[i][1]) ==2)):
                    if len(pronunciations[n][1][1:]) < len(pronunciations[i][1][1:]): 
                        rhyme = ''.join(pronunciations[n][1][1:])
                    else:
                        rhyme = ''.join(pronunciations[i][1][1:])
                    if len(rhyme) > 2:
                    #if this rhyme has already been added to the dictionary of rhymes we can include it
                        if rhyme in match_dict.keys(): 
                            match_dict[rhyme] = list(set(match_dict[rhyme]+
                                                         [pronunciations[n][0],pronunciations[i][0]]))
                        else: #if the dictionary doesn't have an entry for this rhyme, we put in a 
                            match_dict[rhyme] = [pronunciations[n][0],pronunciations[i][0]]

    return match_dict

In [17]:
rhymes = find_rhymes(text)
rhymes

{'EY1KS': ['takes', 'makes'],
 'EY1KIH0NG': ['waking', 'making'],
 'AE1ND': ['demand', 'land'],
 'AH1M': ['come', 'become'],
 'EY1N': ['gain', 'campaign', 'maintain'],
 'AW1ND': ['ground', 'compound', 'round', 'found'],
 'RIH1NGK': ['drink', 'shrink'],
 'EH1TAH0L': ['kettle', 'settle'],
 'UW1D': ['food', 'brewed'],
 'IH1NGKIH0NG': ['drinking', 'thinking'],
 'AY1T': ['right', 'despite'],
 'LIY0': ['early', 'exactly', 'really'],
 'BAH0L': ['possible', 'able', 'global', 'suitable'],
 'IH1TIH0NG': ['getting', 'spitting']}

Next step is to assemble a corpus of words to use. That is, units that can be parsed for syllable/meter and then selected to be put into each line. A couple of challenges:

1. Certain words are actually groups that need to be kept togeter, particularly names
2. To avoid obvious tense/usage issues, it seems to make sense to only include certain parts of speech or to otherwise lemmatize the words
3. What to do with stopwords: you obviously don't want to assemble an entire line of them by accident, but then you don't want to throw out words like 'the', 'there' or 'was'

First step is finding proper nouns and names

In [3]:
#helper function, this is going to take in a list of things labeled as 'proper nouns' and return the 
#names by looking for bigrams where the two proper nouns occur next to eachother.
def find_names(text_words, nnps):
    text_bigrams = list(nltk.bigrams(text_words))
    names = [bigram for bigram in text_bigrams if bigram[0] in nnps
             and bigram[1] in nnps]
    names = list(set(names))
    return names

def find_proper_nouns(text):
    text_words = nltk.word_tokenize(text)
    text_pos = nltk.pos_tag(text_words)
    stop_words = set(stopwords.words('english'))
    text_pos_minus_stop = [(word, pos) for (word,pos) in text_pos if word not in stop_words]
    NNPs = [word for (word,pos) in text_pos_minus_stop if pos=='NNP']
    names = find_names(text_words, NNPs)
    NNPs = list(set(NNPs))
    for word in NNPs: #and here we remove the proper nouns that are part of the names list
        if word in [name1 for (name1,name2) in names]+[name2 for (name1,name2) in names]:
            NNPs.remove(word)
    return NNPs + names

In [19]:
find_proper_nouns(text)

['Soylent',
 'Atomo',
 'Kickstarter',
 'Chobani',
 ']',
 'Fire',
 'Kettle',
 'University',
 'Taylor',
 'July',
 'Never',
 'Washington',
 'NPR.org',
 'Helmer',
 'Center',
 'Tropical',
 'Kleitsch',
 'Graduate',
 'New',
 'North',
 'U.S.',
 'Drug',
 'Oregon',
 ('Taylor', 'Moore'),
 ('North', 'Carolina'),
 ('Jarret', 'Stopforth'),
 ('Andy', 'Kleitsch'),
 ('Jodi', 'Helmer'),
 ('Tropical', 'Agriculture'),
 ('Drug', 'Administration'),
 ('Christopher', 'Hendon'),
 ('U.S.', 'Food'),
 ('New', 'Orleans'),
 ('International', 'Center')]

In [20]:
#And now I'm going to pull content words: nouns, adjectives, verbs, adverbs. This will be the dictionary
#of words from which limerick lines will be formed.
def words_to_use(text):
    text_tokens = nltk.word_tokenize(text)
    text_pos = nltk.pos_tag(text_tokens)
    stop_words = list(set(stopwords.words('english')))+["'s",'.','·',"n't",'%']
    text_words = [(word,pos) for (word, pos) in text_pos if word not in stop_words]
    pos_tags = ['NN','NNS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ','RB','RBR','RBS']
    words_to_use = [word for (word,pos) in text_words if pos in pos_tags]
    neatened_words_to_use = []
    for n in range(0,len(words_to_use)): #to fix issue with hyphenated words
        neatened_words_to_use += words_to_use[n].split('-')
    return neatened_words_to_use

words_to_use(text)

['takes',
 'first',
 'sip',
 'coffee',
 'adds',
 'cream',
 'sugar',
 'mask',
 'bitterness',
 'thought',
 'settle',
 'regular',
 'cup',
 'joe',
 'food',
 'scientist',
 'decided',
 'reengineer',
 'coffee',
 'brewing',
 'bitterness',
 '—',
 'bean',
 'started',
 'thinking',
 'able',
 'break',
 'coffee',
 'core',
 'components',
 'look',
 'optimize',
 'explains',
 'worked',
 'food',
 'brands',
 'partnered',
 'entrepreneur',
 'launch',
 'pair',
 'turned',
 'Seattle',
 'garage',
 'brewing',
 'lab',
 'spent',
 'months',
 'running',
 'green',
 'beans',
 'roasted',
 'beans',
 'brewed',
 'coffee',
 'gas',
 'liquid',
 'chromatography',
 'separate',
 'catalog',
 'compounds',
 'coffee',
 'create',
 'product',
 'color',
 'aroma',
 'flavor',
 'mouthfeel',
 'coffee',
 'got',
 'deeper',
 'process',
 'learned',
 'threats',
 'coffee',
 'world',
 'whole',
 '—',
 'threats',
 'environment',
 'deforestation',
 'global',
 'warming',
 '[',
 'devastating',
 'fungus',
 'called',
 'rust',
 'even',
 'committed',
 'm

In [4]:
#Now we're going to build up pronunciation tables with numbers of syllables and where the stresses
#So, here I build a function that we can apply to a given word to find out the number of syllables
#It's broken into two steps, one to apply to an individual word and one to apply to a tuple like a name
def word_pronouncer(word):
    pron = pron_dict[word.lower()]
    stresses = str()
    for phoneme in pron:
        if phoneme[-1].isdigit():
            #NLTK has markings of 0 for unstressed and 1 and 2 for stressed (primary and secondary)
            if phoneme[-1] == '0':
                stresses += phoneme[-1]
            else:
                stresses += '1'
    return {'stresses':stresses, 'syllables':len(stresses)}

def robust_pronouncer(word):
    if type(word) != tuple: #we need to account for the names that will be passed in as tuples
        return word_pronouncer(word)
    word1 = word_pronouncer(word[0].lower())
    word2 = word_pronouncer(word[1].lower())
    return {'stresses':word1['stresses']+word2['stresses'],'syllables':word1['syllables']+word2['syllables']}

In [22]:
robust_pronouncer('newsworthy')

{'stresses': '110', 'syllables': 3}

In [23]:
robust_pronouncer(('Christopher', 'Hendon'))

{'stresses': '10010', 'syllables': 5}

In [24]:
#And now I will create a master dictionary to call from with all the words and their stresses, 
#organized by syllable counts
def master_dict_maker(text):
    list_of_words = find_proper_nouns(text) + words_to_use(text)
    master_dict = {}
    for word in list_of_words:
        try:
            pron = robust_pronouncer(word)
            if pron['syllables'] in master_dict.keys():
                master_dict[pron['syllables']].append((word, pron['stresses']))
            else:
                master_dict[pron['syllables']] = [(word, pron['stresses'])]
        except:
            pass
    return master_dict

master_dict = master_dict_maker(text)
master_dict

{1: [('Fire', '1'),
  ('New', '1'),
  ('North', '1'),
  ('Drug', '1'),
  ('takes', '1'),
  ('first', '1'),
  ('sip', '1'),
  ('adds', '1'),
  ('cream', '1'),
  ('mask', '1'),
  ('thought', '1'),
  ('cup', '1'),
  ('joe', '1'),
  ('food', '1'),
  ('bean', '1'),
  ('break', '1'),
  ('core', '1'),
  ('look', '1'),
  ('worked', '1'),
  ('food', '1'),
  ('brands', '1'),
  ('launch', '1'),
  ('pair', '1'),
  ('turned', '1'),
  ('lab', '1'),
  ('spent', '1'),
  ('months', '1'),
  ('green', '1'),
  ('beans', '1'),
  ('beans', '1'),
  ('brewed', '1'),
  ('gas', '1'),
  ('got', '1'),
  ('threats', '1'),
  ('world', '1'),
  ('whole', '1'),
  ('threats', '1'),
  ('called', '1'),
  ('rust', '1'),
  ('great', '1'),
  ('says', '1'),
  ('land', '1'),
  ('shrink', '1'),
  ('steeped', '1'),
  ('made', '1'),
  ('says', '1'),
  ('found', '1'),
  ('food', '1'),
  ('adds', '1'),
  ('blend', '1'),
  ('first', '1'),
  ('first', '1'),
  ('brew', '1'),
  ('beans', '1'),
  ('made', '1'),
  ('foods', '1'),
  ('fa

Ok, so here's the order of operations as I imagine it:
1. Find a topic, find the rhyme-set from the overall text
2. Decide which words are going to be the line ending rhymes: If the topic occurs within the rhyme set, use that, else we'll slot the topic in elsewhere
3. Create the master dictionary of words by syllable count and stresses
4. Build lines by pulling from from the master dictionary
5. Check how many syllables are left to assign in a line, pull all the words from the master dictionary that are shorter than that
6. Pull a random word from this sub-list and check if it fits into the stress pattern, if it does, slot it in, else pull another random word until you find one that works
7. Finalize your non-sense line

I'm going to use a loose stress pattern requirement: every third syllable stressed

In [26]:
# Here's a function to find all the rhyming sets, and pick two semi-randomly to be the a and b rhymes
def assign_rhymes(text):
    rhyme_dict = find_rhymes(text)
    rhyme_list = list(rhyme_dict.values())
    rhyme_list_3 = [rhyme for rhyme in rhyme_list if len(rhyme)>2]
    a_rhyme = rhyme_list_3[np.random.randint(1,len(rhyme_list_3))]
    searching = True
    while searching == True: #Putting check in here so we don't pick the same rhyme set for a and b
        b_rhyme = rhyme_list[np.random.randint(1,len(rhyme_list_3))]
        if b_rhyme != a_rhyme:
            searching = False
    a_in = np.random.choice(len(a_rhyme), 3, replace=False) #sample 3 words out of the rhyme list
    b_in = np.random.choice(len(b_rhyme), 2, replace=False)
    return (a_rhyme[a_in[0]],a_rhyme[a_in[1]],a_rhyme[a_in[2]]), (b_rhyme[b_in[0]],b_rhyme[b_in[1]])

assign_rhymes(text)

(('possible', 'able', 'suitable'), ('become', 'come'))

In [5]:
#Here's a function that picks out a word to use for a line, such that it is withing a given syllable count
# and that it has stressed accents at the right places
def find_word_for_line(max_syllables, master_dict, stress_pattern):
    word_list = []
    for n in master_dict.keys():
        if n <= max_syllables:
            word_list = word_list + master_dict[n]
    searching = True
    while searching == True:
        word = word_list[np.random.randint(1,len(word_list)+1)]
        stress_test = 0
        for i in range(0,len(word[1])):
            if stress_pattern[i] == '1':
                if word[1][i] != stress_pattern[i]:
                    stress_test +=1
        if stress_test == 0:
            searching = False
    return word

# and here's the function to fill in a line given the word dictionary, number of remaining syllables, stress pattern
# it will take in a line that's partially filled
def fill_out_line(syllables_total, master_dict, stress_pattern, line_beginning=[]):
    line = [word for word in line_beginning]
    syllables_left = syllables_total
    stress_pattern = stress_pattern
    while syllables_left >0:
        word = find_word_for_line(syllables_left, master_dict, stress_pattern)
        line.append(word[0])
        syllables_left -= len(word[1])
        stress_pattern = stress_pattern[len(word[1])-1:]
    return line

In [6]:
# And now to actually build out our limerick:
def write_nonsense_limerick_simple(text):
    a_rhymes, b_rhymes = assign_rhymes(text)
    master_dict = master_dict_maker(text)
    a_1 = (a_rhymes[0],robust_pronouncer(a_rhymes[0])['syllables'])
    line_1 = fill_out_line(8-a_1[1], master_dict, '10010010'[:-a_1[1]])+[a_1[0]]
    a_2 = (a_rhymes[1],robust_pronouncer(a_rhymes[1])['syllables'])
    line_2 = fill_out_line(8-a_2[1], master_dict, '10010010'[:-a_2[1]])+[a_2[0]]
    b_1 = (b_rhymes[0],robust_pronouncer(b_rhymes[0])['syllables'])
    line_3 = fill_out_line(5-b_1[1], master_dict, '10010010'[:-b_1[1]])+[b_1[0]]
    b_2 = (b_rhymes[1],robust_pronouncer(b_rhymes[1])['syllables'])
    line_4 = fill_out_line(5-b_2[1], master_dict, '10010010'[:-b_2[1]])+[b_2[0]]
    a_3 = (a_rhymes[2],robust_pronouncer(a_rhymes[2])['syllables'])
    line_5 = fill_out_line(8-a_3[1], master_dict, '10010010'[:-a_3[1]])+[a_3[0]]
    return line_1, line_2,line_3,line_4,line_5

In [29]:
write_nonsense_limerick_simple(text)

(['suspects', 'New', 'coffee', 'food', 'writes', 'ground'],
 ['forced', 'drinking', 'devastating', 'found'],
 ['beverage', 'share', 'waking'],
 ['proud', 'brew', 'thought', 'making'],
 ['wanted', 'coffee', 'like', 'bean', 'compound'])

Since the article is about a non-coffee bean based coffe like alternative, this last line is shockingly apt! And we built this from randomly pulled words. Says something about the flexibility of language and how suggestible we are.

Now, I'm going to incorporate the text parsing. First step is simply to just to parse the text into the nouns, chunks, adjectives from your function. Then I'll tweak my functions to work with everything split out. Steps are:

1. find rhymes from among nouns and noun chunks or among verbs - the things that can come at the end of a given line as the 'objects' or as verbs in simpler sentences (it occurs to me to make the short lines end with verbs)
2. create pronunciation dictionaries for each part of speech - have these as seperate dictionaries
3. create a new line creator function that will take into account the part of speech - start with a noun chunk, add a verb end with the object

In [7]:
#Word parsing tweaked somewhat from my partner Jen McKaig's work
def word_parse(x):
    text = nlp(str(x)) 
    tokens = [token for token in text if not token.is_stop]
    res = {
            'nouns':[],
            'verbs':[],
            'adjectives':[],
            'pronouns':[],
            'adverbs':[],
            'noun_chunks':[]
        }
    for token in tokens:
        if token.pos_ == 'NOUN':
            res['nouns'].append(token)
        elif token.pos_ == 'VERB':
            res['verbs'].append(token)
        elif token.pos_ == 'ADJ':
            res['adjectives'].append(token)
        elif token.pos_ == 'PRON':
            res['pronouns'].append(token)
        elif token.pos_ == 'ADV':
            res['adverbs'].append(token)
            
    for chunk in text.noun_chunks:
            res['noun_chunks'].append(chunk)
    res['noun_chunks'] = list(set(res['noun_chunks']))
            
    return res

In [118]:
parsed_text = word_parse(text)
parsed_text['nouns']

[sip,
 coffee,
 cream,
 sugar,
 bitterness,
 cup,
 joe,
 food,
 scientist,
 reengineer,
 coffee,
 bitterness,
 bean,
 coffee,
 core,
 components,
 food,
 brands,
 entrepreneur,
 pair,
 garage,
 brewing,
 lab,
 months,
 beans,
 beans,
 coffee,
 gas,
 chromatography,
 compounds,
 coffee,
 product,
 color,
 aroma,
 flavor,
 mouthfeel,
 coffee,
 process,
 threats,
 coffee,
 world,
 threats,
 environment,
 deforestation,
 warming,
 fungus,
 rust,
 coffee,
 environment,
 future,
 coffee,
 land,
 coffee,
 %,
 report,
 concept,
 history,
 beanless,
 coffee,
 company,
 mixture,
 dozens,
 compounds,
 food,
 antioxidants,
 flavonoids,
 coffee,
 acids,
 caffeine,
 blend,
 products,
 coffee,
 beans,
 startups,
 beverage,
 foods,
 mushrooms,
 acorns,
 market,
 share,
 chicory,
 proof,
 beanless,
 coffee,
 ground,
 root,
 namesake,
 plant,
 chicory,
 1800s,
 coffee,
 shortages,
 people,
 substitutes,
 staple,
 coffee,
 importation,
 chicory,
 pieces,
 wood,
 coffee,
 coffee,
 professor,
 chemistry,
 

In [157]:
len(parsed_text['noun_chunks'][0])

2

In [8]:
# now we build out rhymes. Rhymes from noun chunks and rhymes from verbs kept seperately
# need to turn multi word noun chunks into tuples or lists of text
def create_text(token):
    stop_words = list(set(stopwords.words('english'))) + ['"']
    if len(token) == 1:
        if token.text.lower() not in stop_words:
            return token.text.lower()
        else:
            return False
    else:
        return [sub.text.lower() for sub in token if sub.text.lower() not in ['"',',',"'",'-']]


def rhyme_from_list(pronunciations):
    match_dict = {}
    for n in range(0,len(pronunciations)): #given one word in the list
        for i in range(n+1,len(pronunciations)): #we'll check it against all words that follow
            if (pronunciations[n][1][1:] == pronunciations[i][1][-len(pronunciations[n][1][1:]):] or
                    pronunciations[i][1][1:] == pronunciations[n][1][-len(pronunciations[i][1][1:]):]):
                # need to cutt out super short rhymes - else you get things like 'is' rhyming
                # with every word that ends with s
                if ((len(pronunciations[n][1][1:]) > 2 or len(pronunciations[n][1]) ==3) and 
                    (len(pronunciations[i][1][1:]) > 2 or len(pronunciations[i][1]) ==2)):
                    if len(pronunciations[n][1][1:]) < len(pronunciations[i][1][1:]): 
                        rhyme = ''.join(pronunciations[n][1][1:])
                    else:
                        rhyme = ''.join(pronunciations[i][1][1:])
                    if len(rhyme) > 2:
                    #if this rhyme has already been added to the dictionary of rhymes we can include it
                        if rhyme in match_dict.keys(): 
                            match_dict[rhyme] = list(set(match_dict[rhyme]+
                                                         [pronunciations[n][0],pronunciations[i][0]]))
                        else: #if the dictionary doesn't have an entry for this rhyme, we put in a 
                            match_dict[rhyme] = [pronunciations[n][0],pronunciations[i][0]]

    return match_dict
    match_dict = {}
    
    
def parsed_rhymes(parsed_text):
    stop_words = list(set(stopwords.words('english')))
    nouns = [] #minor snag taking the set of a list with nested lists...
    for item in parsed_text['nouns']:
        nouns.append(item.text.lower())
    noun_list = list(set(nouns))

    #building simple text lists to iterate rhyme finding function over
    verb_list = [token.text.lower() for token in parsed_text['verbs']]
    verb_list = [item for item in set(verb_list) if item not in stop_words]
    #attaching pronunciations 
    noun_pron = []
    for word in noun_list:
        if type(word) == str:
            try:
                noun_pron.append((word,pron_dict[word]))
            except:
                pass
        else:
            try:
                noun_pron.append((word,pron_dict[word[-1]]))
            except:
                pass
    noun_rhymes = rhyme_from_list(noun_pron)
    
    verb_pron = []
    for word in verb_list:
        if type(word) == str:
            try:
                verb_pron.append((word,pron_dict[word]))
            except:
                pass
    verb_rhymes = rhyme_from_list(verb_pron)
    master_dict = {'nouns':noun_rhymes, 'verbs':verb_rhymes}
    return master_dict

In [149]:
parsed_rhymes(parsed_text)

{'nouns': {'AW1ND': ['ground', 'compound', 'round'],
  'AE1ND': ['demand', 'land']},
 'verbs': {'RIH1NGK': ['shrink', 'drink'],
  'EY1KS': ['makes', 'takes'],
  'EY1KIH0NG': ['making', 'waking'],
  'IH1TIH0NG': ['getting', 'spitting']}}

And now we can assign our rhyme scheme, using nouns for the a rhymes and verbs for the b rhymes.

I'll also adjust my pronunciation functions to pass over the dictionary of parsed words/chunks and return a dictionary that is still partitioned by part of speech.

In [9]:
def assign_pos_rhymes(parsed_text):
    rhyme_dict = parsed_rhymes(parsed_text)
    noun_rhyme_list = list(rhyme_dict['nouns'].values())
    rhyme_list_3 = [rhyme for rhyme in noun_rhyme_list if len(rhyme)>2]
    if len(rhyme_list_3) >1:
        a_rhyme = rhyme_list_3[np.random.randint(1,len(rhyme_list_3))]
    else:
        a_rhyme = rhyme_list_3[0]
    verb_rhyme_list = list(rhyme_dict['verbs'].values())
    if len(verb_rhyme_list) > 1:
        b_rhyme = verb_rhyme_list[np.random.randint(1,len(verb_rhyme_list))]
    else:
        b_rhyme = verb_rhyme_list[0]
    
    a_in = np.random.choice(len(a_rhyme), 3, replace=False) #sample 3 words out of the rhyme list
    b_in = np.random.choice(len(b_rhyme), 2, replace=False)
    return (a_rhyme[a_in[0]],a_rhyme[a_in[1]],a_rhyme[a_in[2]]), (b_rhyme[b_in[0]],b_rhyme[b_in[1]])


In [153]:
assign_pos_rhymes(parsed_text)

(('compound', 'round', 'ground'), ('makes', 'takes'))

In [10]:
def robust_pronouncer_pos(word):
    if type(word) != spacy.tokens.span.Span: 
        return word_pronouncer(word.text.lower())
    else:
        temp = []
        for n in range(0,len(word)):
            temp.append(word_pronouncer(word[n].text.lower()))
        stresses = str()
        for item in temp:
            stresses += item['stresses']
    return {'stresses':stresses,'syllables':len(stresses)}

def master_dict_maker_for_parsed(list_of_words):
    master_dict = {}
    for word in list_of_words:
        try:
            pron = robust_pronouncer_pos(word)
            if pron['syllables'] in master_dict.keys():
                master_dict[pron['syllables']].append((word, pron['stresses']))
            else:
                master_dict[pron['syllables']] = [(word, pron['stresses'])]
        except:
            pass
    return master_dict
#returning a slightly different sort of master dictionary, now it's nested, 
#with sub-dictionaries for each part of speach
def parsed_pronunciation_dict(parsed_text):
    pron_dict = {}
    pron_dict['nouns'] = master_dict_maker_for_parsed(parsed_text['nouns'])
    pron_dict['verbs'] = master_dict_maker_for_parsed(parsed_text['verbs'])
    pron_dict['adjectives'] = master_dict_maker_for_parsed(parsed_text['adjectives'])
    pron_dict['noun_chunks'] = master_dict_maker_for_parsed(parsed_text['noun_chunks'])
    return pron_dict

In [195]:
master_dict = parsed_pronunciation_dict(parsed_text)

In [187]:
master_dict['noun_chunks']

{4: [(roasted pieces, '1010'),
  (the bitterness, '0100'),
  (global warming, '1010'),
  (the consumer, '0010'),
  (coffee acids, '1010'),
  (The chemistry, '0100'),
  (its namesake plant, '0111'),
  (Jodi Helmer, '1010'),
  (the bitterness, '0100'),
  (a brewing lab, '1101'),
  (its first products, '0110'),
  (the same color, '0110'),
  (Other startups, '1011'),
  (the attention, '0010'),
  (development, '0100'),
  (required drinking, '0110'),
  (the company, '0100'),
  (the bitterness, '0100'),
  (that bitterness, '0100'),
  (a concoction, '1010'),
  (exactly what, '0101'),
  (the coffee world, '0101'),
  (the properties, '0100'),
  (growing coffee, '1010'),
  (identity, '0100'),
  (the ratios, '0101'),
  (other food brands, '1011')],
 3: [(a smooth cup, '111'),
  (its product, '010'),
  (the market, '010'),
  (our coffee, '110'),
  (market share, '101'),
  (rave reviews, '101'),
  (A concept, '110'),
  (my coffee, '110'),
  (the process, '011'),
  (consumers, '010'),
  (substitutes,

Now an update to my fill_out_line function into two forms: one to pull from noun-verb-object sections of the dictionary in that order for the long lines and the other to just pull out an adjective-noun or something like that to append to the short lines of the limerick

In [11]:
def find_word_of_leng_for_line(max_syllables, min_syllables, master_dict, stress_pattern):
    word_list = []
    for n in master_dict.keys():
        if n <= max_syllables and n>min_syllables:
            word_list = word_list + master_dict[n]
    searching = True
    while searching == True:
        if len(word_list) == 1:
            word = word_list[0]
            searching = False
        word = word_list[np.random.randint(0,len(word_list))]
        stress_test = 0
        for i in range(0,len(word[1])):
            if stress_pattern[i] == '1':
                if word[1][i] != stress_pattern[i]:
                    stress_test +=1
        if stress_test == 0:
            searching = False
    return word

def fill_out_long_line(syllables_total, master_dict, stress_pattern, line_beginning=[]):
    line = [word for word in line_beginning]
    syllables_left = syllables_total
    stress_pattern = stress_pattern
    if len(line) == 0:
        word = find_word_of_leng_for_line(syllables_left, 2, master_dict['noun_chunks'], stress_pattern)
        line.append(word[0])
        syllables_left -= len(word[1])
        stress_pattern = stress_pattern[len(word[1])-1:]
    if syllables_left >0:
        if syllables_left in master_dict['verbs'].keys():
            word = find_word_of_leng_for_line(syllables_left,syllables_left-1, master_dict['verbs'], stress_pattern)
            line.append(word[0])
            syllables_left -= len(word[1])
            stress_pattern = stress_pattern[len(word[1])-1:]
        else:
            word = find_word_of_leng_for_line(syllables_left,syllables_left-2, master_dict['verbs'], stress_pattern)
            line.append(word[0])
            syllables_left -= len(word[1])
            stress_pattern = stress_pattern[len(word[1])-1:]
    return line

def fill_out_short_line(syllables_total, master_dict, stress_pattern, line_beginning=[]):
    line = [word for word in line_beginning]
    syllables_left = syllables_total
    stress_pattern = stress_pattern
    if len(line) == 0:
        word = find_word_of_leng_for_line(syllables_left, syllables_left-1, 
                                          master_dict['noun_chunks'], stress_pattern)
        line.append(word[0])
        syllables_left -= len(word[1])
        stress_pattern = stress_pattern[len(word[1])-1:]
    else:
        word = find_word_of_leng_for_line(syllables_left, syllables_left-1, 
                                          master_dict['adjectives'], stress_pattern)
        line = [word] + line
    return line

In [186]:
fill_out_long_line(6, master_dict, '10010010', line_beginning=[])

[a report, completed]

In [189]:
fill_out_short_line(3, master_dict, '10010010', line_beginning=[])

[coffee beans]

In [12]:
def write_nonsense_limerick(text):
    parsed_text = word_parse(text)
    a_rhymes, b_rhymes = assign_pos_rhymes(parsed_text)
    master_dict = parsed_pronunciation_dict(parsed_text)
    a_1 = (a_rhymes[0],robust_pronouncer(a_rhymes[0])['syllables'])
    line_1 = fill_out_long_line(8-a_1[1], master_dict, '10010010'[:-a_1[1]])+[a_1[0]]
    a_2 = (a_rhymes[1],robust_pronouncer(a_rhymes[1])['syllables'])
    line_2 = fill_out_long_line(8-a_2[1], master_dict, '10010010'[:-a_2[1]])+[a_2[0]]
    b_1 = (b_rhymes[0],robust_pronouncer(b_rhymes[0])['syllables'])
    line_3 = fill_out_short_line(5-b_1[1], master_dict, '10010010'[:-b_1[1]])+[b_1[0]]
    b_2 = (b_rhymes[1],robust_pronouncer(b_rhymes[1])['syllables'])
    line_4 = fill_out_short_line(5-b_2[1], master_dict, '10010010'[:-b_2[1]])+[b_2[0]]
    a_3 = (a_rhymes[2],robust_pronouncer(a_rhymes[2])['syllables'])
    line_5 = fill_out_long_line(8-a_3[1], master_dict, '10010010'[:-a_3[1]])+[a_3[0]]
    return line_1, line_2,line_3,line_4,line_5

In [199]:
write_nonsense_limerick(text)

([beekeeper, experience, 'round'],
 [coffee beans, experience, 'ground'],
 [A taste test, 'waking'],
 [A concept, 'making'],
 [a level, expected, 'compound'])

Below is some code to try and plug into various news apis to pull texts in a more automated manner as well as some example limericks. Results are mixed!

In [200]:
from newsapi import NewsApiClient
from newspaper import Article


In [203]:
def nyt_top_api(num_texts):
    resp = requests.get('https://api.nytimes.com/svc/mostpopular/v2/viewed/1.json?api-key=xQnzVylvmGHfGWNKzhP6AvSpBTr1fpcK')
    data = resp.json()
    df = pd.DataFrame(data['results'][0:num_texts])
    return df

articles = nyt_top_api(25)

In [211]:
articles.loc[14]['url']

'https://www.nytimes.com/interactive/2019/07/06/us/migrants-border-patrol-clint.html'

In [204]:
def get_text(df):
    urls = df.url
    texts = []
    for url in urls:
        article = Article(url)
        article.download()
        article.parse()
        t = article.text
        texts.append(t)
    return pd.DataFrame(texts)

texts = get_text(articles)

In [205]:
[len(text) for text in texts[0]]

[1130,
 985,
 1913,
 1907,
 1196,
 2366,
 1955,
 2286,
 1914,
 2095,
 1425,
 2277,
 1544,
 3691,
 24773,
 1653,
 1051,
 1572,
 1443,
 960]

In [207]:
write_nonsense_limerick(texts[0][14])

([a fixture, equipped, 'director'],
 [older boys, described, 'inspector'],
 [a border wall, 'tried'],
 [instant oatmeal, 'cried'],
 [their parents, including, 'sector'])

In [219]:
write_nonsense_limerick(texts[0][14])

([Mexico, equipped, 'inspection'],
 [nobody, conducted, 'action'],
 [nobody, 'return'],
 [her infant son, 'learn'],
 [Fan Trailer, converted, 'section'])

In [220]:
write_nonsense_limerick(texts[0][14])

([their parents, expressed, 'inspection'],
 [Mr. Hull, intended, 'section'],
 [her infant son, 'tracked'],
 [A Migrant Jail, 'backed'],
 [all her years, resemble, 'action'])

In [210]:
write_nonsense_limerick_simple(texts[0][14])

(['sleep', 'Tents', 'hospital', 'took', 'return'],
 ['agency', 'Officials', 'held', 'learn'],
 ['asked', 'floor', 'held', 'table'],
 ['went', 'said', 'unable'],
 [('A', 'Forward'), 'said', 'reports', 'concern'])

In [235]:
alt_text = '''
Speaker Nancy Pelosi said they have no following in Congress. Representative Alexandria Ocasio-Cortez of New York shot back that she and three of her fellow liberal freshmen, darlings of the left known collectively as “the squad,” are wielding the real power in the party.
Six months into the new House Democratic majority, long-simmering tensions between the speaker and the squad — Representatives Ocasio-Cortez, Ilhan Omar of Minnesota, Rashida Tlaib of Michigan and Ayanna Pressley of Massachusetts — have boiled over in the most public of ways, setting off a flurry of criticism of Ms. Pelosi among liberal activists and reinvigorating a debate within the party about how best to stand up to President Trump.
The fire was lit by a $4.6 billion border aid package passed by Congress that the quartet argued had empowered Mr. Trump’s immigration crackdown. But the forest already was a tinder box, dried by the monthslong debate over impeachment, earlier dust-ups with Ms. Omar and Ms. Tlaib and over Ms. Ocasio-Cortez’s Green New Deal, and looming debates over a $15-an-hour minimum wage bill and funding for Immigration and Customs Enforcement.
The squabble is all the more notable because it pits Ms. Pelosi, the liberal San Francisco congresswoman who is the most powerful elected woman in American history, against a group of progressive Democratic women of color who have broken barriers of their own as part of the most diverse class ever to serve in the House.
"This is an inevitable tension between a few progressives with one priority, which is their ideology, and a speaker with many priorities, including preserving the majority in the House, electing a Democratic president against Trump, and responding to the consensus of her caucus,” said Steve Israel, a Democrat and former representative of New York. “To the extent that it distracts from Donald Trump and becomes a circular firing squad among Democrats, it can be lethal.”
Others see an old guard defending itself against powerful young voices demanding change.
“Those freshman members are breaking through, and they’re building a movement, and the more power that movement gains, the more persuasive they will be to Pelosi,” said Brian Fallon, a former spokesman for Senator Chuck Schumer and Hillary Clinton.
The contretemps began when Maureen Dowd, the New York Times columnist, asked Ms. Pelosi about the squad’s fury over the border aid package. The speaker noted that the group had failed to persuade any other Democrats to join them last month in voting against the House’s version of the bill, which placed restrictions on how the administration could spend the money and demanded standards of care at migrant detention centers.
“All these people have their public whatever and their Twitter world,” Ms. Pelosi told Ms. Dowd in an interview published over the weekend by The Times. “But they didn’t have any following. They’re four people, and that’s how many votes they got.”
Ms. Ocasio-Cortez, the Queens congresswoman who upset a 20-year Democratic incumbent in a primary and who has carved out a reputation as an outspoken and social-media savvy firebrand in the halls of Congress, responded tartly in a string of Twitter posts — a public show of defiance to the leader of her party 50 years her senior.
“That public ‘whatever’ is called public sentiment,” she wrote to her more than 4.7 million followers in a message that was recirculated 10,000 times and “liked” by 65,000 people. “And wielding the power to shift it is how we actually achieve meaningful change in this country.”
Ms. Omar chimed in with a tweet of solidarity. “Patetico!” she wrote on her personal Twitter account, with more than 1 million followers. “You know they’re just salty about WHO is wielding the power to shift ‘public sentiment’ these days, sis. Sorry not sorry.”
Ms. Ocasio-Cortez’s chief of staff, Saikat Chakrabarti, went much further, arguing in a series of tweets that his boss and her first-term colleagues were better at leading than Ms. Pelosi was, that Democratic leaders were not willing to fight for their principles, and that the speaker had failed to deliver any Democratic victories while shrinking from impeachment proceedings against Mr. Trump.
“Pelosi claims we can’t focus on impeachment because it’s a distraction from kitchen table issues,” Mr. Chakrabarti wrote. “But I’d challenge you to find voters that can name a single thing House Democrats have done for their kitchen table this year. What is this legislative mastermind doing?”
The back and forth has less to do with ideological differences between Ms. Pelosi and the young crop of progressives than their divergent styles and agendas.
Ms. Pelosi, whose legislative triumphs include muscling the Affordable Care Act through the House in 2010, has focused on using the House Democrats’ power to challenge Mr. Trump by advancing legislation that appeals to the broadest possible swath of Democrats, including the more than two dozen moderate lawmakers elected in districts carried by the president in 2016. She has kept the fractious caucus united on measures addressing health care, gun safety, election reforms and immigration, even as divisions persist over whether to impeach Mr. Trump, a step she has so far refused to endorse.
The speaker is also giving voice to an undercurrent of resentment among Democratic lawmakers toward Ms. Ocasio-Cortez and her group, whom they see as using their megaphones to sow intraparty divisions and burnish their own brands without achieving any results for Democrats.
Ms. Pelosi seemed to allude to that on Monday when she was asked to clarify her remarks to Ms. Dowd.
“It wasn’t dismissive; it was a statement of fact,” Ms. Pelosi told a reporter in San Francisco on Monday, saying while most House Democrats had “voted to protect the children” by supporting the House’s humanitarian aid bill, the squad had chosen not to. “They were four who argued against the bill, and they were the only four who voted against the bill. All I said was nobody followed their lead.”
“They have a following in the public,” Ms. Pelosi added. “I’m just talking about in the Congress.”
The foursome has helped to redefine their party’s message, pushing multi-trillion-dollar ideas like the Green New Deal, Medicare for all, and tuition-free college that have drawn broad rhetorical support, including from Democratic presidential candidates. But they have yet to translate their vision into concrete legislative achievement.
The squad and its allies argue that they are tapping into the real energy in the Democratic base with their uncompromising and unapologetic stances.
“Representing the movement that actually helped to put everyone in Congress into office and give Pelosi her gavel is a critical role, and they’ve been at the forefront of pushing the boundaries of what is possible in Congress,” said Leah Greenberg, the executive director of Indivisible, a progressive advocacy group.
Liberal activists tried to use the speaker’s comments to stoke outrage — and to raise money to mount primaries against incumbent Democrats they deem insufficiently liberal.
“AOC and The Squad have changed the entire national debate,” said an email rehashing the spat from the Progressive Change Campaign Committee, which offered a colorful “I STAND WITH AOC” sticker to anyone who donated to their work “electing more AOC’s to Congress.”
Mr. Fallon, now the executive director of the grass-roots progressive group Demand Justice, said Ms. Ocasio-Cortez has demonstrated a unique ability to grab the public spotlight for liberal candidates and causes, as she did last week when she visited a migrant detention center in Texas. But Ms. Pelosi has an entirely different mandate, he argued, one that her recent comments may have been designed to subtly convey.
“I think more than anything it’s a challenge to this ascending wing of the party, that if they actually want to move beyond being the protest wing and have leadership follow their strategy, that they need to grow their base of support and leave her with no option,” he said.
'''

In [238]:
write_nonsense_limerick(alt_text)

([their megaphones, simmering, 'years'],
 [President Trump, focus, 'centers'],
 [Mr. Trump, 'voted'],
 [Ms. Omar, 'noted'],
 [nobody, translate, 'followers'])

In [239]:
write_nonsense_limerick(alt_text)

([a few progressives, argued, 'years'],
 [a public show, grow, 'followers'],
 [darlings, 'demanded'],
 [standards, 'responded'],
 [meaningful change, follow, 'centers'])

In [241]:
write_nonsense_limerick(alt_text)

([Steve Israel, ascending, 'centers'],
 [their own brands, appeals, 'followers'],
 [any results, 'find'],
 [her gavel, 'designed'],
 [meaningful change, visited, 'years'])

In [245]:
alt_text2 = '''
Ross Perot, Brash Texas Billionaire Who Ran for President, Dies at 89.
Ross Perot, the wiry Texas gadfly who made a fortune in computer services, amazed the nation with audacious paramilitary missions to Vietnam and Iran, and ran for president in 1992 and 1996 with populist talk of restoring Norman Rockwell’s America, died on Tuesday at his home in Dallas. He was 89.

The cause was leukemia, a family spokesman, James Fuller, said.

They called him the man from Texarkana, but he really came out of an era — the Great Depression, World War II and the exuberant postwar years — when boys had paper routes, folks tuned in to the radio and patriots rolled up their sleeves for Uncle Sam and built innovative companies and a powerful nation.

“Most people give up just when they’re about to achieve success,” Mr. Perot liked to say. “They quit on the one-yard line. They give up at the last minute of the game one foot from a winning touchdown.”

He was no quitter: an Eagle Scout, a Navy officer out of Annapolis, a top I.B.M. salesman, the founder of wildly successful data processing enterprises, a crusader for education and against drugs, a billionaire philanthropist. In 1969, he became a kind of folk hero with a quixotic attempt to fly medicine and food to American prisoners of war in North Vietnam.
In 1979 he staged a commando raid that he asserted had freed two of his employees, and thousands of criminals and political prisoners, from captivity in revolutionary Iran.
And in 1992 he became one of the most unlikely candidates ever to run for president. He had never held public office, and he seemed all wrong, like a cartoon character sprung to life: an elfin 5 feet 6 inches and 144 pounds, with a 1950s crew cut; a squeaky, nasal country-boy twang; and ears that stuck out like Alfred E. Neuman’s on a Mad magazine cover. Stiff-necked, cantankerous, impetuous, often sentimental, he was given to homespun epigrams: “If you see a snake, just kill it. Don’t appoint a committee on snakes.”

Under the banner “United We Stand America,” he spent $65 million of his billions in a campaign that featured innovative half-hour infomercials about himself and his ideas. They were popular, with ratings that sometimes surpassed those of prime-time sitcoms. Ignoring negative newspaper and magazine articles, he laid siege to radio and television talk shows. Switchboards lit up with calls from people wanting to volunteer.

Before long, millions were responding to his calls to cut government deficits, red tape and waste, to begin rebuilding the crumbling cities and to restore his vision of America: the small-town life idealized in Rockwell’s homey portraits of ballpark patriotism, barbershop wisdom and flag-draped Main Street, a world away from corrupt Washington.
While Mr. Perot had done business with every administration since Lyndon B. Johnson’s, the federal government was one of his favorite targets. Washington, he told its own denizens, “has become a town with sound bites, shell games, handlers, media stuntmen who posture, create images, talk, shoot off Roman candles, but don’t ever accomplish anything. We need deeds, not words, in this city.”

Improbably, he surged in the polls while the Republican incumbent, George Bush, and the Democrat, Bill Clinton, trained their fire on each other. Polls showed that Mr. Perot’s support came from across the spectrum, from Democrats and Republicans, conservatives and liberals, mostly from the middle class. Citizen drives got him on the ballot in all 50 states. He was on the cover of Time magazine.

But at the peak of his popularity, he unexpectedly dropped out of the race. Months later, he jumped back in, saying his withdrawal had been prompted by Republican “dirty tricks” to sabotage his daughter’s wedding with faked compromising photographs.

He did surprisingly well in three presidential debates, often mocking the “gridlock” in Washington. “It’s not the Republicans’ fault, of course, and it’s not the Democrats’ fault,” he said in the second round. “Somewhere out there there’s an extraterrestrial that’s doing this to us, I guess.”
On Election Day, Mr. Perot finished with 19 percent of the popular vote — almost 20 million ballots — compared with 38 percent for Mr. Bush and 43 percent for Mr. Clinton. It was the strongest third-party showing since Theodore Roosevelt’s Bull Moose run in 1912.

It also led to claims by some Republicans, including the president’s son and future president George W. Bush, that Mr. Perot’s candidacy had cost President Bush a second term — a contention refuted by many political analysts, who pointed to, among other things, exit polls showing that Mr. Perot’s strength had not come disproportionately from defecting Republicans.

In 1996, Mr. Perot ran again, this time on the new Reform Party ticket, but he fared poorly. By then the epigrams had paled, and voters suspected that his business strengths, the risk-taking and stubborn autocratic personality, might not serve a president constrained by Congress and public opinion. And by then more was known of Mr. Perot, who could be thin-skinned and meanspirited, who had subjected employees to moral codes and lie detector tests, who was drawn to conspiracy theories and had hired private detectives to chase his suspicions.

His candidacy was crippled when a commission refused to let him join debates between President Clinton and the Republican nominee, Senator Bob Dole, on the grounds that he did not have a realistic chance of being elected. He won only 8 percent of the vote. But, as he liked to say, “Failures are like skinned knees: painful but superficial.”
'''


In [246]:
write_nonsense_limerick(alt_text2)

([populist talk, 'education'],
 [Uncle Sam, 'administration'],
 [Mr. Perot, 'rolled'],
 [North Vietnam, 'told'],
 [Mr. Perot, winning, 'nation'])

In [249]:
write_nonsense_limerick(alt_text2)

([images, won, 'education'],
 [government deficits, 'nation'],
 [President Bush, 'rolled'],
 [North Vietnam, 'told'],
 [a campaign, 'administration'])

In [274]:
write_nonsense_limerick(alt_text2)

([no quitter, restoring, 'nation'],
 [Ross Perot, came, 'education'],
 [Mr. Perot, 'quit'],
 [President Bush, 'lit'],
 [a campaign, 'administration'])

In [276]:
write_nonsense_limerick(alt_text2)

([Senator Bob Dole, cut, 'nation'],
 [Mr. Perot, 'education'],
 [Time magazine, 'trained'],
 [anything, 'constrained'],
 [other things, 'administration'])

In [13]:
def scrape_articles_text(url):

    session = requests.Session()

    req = session.get(url)
    soup = BeautifulSoup(req.text, 'lxml')

    paragraph_tags = soup.find_all('p', class_= 'css-exrw3m evys1bk0')
    if paragraph_tags == []:
        paragraph_tags = soup.find_all('p', itemprop = 'articleBody')

    article = ''
    for p in paragraph_tags:
        article = article + ' ' + p.get_text()

    # Clean article replacing unicode characters
    article = article.replace(u'\u2018', u"'").replace(u'\u2019', u"'").replace(u'\u201c', u'"').replace(u'\u201d', u'"')

    return article

In [24]:
url = 'https://www.nytimes.com/2019/09/12/science/solar-energy-power-electricity.html'
text = scrape_articles_text(url)


In [25]:
write_nonsense_limerick(text)

([a chafing dish, generates, 'day'],
 [a single light, generate, 'way'],
 [solar cells, 'tested'],
 [Raman, 'suggested'],
 [researchers, developing, 'spray'])