In [1]:
import nltk #NLTK will do the lion's share of the work here
from bs4 import BeautifulSoup #this is only to scrape the NPR plain text site, everything else just takes any text
import requests
import numpy as np
from nltk.corpus import cmudict
pronounciations = cmudict.dict() #the NLTK pronunciation dictionary
from nltk.corpus import stopwords


In [2]:
#first thing is just to get text, the NPR plain text site is easy to work with, so that's what I'm using here
#Their article urls all follow the same pattern, so this takes in an article id number 
def NPR_text(id_number):
    html_page = requests.get('https://text.npr.org/s.php?sId={}'.format(id_number))
    soup = BeautifulSoup(html_page.content, 'html.parser')
    text = soup.text.split('\n')[15:-10]
    return text

The first task was to make a guess at a 'topic' for any piece of text. In this case, it's more like a 'key-word' than a topic. The notion is that the text is likely to be about a proper noun, a person or institution, say, and that this proper noun will appear disproportionally often in the text. So, I have a function that simply parses the text for part of speech and returns the most common proper noun.

This being my first project, I simply used the built in NLTK POS tagger, the notion being that I would later return to it and build my own tagger that could be inserted into this in place of the NLTK tagger. I've since started to use SpaCy and realize that a) SpaCy's tagger is better and faster both than NLTK and anything I'm likely to make and b) that Spacy would have solved another of my issues. So everything after this is going to be Spacy based.

One of the quirks about doing this with NLTK was that names would get split up into two tokens, and NLTK wouldn't/couldn't keep track of names together as entities. For instance, a news article about Trump might refer to 'Trump' or 'Donald Trump'. If you counted up all the tagged proper nouns in the text, it would return 'Trump' as the most often used, but it wouldn't know to return the full name. Hence in this case I built a little helper funtion to look for 'names', which I reasoned would be bigrams in which both words were tagged as proper nouns, so that I could search my list of names for the most common proper noun and return the full name if the most common proper noun was part of a name.

In [3]:
def find_names(text_words, nnps):
    text_bigrams = list(nltk.bigrams(text_words))
    names = [bigram for bigram in text_bigrams if bigram[0] in nnps
             and bigram[1] in nnps]
    return names

def find_topic(text):
    if type(text) == list:
        text_words = nltk.word_tokenize(text[0])
        for n in range(1,len(text)):
            text_words += nltk.word_tokenize(text[n])
            
    elif type(text) == str:
        text_words = nltk.word_tokenize(text)
    text_pos = nltk.pos_tag(text_words)
    
    stop_words = set(stopwords.words('english'))
    text_pos_minus_stop = [(word, pos) for (word,pos) in text_pos if 
                           word not in stop_words]
    NNPs = [word for (word,pos) in text_pos_minus_stop if pos=='NNP'] #NNP is just 'proper nouns'
    names = find_names(text_words, NNPs)
    most_common = nltk.FreqDist(NNPs).most_common(1)[0][0] 
    #NLTK has a convenient, built in frequency dictionary function
    
    #now to check if the most common proper noun is in a name, either first or last name
    if most_common in [name1 for (name1,name2) in names]:
        for name in names:
            if most_common == name[0]:
                return name
    if most_common in [name2 for (name1,name2) in names]:
        for name in names:
            if most_common == name[1]:
                return name
    return most_common

Next a series of functions to create working dictionaries from which to pull words for the haiku. 
1. tokenize the text, remove stop words, parse for part of speach
2. query the built in NLTK pronunciation dicitonary to calculate number of syllables
3. Return the lemmas of all the nouns and adjectives in the text, these will be the basis of the haiku to avoid tense/morphological issues
4. create a master syllable dictionary, indexed by number of syllables

In [4]:
def text_wordify(text):
    text_tokens = nltk.word_tokenize(text[0])
    for n in range(1,len(text)):
        text_tokens += nltk.word_tokenize(text[n])
    stop_words = set(stopwords.words('english'))
    text_pos = nltk.pos_tag(text_tokens)
    text_words = [(word,pos) for (word, pos) in text_pos if word not in stop_words]
    return text_words

#The NLTK pronunciation dictionary returns a list of phonemes for a given word, finding the number of syllables
#for a given word isn't intuitive. 'lay', 'play' and 'splay' all have one syllable, but would be different length
#lists in the pronunciation dictionary. Each syllable comes with a stress mark which is a number 0, 1 or 2, 
#so counting these digits will tell you how many syllables a word is.
def syllable_count(word):
    count = 0
    pron = pronounciations[word.lower()][0]
    for syl in pron:
        if syl[-1].isdigit():
            count +=1
    return count

def find_nouns_adjs(text_words):
    noun_adj_tags = ['NN','NNS','JJ','JJR','JJS']
    noun_adjs = [word for (word,pos) in text_words if pos in noun_adj_tags]
    wnl = nltk.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in noun_adjs]
    return lemmas

def create_syllable_lists(lemmas):
    syllables = {1:[],2:[],3:[],4:[],5:[]}
    freq_dict =nltk.FreqDist(lemmas)
    lemmas = list(set(lemmas))
    for word in lemmas:
        try:
            syllables[syllable_count(word)].append((word,freq_dict[word]))
        except:
            pass
    return syllables

Finally the functions that actually build the haiku:
1. select a random word, weighted by how often the word appears
2. fill out a line in the haiku by taking a number of syllables and filling it with random words until you run out of syllables
3. finally, write the haiku, inserting the 'topic' into line 1 (or line two if it is 6 or 7 syllables long) and filling out the rest of the haiku from there.

In [5]:
def rand_word(cfd):
    cum_dic = {}
    count = 0
    if len(cfd) == 1:
        return cfd[0][0]
    for word in cfd:
        count += word[1]
        cum_dic[word[0]] = count
    rand_int = np.random.randint(1,count+1)
    for word in cum_dic:
        if cum_dic[word] > rand_int:
            return word
        
def fill_out_line(num_syllables, syllable_dict, line = [], already_used=[]):
    while num_syllables > 1:
        find_syl = np.random.randint(1,6)
        if find_syl <= num_syllables and len(syllable_dict[find_syl]) >0:
            words = syllable_dict[find_syl]
            new_word = rand_word(words)
            if new_word not in already_used: #make sure we don't use the same word more than once
                line.append(new_word)
                already_used.append(new_word)
            num_syllables -= find_syl
    if num_syllables == 1:
        words = syllable_dict[1]
        new_word = rand_word(words)
        if new_word not in already_used:
            line.append(rand_word(words))
            already_used.append(new_word)
    return line, already_used #also return the already used list so we can use the same list across multiple lines

def write_haiku(id_num):
    text = NPR_text(id_num)
    text_words = text_wordify(text)
    topic = find_topic(text)
    noun_adjs = find_nouns_adjs(text_words)
    syllable_dict = create_syllable_lists(noun_adjs)
    
    if type(topic) !=str:
        topic_syls = syllable_count(topic[0]) + syllable_count(topic[1])
    else:
        topic_syls = syllable_count(topic)
    line_1 = []
    line_2 = []
    line_3 = []
    already_used = []
    if topic_syls <=5:
        if type(topic) !=str:
            line_1 += [word for word in topic]
        else:
            line_1.append(topic)
        line_1, already_used = fill_out_line(5-topic_syls, syllable_dict, line_1, already_used)
        line_2, already_used = fill_out_line(7, syllable_dict, line_2, already_used)
        line_3, already_used = fill_out_line(5, syllable_dict, line_3, already_used)
    elif topic_syls <8:
        if type(topic) !=str:
            line_2 += [word for word in topic]
        else:
            line_2.append(topic)
        line_1, already_used = fill_out_line(5, syllable_dict, line_1, already_used)
        line_2, already_used = fill_out_line(7-topic_syls, syllable_dict, line_2, already_used)
        line_3, already_used = fill_out_line(5, syllable_dict, line_3, already_used)

    return line_1, line_2, line_3

In [30]:
write_haiku(735578519)

(['Health', 'Care', 'People', 'short'],
 ['proponent', 'room', 'lobbying'],
 ['executive', 'health'])

In [32]:
write_haiku(735424158)

(['Iran', 'many', 'thing'],
 ['strong', 'sanction', 'sign', 'week', 'last'],
 ['financial', 'chance'])

In [29]:
write_haiku(735424158)

(['Iran', 'sanction', 'lot'],
 ['administration', 'speech', 'planned'],
 ['move', 'chance', 'nuclear'])

In [16]:
write_haiku(735525569)

(['letter', 'interview'],
 ['Treasury', 'Department', 'note'],
 ['new', 'indication'])

In [20]:
write_haiku(735274808)

(['China', 'trade', 'fact', 'trade'],
 ['national', 'month', 'wu', 'word'],
 ['least', 'education'])

In [15]:
write_haiku(735379562)

(['Islamic', 'Iran'],
 ['market', 'ambassador', 'drone'],
 ['country', 'power', 'oil'])

In [21]:
write_haiku(735578519)

(['Health', 'Care', 'insurer'],
 ['laboratory', 'care', 'People'],
 ['many', 'hospital'])

In [6]:
write_haiku(741382999)

(['Louisiana'],
 ['hurricane', 'morning', 'part', 'storm'],
 ['emergency', 'close'])

In [7]:
write_haiku(741382999)

(['Louisiana'], ['evacuation', 'road', 'line'], ['rain', 'infrastructure'])