1. Represent a sentence structure as a token pattern.
2. Search for all lines that have that structure.
3. Arrange the matching lines so that they rhyme.

In [203]:
import gzip, json
import os
import random
import re
import string
from collections import defaultdict

import pronouncing
import spacy
import tweepy
from spacy.matcher import Matcher
from spacy.util import filter_spans

In [4]:
# Load the spaCy English pipeline once; `nlp` is reused by every parsing cell below.
nlp = spacy.load('en_core_web_lg')

In [5]:
# Read the gzipped, newline-delimited JSON corpus and keep only the 's' field
# of each record (the text of one line).
# The context manager closes the file handle once the corpus has been read.
with gzip.open("gutenberg-poetry-v001.ndjson.gz") as corpus_file:
    all_lines = [json.loads(line.strip())['s'] for line in corpus_file]

In [79]:
len(all_lines)    # sanity check: number of corpus lines loaded

3085117

In [84]:
# Peek at the first entry to see what a single element looks like.
all_lines[0]

'The Song of Hiawatha is based on the legends and stories of'

In [None]:
# this cell not used

# Scratch experiments: sentence similarity, part-of-speech tags, noun chunks.
doc1 = nlp('Slung trousers melt in a roseate box.')
doc2 = nlp('A broken calendar oscillates like sunny tin.')
doc1.similarity(doc2)  # not the cell's last expression, so the score is not displayed


# Parse a random sample of 1000 corpus lines, one Doc per line.
all_lines_random_sample2 = random.sample(all_lines, k=1000)
all_lines_docs = [nlp(line) for line in all_lines_random_sample2]

# Print text and coarse part-of-speech tag for every token of one parsed line.
sample_doc = all_lines_docs[9]
for token in sample_doc:
    print(token.text, token.pos_)

list(sample_doc.noun_chunks)

In [149]:
%%time
# initial error: Text of length 121,559,782 exceeds maximum of 1,000,000.
# nlp.max_length = 2000000
# NOTE(review): the error above presumably came from parsing the whole corpus
# as one string; a 10,000-line sample is assumed to stay below the limit --
# confirm, since the sampled lines vary in length.

# Join a random sample of lines with newlines and parse the result as ONE Doc.
all_lines_random_sample = random.sample(all_lines, k=10000)
all_lines_onestring = "\n".join(all_lines_random_sample)
all_lines_doc = nlp(all_lines_onestring)
# Number of tokens in the parsed sample.
len(all_lines_doc)

CPU times: user 27.3 s, sys: 2.66 s, total: 30 s
Wall time: 31.5 s


97502

https://dsackerman.com/three-things-you-can-do-with-spacy/

spaCy tag list: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py

spaCy matcher: https://spacy.io/api/matcher

In [16]:
# Small hand-written document (sentences separated by newlines) for quick experiments.
test = nlp('Slung trousers melt in a roseate box.\nA broken calendar oscillates like sunny tin.\nThe craven linden growls swimmingly. Blowfish.\nThe man walks away.\nThe woman drives.')
# Uncomment to inspect the part-of-speech tag assigned to each token:
#for token in test:
#    print(token.text, token.pos_)

Note: the `{'OP': '?'}` wildcards can also match newline tokens, so a single match may span two lines of the corpus. This bug is kept for now because it adds some variety to the line structures.

In [150]:
def matchStructure(doc2match):
    """Return the non-overlapping spans of `doc2match` that fit the line pattern.

    The pattern is: a SPACE token, up to seven arbitrary tokens, one or more
    adjectives, a noun, an optional pronoun, a verb, up to seven arbitrary
    tokens, and a closing SPACE token.

    Args:
        doc2match: parsed spaCy Doc to search.

    Returns:
        The matched spans after `filter_spans` has been applied to them.
    """
    def padding():
        # Seven optional match-anything tokens.
        return [{'OP': '?'} for _ in range(7)]

    # The part of the pattern that defines the sentence structure itself.
    core = [
        {'POS': 'ADJ', 'OP': '+'},
        {'POS': 'NOUN'},
        {'POS': 'PRON', 'OP': '?'},
        {'POS': 'VERB'},
    ]
    pattern = [{'POS': 'SPACE'}] + padding() + core + padding() + [{'POS': 'SPACE'}]

    matcher = Matcher(nlp.vocab)
    matcher.add('corpse', [pattern])

    # Turn every (match_id, start, end) triple into a span of the document.
    spans = []
    for _match_id, start, end in matcher(doc2match):
        spans.append(doc2match[start:end])
    return filter_spans(spans)

In [155]:
# Run the pattern over the parsed sample and count the spans that were kept.
matched_lines = matchStructure(all_lines_doc)
len(matched_lines)

318

In [176]:
# Convert each matched span to plain text and trim surrounding whitespace
# (the pattern begins and ends on a SPACE token).
matched_lines_list = [l.text.strip() for l in matched_lines]

The cells below are adapted from this notebook: https://github.com/aparrish/gutenberg-poetry-corpus/blob/master/quick-experiments.ipynb

Build a dictionary that maps rhyming parts to a dictionary that maps words with that rhyming part to the lines of poetry that they're found at the end of:

In [200]:
# NOTE(review): this sample is not used by any cell visible here; kept in case
# something else relies on it.
all_lines_random_sample3 = random.sample(all_lines, k=100)

# rhyming part -> {lower-cased final word -> [lines ending in that word]}
by_rhyming_part = defaultdict(lambda: defaultdict(list))

# Built once instead of on every iteration. Still needed after the regex
# capture because "_" counts as a word character and can stay attached to
# the captured word.
strip_punctuation = str.maketrans('', '', string.punctuation)

for line in matched_lines_list:
    # Capture the final word; everything after it must be non-word characters.
    match = re.search(r'(\b\w+\b)\W*$', line)
    if match:
        # Use the captured word only. The whole match also contains the
        # trailing non-word characters, and any of them that are not ASCII
        # punctuation (dashes, curly quotes, spaces) would survive the
        # stripping below and make the pronunciation lookup fail.
        last_word = match.group(1).translate(strip_punctuation)
        pronunciations = pronouncing.phones_for_word(last_word)
        if len(pronunciations) > 0:
            # only the first listed pronunciation is considered
            rhyming_part = pronouncing.rhyming_part(pronunciations[0])
            # group by rhyming phones (for rhymes) and words (to avoid duplicate words)
            by_rhyming_part[rhyming_part][last_word.lower()].append(line)

# random_rhyming_part = random.choice(list(by_rhyming_part.keys()))
# random_rhyming_part, by_rhyming_part[random_rhyming_part]

Find the groups from the by_rhyming_part dictionary that have at least two different line-ending words:

In [201]:
# Each group maps final words to lines, so len(group) counts distinct final words.
rhyme_groups = [group for group in by_rhyming_part.values() if len(group) >= 2]

Find rhyming couplets loop:
1. Select a random rhyming group
2. Sample two keys (words) from that group
3. Print a random line for each of the two sampled words

In [206]:
amt_couplets = 100

for couplet_no in range(amt_couplets):
    # Pick one rhyme group, then two distinct line-ending words from it.
    group = random.choice(rhyme_groups)
    first_word, second_word = random.sample(list(group), 2)
    # One random line per word, with an empty line printed between the two.
    print(random.choice(group[first_word]), "", random.choice(group[second_word]), sep="\n")

In the fresh shade, his white flocks feeding near,

Bidding his tinted leaves glide, bidding clear
Whence no public virtues bloom?

Through the wide forest pricked the weary groom.
With equal hurry quit th' invaded shore,

But, ere the steely clouds began their war,
"The fiction pleased, our generous train complies,

Makes them such raptures know
That poorly satisfy our eyes
As now its strange swift shine

Lest my wee thing be na mine.
And volleys of small plaudits bring

A golden goblet gave.
I see them swing,
Or with their own hands wounded, or by fate

With these foreboding words restrains their hate:
Her tired hands crossed on her shrunken breast.

And then, with bitter grief oppressed,
So I paid a good dollar fer an ole black mar'.

As the spirits of strong men are,
The ox and ass their patient foreheads bow

Closing his benediction,
Good sailor come and tell me now
And choral Virgins listen to the song;

Where the blue river winds along,
Whose right hand blesses with increase and

In [None]:
# Authenticate to Twitter
# Authenticate to Twitter
# Credentials are read from environment variables so that real secrets never
# have to be written into (or saved with) the notebook. A missing variable
# raises a KeyError that names it.
auth = tweepy.OAuthHandler(os.environ["CONSUMER_KEY"], os.environ["CONSUMER_SECRET"])
auth.set_access_token(os.environ["ACCESS_TOKEN"], os.environ["ACCESS_TOKEN_SECRET"])

# Create API object
api = tweepy.API(auth)

# Create a tweet
api.update_status("Hello Tweepy")