In [None]:
import numpy as np
%load_ext cython

import re, sys, random, math
from collections import Counter
from timeit import default_timer as timer
from IPython.display import display, clear_output, Markdown, Latex

from matplotlib import pyplot

# A token is either a run that starts and ends with a word character and may
# contain hyphens or apostrophes in between (e.g. "well-worn", "o'er"), or a
# single word character.  The pattern is a raw string so that "\w" and "\-"
# reach the regex engine untouched instead of being treated as (invalid)
# Python string escapes.
word_pattern = re.compile(r"\w[\w\-']*\w|\w")

import pronouncing

In [None]:
## Read the documents file

# Created here but not filled in by this cell.
word_counts = Counter()
documents = []

# Every row of sonnets.tsv must split into exactly four tab-separated fields
# (poem id, time period, genre, text); the text uses "|" between verse lines.
# The `with` block guarantees the file handle is closed once reading is done.
with open("sonnets.tsv", encoding="utf-8") as sonnets_file:
    for line in sonnets_file:
        (poem_id, time_period, genre, text) = line.rstrip().split("\t")

        # One list of word tokens per verse line.
        lines = [ word_pattern.findall(verse) for verse in text.split("|") ]

        # "original" keeps the raw, unstripped row exactly as it was read.
        documents.append({ "original": line, "lines": lines,
                          "poem_id": poem_id, "time_period": time_period, "genre": genre })


In [None]:
# Number of topics iterated over when printing topic summaries.
# NOTE(review): presumably equals the size of the second axis of both
# probability arrays below -- confirm against the saved model.
num_topics = 60

# Pre-computed model arrays, loaded in this order.  Elsewhere in the notebook
# doc_topic_probs is indexed as [document, topic], and the rows of
# word_topic_probs are paired one-to-one with the entries of vocabulary.
doc_topic_probs, word_topic_probs, vocabulary = [
    np.load(array_path)
    for array_path in (
        "poetry_doc_topics.npy",
        "poetry_word_topics.npy",
        "vocabulary.npy",
    )
]

In [None]:
def topic_words(topic, n_words=12):
    """Return the highest-weighted words of a topic as one space-separated string.

    topic   -- column index into word_topic_probs
    n_words -- how many words to include (default 12)

    Words are ordered by descending weight; equal weights fall back to
    descending word order because (weight, word) pairs are compared as tuples.
    """
    weighted_vocab = list(zip(word_topic_probs[:, topic], vocabulary))
    weighted_vocab.sort(reverse=True)
    strongest = weighted_vocab[:n_words]
    return " ".join(word for _weight, word in strongest)

def print_all_topics():
    """Print one line per topic: the topic index followed by its top words.

    Covers topic indices 0 .. num_topics - 1 and uses topic_words() with its
    default word count.
    """
    for topic_id in range(num_topics):
        summary = topic_words(topic_id)
        print(topic_id, summary)

In [None]:
# Print every topic index together with its top words, one topic per line.
print_all_topics()

In [None]:
def top_docs(topic, n_docs=10):
    """Print the documents in which `topic` has the highest probability.

    topic  -- column index into doc_topic_probs
    n_docs -- how many documents to list (default 10)

    Each output line shows the document index, the topic's share in that
    document as a percentage with one decimal, and the document's stored
    "original" text.
    """
    topic_column = doc_topic_probs[:, topic]
    # Negating the column makes argsort yield the largest probabilities first.
    ranked_doc_ids = np.argsort(-topic_column)
    for doc_id in ranked_doc_ids[:n_docs]:
        percent = 100 * topic_column[doc_id]
        original_text = documents[doc_id]["original"]
        print("{} {:.1f}% | {}".format(doc_id, percent, original_text))

In [None]:
# Tally how many documents carry each time-period label.
time_period_counter = Counter(doc["time_period"] for doc in documents)

# Bare expression so the notebook displays the (label, count) pairs,
# most frequent first.
time_period_counter.most_common()

In [None]:
# Time-period labels of interest, in chronological order.
# NOTE(review): no entry covers 1580-1603 (between the Tudor and the
# Jacobean/Caroline labels) -- confirm whether that omission is intentional.
time_periods = [
    'Fifteenth-Century Poetry',
    'Tudor 1500-1580',
    'Jacobean and Caroline 1603-1660',
    'Restoration 1660-1700',
    'Early Eighteenth-Century 1700-1749',
    'Later Eighteenth-Century 1750-1799',
    'Early Nineteenth-Century 1800-1834',
    'Mid Nineteenth-Century 1835-1869',
    'Later Nineteenth-Century 1870-1899',
    'Twentieth-Century 1900-1999',
]

In [None]:
def rhyme(line):
    """Describe how a verse line ends for rhyme matching.

    line -- non-empty sequence of word tokens; only the final token is used.

    Returns a tuple (lower-cased final word, set of rhyming parts), with one
    rhyming part per pronunciation that `pronouncing` reports for the word.
    The set is empty when no pronunciation is found.
    """
    final_word = line[-1]
    rhyming_parts = {
        pronouncing.rhyming_part(pronunciation)
        for pronunciation in pronouncing.phones_for_word(final_word)
    }
    return (final_word.lower(), rhyming_parts)

def sorted_tuple(a, b):
    """Return a and b as a 2-tuple with the smaller value first.

    When a is not strictly less than b (including when they are equal), the
    result is (b, a).
    """
    return (a, b) if a < b else (b, a)

In [None]:
# Counts the documents that were actually scanned (documents, not verse
# lines, despite the name).  Handy as a cut-off when trying the loop on a
# small sample while debugging.
line_num = 0

# One Counter of rhyme pairs for every time period of interest.
time_period_rhymes = {period: Counter() for period in time_periods}

for document in documents:
    time_period = document["time_period"]
    if time_period not in time_period_rhymes:
        # Documents from periods outside `time_periods` are skipped entirely.
        continue

    ## (end word, {possible rhyming parts}) for every non-empty verse line
    line_endings = [rhyme(verse) for verse in document["lines"] if len(verse) > 0]
    pair_counts = time_period_rhymes[time_period]

    for position, (word, sounds) in enumerate(line_endings):
        # Walk the earlier lines from the top of the poem; the first one that
        # shares a rhyming part is recorded and the search stops, so each line
        # contributes at most one pair.  A repeated end word also matches
        # itself here.
        for earlier_word, earlier_sounds in line_endings[:position]:
            if sounds & earlier_sounds:
                pair_counts[sorted_tuple(word, earlier_word)] += 1
                break

    line_num += 1

In [None]:
# For each period, print its label and then its 30 most frequent rhyme pairs
# (as a list of (pair, count) tuples) on the following line.
for time_period in time_periods:
    top_pairs = time_period_rhymes[time_period].most_common(30)
    print(time_period, top_pairs, sep="\n")
