In [1]:
from collections import defaultdict
from datetime import datetime
import math
from operator import itemgetter
import os
import random
import re

import numpy as np
import pandas as pd

%matplotlib inline

import little_mallet_wrapper as lmw

<br><br>

# MALLET Path

Change the path below to the location of your local MALLET installation.

In [2]:
# Location of the MALLET executable. Set the MALLET_PATH environment variable to
# override it without editing the notebook; otherwise CHANGE THE DEFAULT BELOW
# TO YOUR MALLET PATH.
MALLET_PATH = os.environ.get('MALLET_PATH',
                             '/Users/mah343/Documents/packages/mallet-2.0.8/bin/mallet')

<br><br>

# Load poetry dataset

Data is available here: https://www.kaggle.com/johnhallman/complete-poetryfoundationorg-dataset

In [3]:
# CHANGE THIS TO YOUR DATASET PATH
# (double-quoted so the apostrophe in the volume name needs no escaping)
poetry_path = "/Volumes/Maria's Black Passport/data/kaggle-poem-dataset/kaggle_poem_dataset.csv"

In [4]:
# 'https://www.kaggle.com/johnhallman/complete-poetryfoundationorg-dataset/download'

In [5]:
# Read the poem CSV at `poetry_path` into a DataFrame.
poetry_df = pd.read_csv(filepath_or_buffer=poetry_path)

# Preview ten randomly chosen rows (unseeded, so the rows differ between runs).
poetry_df.sample(n=10)

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
10150,10150,Christopher Marlowe,The Passionate Shepherd to His Love,44675,"Come live with me and be my love,\nAnd we will..."
1509,1509,Mark Rudman,Birthday Blues,50033,Today's the rider's birthday.\n\nI see you're ...
666,666,Terese Svoboda,Aphra Plays,58604,Aphra Behn is not wearing all her clothes\nin ...
9294,9294,Anna Rabinowitz,Notes: Coercive Counterintelligence Interrogat...,145312,HOW TO SUCCEED IN TORTURE\nWITHOUT REALLY TRYI...
9245,9245,John Koethe,North Point North,48106,I\n\n\nIn these I find my calling:\nIn the sho...
8149,8149,Gail Mazur,Maternal,46803,"On the telephone, friends mistake us now\nwhen..."
9659,9659,Alan Dugan,On Hurricane Jackson,55174,"Now his nose’s bridge is broken, one eye\nwill..."
12357,12357,James Joyce,Song,53968,My love is in a light attire\nAmong the apple ...
1972,1972,Ezra Pound,Canto XXXVI,54318,A Lady asks me\nI speak in season\nShe seeks r...
5332,5332,Claudia Emerson,Ground Truth,58431,"My brother's funeral over, the dark-clothed\nc..."


In [6]:
# Total number of rows (poems) in the dataset.
print(poetry_df.shape[0])

15652


In [7]:
# Number of poems whose Author field is exactly 'Robert Browning'.
print((poetry_df['Author'] == 'Robert Browning').sum())

39


In [8]:
# Number of poems whose Author field is exactly 'William Butler Yeats'.
print((poetry_df['Author'] == 'William Butler Yeats').sum())

47


In [9]:
# Number of poems whose Author field is exactly 'John Keats'.
print((poetry_df['Author'] == 'John Keats').sum())

32


In [10]:
# Number of poems whose Author field is exactly 'Emily Dickinson'.
print((poetry_df['Author'] == 'Emily Dickinson').sum())

57


In [11]:
# Number of poems whose Author field is exactly 'Christina Rossetti'.
print((poetry_df['Author'] == 'Christina Rossetti').sum())

30


In [12]:
# Run little_mallet_wrapper's string preprocessing over the text of every poem,
# keeping the DataFrame's row order.
training_documents = list(map(lmw.process_string, poetry_df['Content'].tolist()))

# Number of processed documents.
len(training_documents)

15652

In [13]:
# Spot-check one processed document (index 100) to see what process_string produced.
training_documents[100]

'year twelve daughters turn gone measured pace tho varying mien twelve froward sedater adorn festival reckless attire snow left mountain top fresh flowers withered meadow fig prune hung wrinkling last apple glow amid freckled leaves weary oxen blinkt trodden corn twisted vine whose bunches stood empty crate creak ere long beneath carried home season twelve months gentle hamadryad true love thy mansion thy dim mansion wood blasted laid desolate none dared violate precincts none dared pluck moss beneath alone remain thine old thallinos sat mute solitary sadness strange tale rhaicos died whole echion related force could ever make look back upon oaks father said echion thou must weigh carefully steady hand enough although longer comes store wax burn day night upon hollow stone milk honey lie may gods may dead pleas thallinos bore thither morn lighted left first visited upon solemn day hamadryad oak rhodope acon one age one hope one trust graceful nymph whose fate sorrowed slender pale firs

In [14]:
# One author name per poem, in DataFrame row order (so it lines up with training_documents).
authors = poetry_df.loc[:, 'Author'].tolist()

# Number of author entries.
len(authors)

15652

<br><br>

# Train topic model

Depending on the size of your dataset, training can take a while. For very large datasets, I recommend training outside of a notebook.

While training, you can observe progress by checking the terminal window from which you started Jupyter.

In [15]:
# Number of topics to train; also used as the suffix of the model, topic-keys and
# topic-distributions file names built in the next cell.
num_topics = 20

In [16]:
# CHANGE THIS TO YOUR OUTPUT DIRECTORY
output_directory_path = '/Users/mah343/Desktop/lmw-output'

# Files used by the training step, all inside the output directory.
training_data_path = f'{output_directory_path}/training.txt'
formatted_training_data_path = f'{output_directory_path}/mallet.training'

# These three carry the topic count as a file-name suffix.
model_path = f'{output_directory_path}/mallet.model.{num_topics}'
topic_keys_path = f'{output_directory_path}/mallet.topic_keys.{num_topics}'
topic_distributions_path = f'{output_directory_path}/mallet.topic_distributions.{num_topics}'

In [None]:
# Train the topic model on the preprocessed poems. The path arguments are the file
# locations defined in the previous cell; `training_documents` is the corpus and
# `num_topics` the number of topics.
# NOTE(review): presumably this invokes the MALLET binary at MALLET_PATH and writes
# each of the listed files — confirm against the little_mallet_wrapper docs.
lmw.train_topic_model(MALLET_PATH,
                      training_data_path,
                      formatted_training_data_path,
                      model_path,
                      topic_keys_path,
                      topic_distributions_path,
                      training_documents,
                      num_topics)

Importing data...
Training topic model...


<br><br>

# Load the topics

In [None]:
# Load the topic keys written by the training step. Reuse `topic_keys_path`
# (which already includes the num_topics suffix) instead of a hard-coded ".20",
# so this cell keeps working when num_topics is changed.
topics = lmw.load_topic_keys(topic_keys_path)

# Print each topic's index next to its keys.
for i, t in enumerate(topics):
    print(i, '\t', t)

In [None]:
# Load the per-document topic distributions written by the training step. Reuse
# `topic_distributions_path` (which already includes the num_topics suffix)
# instead of a hard-coded ".20", so this cell keeps working when num_topics is changed.
topic_distributions = lmw.load_topic_distributions(topic_distributions_path)

# Number of distributions loaded and the length of the first one.
len(topic_distributions), len(topic_distributions[0])

In [None]:
# Inspect the topic distribution loaded for the first document.
topic_distributions[0]

<br><br>

# Plot topics by category

In [None]:
# Authors to include in the plot.
target_labels = ['John Keats', 'Emily Dickinson', 'William Butler Yeats', 'Christina Rossetti']

# Heatmap of categories (here: authors) by topics, restricted to target_labels.
# NOTE(review): the fourth argument is presumably the output file path — confirm
# against the little_mallet_wrapper docs.
# NOTE(review): this relies on `topic_distributions` still holding the distributions
# loaded right after training; a later cell rebinds that name.
lmw.plot_categories_by_topics_heatmap(authors,
                                      topic_distributions,
                                      topics, 
                                      output_directory_path + '/categories_by_topics.pdf',
                                      target_labels=target_labels,
                                      dim=(10,4))

In [None]:
# Authors to include in each boxplot.
target_labels = ['John Keats', 'Emily Dickinson', 'William Butler Yeats', 'Christina Rossetti']

# One boxplot per topic, each saved as boxplot.<topic index>.pdf in the output directory.
for _topic_index in range(len(topics)):
    _boxplot_path = output_directory_path + '/boxplot.' + str(_topic_index) + '.pdf'
    lmw.plot_categories_by_topic_boxplots(authors, topic_distributions, topics, _topic_index,
                                          output_path=_boxplot_path,
                                          target_labels=target_labels,
                                          dim=(4,4))

<br><br>

# Plot topics over time

In [None]:
# Divide the training documents with num_chunks=10. The call returns three
# sequences, whose lengths are displayed below.
# NOTE(review): presumably `document_ids` and `times` are parallel to
# `divided_documents` (source document and chunk position) — confirm against
# the little_mallet_wrapper docs.
divided_documents, document_ids, times = lmw.divide_training_data(training_documents,
                                                                  num_chunks=10)

len(divided_documents), len(document_ids), len(times)

In [None]:
# File locations for the inference step on the divided documents, kept separate
# from the training-step files. The distributions file carries the topic count as suffix.
new_training_data_path = f'{output_directory_path}/training.split.txt'
new_formatted_training_data_path = f'{output_directory_path}/mallet.split.training'
new_topic_distributions_path = f'{output_directory_path}/mallet.split.topic_distributions.{num_topics}'

In [None]:
# Infer topic distributions for the divided documents. `formatted_training_data_path`
# and `model_path` are the training-step paths; the three `new_*` paths were
# defined in the previous cell for this step.
# NOTE(review): the role of each positional argument is inferred from the variable
# names — confirm against the little_mallet_wrapper docs.
lmw.infer_topics(MALLET_PATH,
                 formatted_training_data_path,
                 model_path,
                 new_training_data_path,
                 new_formatted_training_data_path,
                 new_topic_distributions_path,
                 divided_documents)
# Alternative last argument kept for reference: pass only the non-empty chunks.
# NOTE(review): filtering here would presumably leave the results out of step with
# `document_ids` and `times` — confirm before enabling.
#                  [d for d in divided_documents if len(d) > 0])

In [None]:
# Load the distributions written by the inference step on the divided documents.
# NOTE: this rebinds `topic_distributions`, replacing the distributions loaded
# after training; re-running the category plots after this cell would use these instead.
topic_distributions = lmw.load_topic_distributions(new_topic_distributions_path)

# Number of distributions loaded and the length of one of them (index 2).
len(topic_distributions), len(topic_distributions[2])

In [None]:
# Sanity check: every loaded distribution should have one value per topic.
# Compare against num_topics rather than a hard-coded 20 so the check stays
# valid when the topic count is changed. Prints 'no' once per mismatch.
for _distribution in topic_distributions:
    if len(_distribution) != num_topics:
        print('no')

In [None]:
# Draw one topics-over-time plot per topic, using the distributions and `times`
# obtained for the divided documents.
for _topic_index in range(len(topics)):
    lmw.plot_topics_over_time(topic_distributions, topics, times, _topic_index)