In [None]:
from collections import defaultdict
from datetime import datetime
import math
from operator import itemgetter
import os
import random
import re

import numpy as np
import pandas as pd

%matplotlib inline

import little_mallet_wrapper as lmw

In [None]:
# Development helper: uncomment to re-import little_mallet_wrapper after editing
# it locally, without restarting the kernel. The `imp` module is deprecated and
# was removed in Python 3.12, so use importlib instead.
# import importlib
# importlib.reload(lmw)

<br><br>

# MALLET Path

Change the path below to the location of your local MALLET installation.

In [None]:
# Path to the local MALLET executable; it is handed to the lmw training and
# inference calls further down.
# CHANGE THIS TO YOUR MALLET PATH
MALLET_PATH = "/Users/mah343/Documents/packages/mallet-2.0.8/bin/mallet"

<br><br>

# Load poetry dataset

Data is available here: https://www.kaggle.com/johnhallman/complete-poetryfoundationorg-dataset

In [None]:
# Location of the Kaggle poetry CSV that is read into a DataFrame below.
# CHANGE THIS TO YOUR DATASET PATH
poetry_path = "/Volumes/Maria's Black Passport/data/kaggle-poem-dataset/kaggle_poem_dataset.csv"

In [None]:
# 'https://www.kaggle.com/johnhallman/complete-poetryfoundationorg-dataset/download'

In [None]:
# Read the poetry CSV into a DataFrame, then preview ten randomly drawn rows.
poetry_df = pd.read_csv(filepath_or_buffer=poetry_path)

poetry_df.sample(n=10)

In [None]:
# Total number of rows (poems) in the dataset.
print(poetry_df.shape[0])

In [None]:
# Number of poems whose Author column is exactly 'Robert Browning'.
print((poetry_df['Author'] == 'Robert Browning').sum())

In [None]:
# Number of poems whose Author column is exactly 'William Butler Yeats'.
print((poetry_df['Author'] == 'William Butler Yeats').sum())

In [None]:
# Number of poems whose Author column is exactly 'John Keats'.
print((poetry_df['Author'] == 'John Keats').sum())

In [None]:
# Number of poems whose Author column is exactly 'Emily Dickinson'.
print((poetry_df['Author'] == 'Emily Dickinson').sum())

In [None]:
# Number of poems whose Author column is exactly 'Christina Rossetti'.
print((poetry_df['Author'] == 'Christina Rossetti').sum())

In [None]:
# Pass every poem's text through lmw.process_string, preserving the DataFrame's
# row order so the documents stay aligned with the author labels.
training_documents = list(map(lmw.process_string, poetry_df['Content'].tolist()))

len(training_documents)

In [None]:
# Spot-check one processed document (index 100) to see what process_string produced.
training_documents[100]

In [None]:
# One author label per poem, in the same row order as the DataFrame; used as
# the category labels for the plots below.
authors = poetry_df.loc[:, 'Author'].tolist()

len(authors)

<br><br>

# Train topic model

Depending on the size of your dataset, training can take a while. For very large datasets, I recommend training outside of a notebook.

While training, you can observe progress by checking the terminal window from which you started Jupyter.

In [None]:
# Number of topics to train. This value is also embedded in the model,
# topic-keys and topic-distributions file names built in the next cell.
num_topics = 20

In [None]:
output_directory_path = '/Users/mah343/Desktop/lmw-output' # CHANGE THIS TO YOUR OUTPUT DIRECTORY

# Every path below (and the plot files saved later) lives inside this directory.
# Create it up front so a fresh run does not fail on a missing folder;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_directory_path, exist_ok=True)

# Files handed to lmw.train_topic_model / lmw.infer_topics. The model, topic-keys
# and topic-distributions names carry the topic count so that runs with
# different num_topics values do not overwrite each other.
training_data_path           = output_directory_path + '/training.txt'
formatted_training_data_path = output_directory_path + '/mallet.training'
model_path                   = output_directory_path + '/mallet.model.' + str(num_topics)
topic_keys_path              = output_directory_path + '/mallet.topic_keys.' + str(num_topics)
topic_distributions_path     = output_directory_path + '/mallet.topic_distributions.' + str(num_topics)

In [None]:
# Train the topic model on the processed poems using the MALLET executable at
# MALLET_PATH and the file paths defined above, with num_topics topics.
# NOTE(review): presumably this writes the training data, model, topic keys and
# topic distributions to the given paths — confirm against the lmw documentation.
lmw.train_topic_model(MALLET_PATH,
                      training_data_path,
                      formatted_training_data_path,
                      model_path,
                      topic_keys_path,
                      topic_distributions_path,
                      training_documents,
                      num_topics)

<br><br>

# Load the topics

In [None]:
# Load the topic keys from the same path that was passed to train_topic_model.
# Reusing topic_keys_path (instead of a hard-coded '.20' suffix) keeps this cell
# in sync with num_topics when the topic count is changed.
topics = lmw.load_topic_keys(topic_keys_path)

# Print each topic next to its index so topics can be referred to by number.
for topic_number, topic_key in enumerate(topics):
    print(topic_number, '\t', topic_key)

In [None]:
# Load the per-document topic distributions from the same path that was passed
# to train_topic_model. Reusing topic_distributions_path (instead of a
# hard-coded '.20' suffix) keeps this cell in sync with num_topics.
topic_distributions = lmw.load_topic_distributions(topic_distributions_path)

# (number of distributions loaded, length of the first distribution)
len(topic_distributions), len(topic_distributions[0])

In [None]:
# Spot-check the first loaded topic distribution.
topic_distributions[0]

<br><br>

# Plot topics by category

In [None]:
# Authors to compare in the heatmap.
target_labels = [
    'John Keats',
    'Emily Dickinson',
    'William Butler Yeats',
    'Christina Rossetti',
]

# Plot the selected authors against the topics and save the figure as a PDF
# in the output directory.
lmw.plot_categories_by_topics_heatmap(
    authors,
    topic_distributions,
    topics,
    f'{output_directory_path}/categories_by_topics.pdf',
    target_labels=target_labels,
    dim=(10, 4),
)

In [None]:
# Authors to compare in the boxplots.
target_labels = [
    'John Keats',
    'Emily Dickinson',
    'William Butler Yeats',
    'Christina Rossetti',
]

# One boxplot figure per topic, each saved as boxplot.<topic index>.pdf in the
# output directory.
for _topic_index in range(len(topics)):
    lmw.plot_categories_by_topic_boxplots(
        authors,
        topic_distributions,
        topics,
        _topic_index,
        output_path=f'{output_directory_path}/boxplot.{_topic_index}.pdf',
        target_labels=target_labels,
        dim=(4, 4),
    )

<br><br>

# Plot topics over time

In [None]:
# Call lmw.divide_training_data with num_chunks=10 and unpack its three results.
# NOTE(review): presumably each document is split into 10 sequential chunks,
# with document_ids naming the source document of each chunk and times giving
# the chunk's position — confirm against the lmw documentation.
divided_documents, document_ids, times = lmw.divide_training_data(training_documents,
                                                                  num_chunks=10)

# The three results are expected to have matching lengths.
len(divided_documents), len(document_ids), len(times)

In [None]:
# File paths for the chunked ("split") documents, kept separate from the
# original training files so that inference does not overwrite them.
new_training_data_path = f'{output_directory_path}/training.split.txt'
new_formatted_training_data_path = f'{output_directory_path}/mallet.split.training'
new_topic_distributions_path = f'{output_directory_path}/mallet.split.topic_distributions.{num_topics}'

In [None]:
# Run inference on the chunked documents with the model trained above; the
# resulting distributions are loaded from new_topic_distributions_path in the
# next cell.
# The commented-out last line is an alternative input that would drop empty
# chunks. NOTE(review): dropping chunks would leave `times` longer than the
# inferred distributions unless it is filtered the same way — verify before
# enabling it.
lmw.infer_topics(MALLET_PATH,
                 formatted_training_data_path,
                 model_path,
                 new_training_data_path,
                 new_formatted_training_data_path,
                 new_topic_distributions_path,
                 divided_documents)
#                  [d for d in divided_documents if len(d) > 0])

In [None]:
# Load the distributions inferred for the chunked documents.
# NOTE: this rebinds `topic_distributions`, which previously held the
# per-poem distributions used by the heatmap and boxplot cells. Re-running
# those cells after this one will pair chunk-level distributions with the
# per-poem `authors` list; reload the original distributions first.
topic_distributions = lmw.load_topic_distributions(new_topic_distributions_path)

# (number of distributions loaded, length of the distribution at index 2)
len(topic_distributions), len(topic_distributions[2])

In [None]:
# Sanity check: every inferred distribution should contain one entry per topic.
# The expected length is num_topics rather than a hard-coded 20, so the check
# stays correct when the topic count is changed. A mismatch is printed instead
# of asserted so that every distribution is inspected.
for _distribution in topic_distributions:
    if len(_distribution) != num_topics:
        print('no')

In [None]:
# Draw one topics-over-time plot per topic from the chunk-level distributions.
for _topic_index in range(len(topics)):
    lmw.plot_topics_over_time(
        topic_distributions,
        topics,
        times,
        _topic_index,
    )