In [1]:
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nltk.corpus import stopwords
from bokeh.sampledata.autompg import autompg
from bokeh.models import LinearColorMapper, ColorBar
from bokeh.palettes import Viridis256
from bokeh.plotting import figure, save
from bokeh.models import ColumnDataSource, LabelSet, HoverTool, mappers
from bokeh.io import output_notebook, show
from bokeh.transform import factor_cmap
from bokeh import palettes
from utils_lotr import *


%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 8
plt.style.use('ggplot')

Using TensorFlow backend.


In [2]:
# --- Data acquisition ------------------------------------------------------
# `base_url`, `scrape` and `save` are handed to scrape_lotr() in the next cell.
base_url = 'http://www.tk421.net/lotr/film/'
scrape = False
# NOTE(review): this flag rebinds the name `save`, which the import cell also
# binds to bokeh.plotting.save; after this cell runs `save` is the boolean.
save = False

# --- Topic model -----------------------------------------------------------
n_topics = 20        # passed to transform_text_data()
n_top_words = 7      # number of words kept per topic in the topic summaries
n_iter = 500         # NOTE(review): not referenced by any visible cell
vect_mode = 'Count'
transform_mode = 'LDA'

# --- Confidence filter -----------------------------------------------------
# When enabled, only rows of X_topics whose largest value exceeds `threshold`
# are kept before running t-SNE.
threshold_confidence = True
threshold = 0.5

# --- t-SNE -----------------------------------------------------------------
tsne_components = 2
tsne_perplexity = 20

# Keyword arguments forwarded to transform_text_data() as `cv_params`.
cv_params = dict(
    stop_words='english',
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 3),
    analyzer='word',
)

# English stop-word set from NLTK, passed to remove_stops().
stops = set(stopwords.words('english'))

In [3]:
# Load the film transcripts (scraping/saving is controlled by the config
# flags) and turn them into a DataFrame.
script_subsets = scrape_lotr(base_url, scrape, save)
df = clean_transcript(script_subsets)

# Drop rows with repeated `text`. Reassigning instead of `inplace=True` keeps
# the step explicit and avoids mutating the object returned by the helper.
df = df.drop_duplicates(subset=['text'])

df = lowercase(df)
df = remove_stops(df, stops)

# Each `text` / `character` cell is joined with single spaces, so at this point
# both columns are expected to hold sequences of strings.
df['text'] = df.text.apply(' '.join)
df['character'] = df.character.apply(' '.join)

df_interactions, interact_matrix = get_interactions_df(df)

# Vectorise the text and reduce it to topic space. `reducer` and `cv` are used
# later for the per-topic word weights and the vocabulary.
X_topics, reducer, cv = transform_text_data(
    df, n_topics, cv_params,
    vect_mode=vect_mode,
    transform_mode=transform_mode,
)

LoTR transcripts scraped, time it took: 0.541
Interactions computed, time it took: 14.026




LDA transformation done, time it took: 7.406


In [4]:
# Preview only the first rows rather than dumping the whole frame.
df_interactions.head(10)

Unnamed: 0,characters,num_interactions
galadriel_narrator,galadriel_narrator,26.0
galadriel_gollum,galadriel_gollum,2.0
galadriel_gandalf,galadriel_gandalf,2.0
galadriel_frodo,galadriel_frodo,8.0
galadriel_sam,galadriel_sam,2.0
galadriel_hobbit,galadriel_hobbit,2.0
galadriel_hobbits,galadriel_hobbits,1.0
galadriel_saruman,galadriel_saruman,1.0
galadriel_man,galadriel_man,3.0
galadriel_ring,galadriel_ring,19.0


In [5]:
# Build a boolean row mask over X_topics. It is defined in BOTH branches
# because a later cell uses `idx_max` to select the matching rows of `df`;
# previously it only existed when thresholding was enabled, so disabling the
# threshold caused a NameError downstream.
#
# NOTE: this cell overwrites X_topics, so re-run the cell that creates
# X_topics before re-running this one.
if threshold_confidence:
    # Keep rows whose largest topic value exceeds the configured threshold.
    idx_max = np.amax(X_topics, axis=1) > threshold
else:
    # No filtering: every row is kept.
    idx_max = np.ones(X_topics.shape[0], dtype=bool)
X_topics = X_topics[idx_max]


# Project the retained topic vectors to `tsne_components` dimensions.
# NOTE(review): TSNE is not imported explicitly in the visible import cell;
# presumably it arrives through `from utils_lotr import *` — confirm.
tsne_model = TSNE(n_components=tsne_components, verbose=1, random_state=1337,
                  perplexity=tsne_perplexity, angle=.2, init='pca')
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 2143 samples in 0.004s...
[t-SNE] Computed neighbors for 2143 samples in 0.252s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2143
[t-SNE] Computed conditional probabilities for sample 2000 / 2143
[t-SNE] Computed conditional probabilities for sample 2143 / 2143
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.106915
[t-SNE] Error after 1000 iterations: 0.210524


In [6]:
# 20-colour palette kept as a NumPy array of hex strings.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"])


# Dominant topic (column index of the row maximum) for every retained row.
# Kept as a list because a later cell calls list methods on `lda_keys`.
lda_keys = list(X_topics.argmax(axis=1))

# Per-topic word weights and the vectoriser vocabulary.
topic_word = reducer.components_
# `get_feature_names()` was removed in newer scikit-learn releases in favour
# of `get_feature_names_out()`; prefer the new name, fall back to the old one.
if hasattr(cv, 'get_feature_names_out'):
    vocab = list(cv.get_feature_names_out())
else:
    vocab = cv.get_feature_names()
vocab_arr = np.array(vocab)  # hoisted: no need to rebuild it for every topic

# For each topic, join its `n_top_words` highest-weighted words (heaviest
# first) into a single summary string.
topic_summaries = [
    ' '.join(vocab_arr[np.argsort(topic_dist)][:-(n_top_words + 1):-1])
    for topic_dist in topic_word
]
print('Topic summaries:\n', topic_summaries)


# Rows of `df` that correspond to the rows kept in X_topics. `idx_max` is only
# applied when thresholding is enabled, so this cell does not depend on the
# mask existing when the threshold is switched off.
content = df[idx_max].text if threshold_confidence else df.text

# One row per retained document: its text, dominant topic and 2-D t-SNE
# coordinates. Building the frame in one call makes a length mismatch between
# the inputs fail immediately.
dfb = pd.DataFrame(
    {
        'content': content.values.tolist(),
        'topic_key': np.array(lda_keys),
        'X_tsne': tsne_lda[:, 0],
        'Y_tsne': tsne_lda[:, 1],
    },
    columns=['content', 'topic_key', 'X_tsne', 'Y_tsne'],
)

Topic summaries:
 ['look eyes oh right night stand keep', 'army elrond move dwarf attack mountain galadriel', 'hobbits see dead nothing little tree would', 'aragorn gandalf pippin frodo merry looks back', 'begins something climb uruks grond elvish strider', 'away come runs gandalf horse good going', 'turns faramir face takes give alone trees', 'frodo gollum gimli walks way ground falls', 'yes sméagol three march feet given work', 'frodo ring go time long come lord', 'around two appears black riders river small', 'run war death hobbit took behind peace', 'take precious help lost mine enter wish', 'must city gate leave door gandalf staff', 'legolas men us orc hand one fight', 'know saruman tirith last place arrows got', 'denethor light arms osgiliath seen made another', 'king théoden ride side say gríma holds', 'sam orcs frodo deep elves well mordor', 'get let minas think day gondor battle']


In [28]:
# Data source and colour mapping: each point is coloured by its dominant topic.
source = ColumnDataSource(dfb)
color_mapper = mappers.LinearColorMapper(
    palette=palettes.Category20_20, low=dfb.topic_key.min(), high=dfb.topic_key.max())


# "save" replaces the legacy "previewsave" tool alias.
p = figure(plot_width=1200, plot_height=1000,
           title='t-SNE Lord of the Rings Topics',
           x_axis_label='X-coord', y_axis_label='Y-coord',
           tools="pan,wheel_zoom,box_zoom,reset,hover,save", toolbar_location='above',
           min_border=1)
p.scatter(x='X_tsne', y='Y_tsne', color={'field': 'topic_key',
                                        'transform': color_mapper}, size=7, alpha=0.5, source=source)
p.title.text_font_size = '15pt'
# Hide the tick labels on both axes.
p.xaxis.major_label_text_font_size = '0pt'
p.yaxis.major_label_text_font_size = '0pt'


# Anchor each topic label at the t-SNE position of the FIRST document assigned
# to that topic. np.unique(return_index=True) yields those first occurrences in
# a single pass, replacing the previous loop that called list.index() for every
# document. Topics with no document keep NaN coordinates.
topic_coord = np.full((X_topics.shape[1], 2), np.nan)
present_topics, first_rows = np.unique(
    np.asarray(lda_keys, dtype=int), return_index=True)
topic_coord[present_topics] = tsne_lda[first_rows]

# Only label topics that actually have a position; drawing a label at NaN
# coordinates would show nothing anyway.
for topic_num in present_topics:
    p.text(x=[topic_coord[topic_num, 0]], y=[topic_coord[topic_num, 1]],
           text=[topic_summaries[topic_num]],
           text_font_size='11pt', text_align='center',
           text_baseline='middle')


# One tooltip row per field instead of both values packed into a single row.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Sentence", "@content"),
    ("Topic number", "@topic_key"),
]
# NOTE(review): output_notebook is imported but never called in the visible
# cells; without it show() may open the plot outside the notebook — confirm.
show(p)

In [None]:
# Write the figure to a standalone HTML file named after the run parameters.
# The import is aliased because the config cell uses the global name `save`
# as a boolean flag for scrape_lotr(); importing bokeh's function under its
# own name would rebind that flag to a (truthy) function for any later re-run.
from bokeh.plotting import save as save_plot
save_plot(p, 'tSNE_topics{}_perplexity{}.html'.format(n_topics, tsne_perplexity))