In [1]:
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nltk.corpus import stopwords
from bokeh.sampledata.autompg import autompg
from bokeh.models import LinearColorMapper, ColorBar
from bokeh.palettes import Viridis256
from bokeh.plotting import figure, save
from bokeh.models import ColumnDataSource, LabelSet, HoverTool, mappers
from bokeh.io import output_notebook, show
from bokeh.transform import factor_cmap
from bokeh import palettes

import plotly
import plotly.graph_objs as go
import plotly.plotly as py

from utils_lotr import *


# Notebook-wide matplotlib setup: inline rendering, default figure size and style.
%matplotlib inline
# Default figure size as (width, height).
plt.rcParams['figure.figsize'] = 12, 8
# Use the 'ggplot' style sheet for all matplotlib figures created after this point.
plt.style.use('ggplot')

Using TensorFlow backend.


In [2]:
# --- Data acquisition ---------------------------------------------------
# Both flags are forwarded, together with base_url, to scrape_lotr().
# NOTE(review): `save` rebinds the name brought in by
# `from bokeh.plotting import figure, save`; from this cell on, `save` is
# this boolean rather than the bokeh function.
scrape = save = False
base_url = 'http://www.tk421.net/lotr/film/'

# --- Topic model / embedding settings -----------------------------------
n_topics = 15  # handed to transform_text_data()
n_top_words = 7
n_iter = 500
tsne_components, tsne_perplexity = 2, 20

# Selectors handed to transform_text_data() as keyword arguments.
vect_mode, transform_mode = 'Count', 'LDA'

threshold_confidence = True
threshold = 0.5

# Options handed to transform_text_data(); presumably forwarded to the
# text vectorizer it builds -- TODO confirm in utils_lotr.
cv_params = dict(
    stop_words='english',
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 3),
    analyzer='word',
)

# NLTK English stop words as a set; passed to remove_stops().
stops = {*stopwords.words('english')}

In [4]:
# Build the cleaned transcript frame, then derive the interaction table and
# the topic representation from it. All heavy lifting is done by helpers
# imported from utils_lotr.
script_subsets = scrape_lotr(base_url, scrape, save)
df = clean_transcript(script_subsets)
df.drop_duplicates(subset=['text'], inplace=True)  # keep one row per distinct text

df = remove_stops(lowercase(df), stops)

# Each cell of these two columns is an iterable of strings at this point;
# collapse it into a single space-separated string.
df['text'] = df['text'].apply(' '.join)
df['character'] = df['character'].apply(' '.join)

df_interactions, interact_matrix = get_interactions_df(df)

X_topics, reducer, cv = transform_text_data(
    df,
    n_topics,
    cv_params,
    vect_mode=vect_mode,
    transform_mode=transform_mode,
)

LoTR transcripts scraped, time it took: 0.289
Interactions computed, time it took: 13.856



n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21





LDA transformation done, time it took: 7.132


In [5]:
# Split the pair key in 'characters' ("<first>_<second>") into its two members.
# The vectorised .str accessors avoid a Python-level call per row and also
# work on an empty frame.
pair_parts = df_interactions['characters'].str.split('_')
df_interactions['character1'] = pair_parts.str[0]
df_interactions['character2'] = pair_parts.str[1]
df_interactions.reset_index(inplace=True, drop=True)


characters = df.character.unique()

# Totals per character, once counted as the first member of a pair and once
# as the second. numeric_only=True keeps the string columns ('characters',
# 'character1'/'character2') out of the sum explicitly instead of relying on
# pandas silently dropping them.
df_inter_group = df_interactions.groupby('character1').sum(numeric_only=True)
df_inter_group['num_interactions'] = df_inter_group['num_interactions'].fillna(0)
df_inter_group['character'] = df_inter_group.index

df_inter_group2 = df_interactions.groupby('character2').sum(numeric_only=True)
df_inter_group2['num_interactions'] = df_inter_group2['num_interactions'].fillna(0)
df_inter_group2['character'] = df_inter_group2.index

# Outer merge so characters that only ever appear on one side are kept.
df_inter_group_full = df_inter_group.merge(df_inter_group2, on='character', how='outer')

# A character present on only one side has NaN on the other after the outer
# merge; treat that as zero so the total is the plain sum of both sides.
df_inter_group_full['num_interactions'] = (
    df_inter_group_full['num_interactions_x'].fillna(0)
    + df_inter_group_full['num_interactions_y'].fillna(0)
)
df_inter_group_full = df_inter_group_full[df_inter_group_full.character != 'narrator']

# Rows in which the first member of the pair is the narrator.
df_inter_narrator = df_interactions[df_interactions.character1 == 'narrator']

In [14]:
# Scatter plot of total interactions per character, restricted to characters
# with at least 20 interactions and ordered from highest to lowest.
df_inter_highest_group = (
    df_inter_group_full[df_inter_group_full.num_interactions >= 20]
    .sort_values('num_interactions', ascending=False)
    .reset_index(drop=True)
)


N = df_inter_highest_group.character.nunique()
N2 = df_inter_highest_group.shape[0]

# One HSL colour per distinct character, hues evenly spaced between 0 and 60.
c = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 60, N)]

colors_df = pd.DataFrame({
    'color': c,
    'character': df_inter_highest_group.character.unique(),
})

df_inter_highest_group = df_inter_highest_group.merge(colors_df, how='inner', on='character')


# One single-point trace per row so every point carries its own colour.
# x and y are array-valued properties of go.Scatter, so each scalar is
# wrapped in a one-element list.
l = []
for name, total, colour in zip(df_inter_highest_group.character,
                               df_inter_highest_group.num_interactions,
                               df_inter_highest_group.color):
    trace = go.Scatter(
        x=[name],
        y=[total],
        mode='markers',
        marker=dict(size=10,
                    color=colour,
                    opacity=0.95,
                    ))
    l.append(trace)


layout = go.Layout(
    title='Sum of mentions',
    hovermode='closest',
    xaxis=dict(
        title='Character',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
        tickangle=270,
        titlefont=dict(
            size=16)
    ),
    yaxis=dict(
        title='Number of mentions',
        ticklen=5,
        gridwidth=2,
        titlefont=dict(
            size=16)
    ),
    showlegend=False,
    autosize=False,
    width=1000,
    height=800,
    # Large bottom margin leaves room for the vertical tick labels.
    # A plain dict is used instead of the deprecated go.Margin class.
    margin=dict(
        l=70,
        r=50,
        b=250,
        t=50,
        pad=4
    ),
)


fig = go.Figure(data=l, layout=layout)
py.iplot(fig, filename='SumMentions')

In [None]:
# Scatter plot of narrator pairs with at least 20 interactions, ordered from
# highest to lowest; the x axis shows the second member of each pair.
df_inter_narrator_highest = (
    df_inter_narrator[df_inter_narrator.num_interactions >= 20]
    .sort_values('num_interactions', ascending=False)
    .reset_index(drop=True)
)


N = df_inter_narrator_highest.characters.nunique()
N2 = df_inter_narrator_highest.shape[0]

# One HSL colour per distinct pair key, hues evenly spaced between 0 and 240.
c = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 240, N)]

colors_df = pd.DataFrame({
    'color': c,
    'characters': df_inter_narrator_highest.characters.unique(),
})

df_inter_narrator_highest = df_inter_narrator_highest.merge(colors_df, how='inner', on='characters')


# One single-point trace per row so every point carries its own colour and
# hover text. x and y are array-valued properties of go.Scatter, so each
# scalar is wrapped in a one-element list.
l = []
for pair, mentioned, total, colour in zip(df_inter_narrator_highest.characters,
                                          df_inter_narrator_highest.character2,
                                          df_inter_narrator_highest.num_interactions,
                                          df_inter_narrator_highest.color):
    trace = go.Scatter(
        x=[mentioned],
        y=[total],
        mode='markers',
        marker=dict(size=10,
                    color=colour,
                    opacity=0.95,
                    ),
        # Hover text names the second member of the "<first>_<second>" key.
        text='Narrator mentions: {}'.format(pair.split('_')[1])
    )
    l.append(trace)


layout = go.Layout(
    title='Mentions of a character by narrator',
    hovermode='closest',
    xaxis=dict(
        title='Character',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
        tickangle=270,
    ),
    yaxis=dict(
        title='Number of mentions',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False,
    autosize=False,
    width=1000,
    height=800,
    # Large bottom margin leaves room for the vertical tick labels.
    # A plain dict is used instead of the deprecated go.Margin class.
    margin=dict(
        l=70,
        r=50,
        b=250,
        t=50,
        pad=4
    ),
)


fig = go.Figure(data=l, layout=layout)
py.iplot(fig, filename='NarratorMentions')

In [None]:
# Scatter plot of character pairs with at least 20 interactions, ordered from
# highest to lowest; the x axis shows the raw pair key.
df_inter_highest = (
    df_interactions[df_interactions.num_interactions >= 20]
    .sort_values('num_interactions', ascending=False)
    .reset_index(drop=True)
)


N = df_inter_highest.characters.nunique()
N2 = df_inter_highest.shape[0]

# One HSL colour per distinct pair key, hues evenly spaced between 0 and 240.
c = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 240, N)]

colors_df = pd.DataFrame({
    'color': c,
    'characters': df_inter_highest.characters.unique(),
})

df_inter_highest = df_inter_highest.merge(
    colors_df, how='inner', on='characters')


# One single-point trace per row so every point carries its own colour and
# hover text. x and y are array-valued properties of go.Scatter, so each
# scalar is wrapped in a one-element list.
l = []
for pair, total, colour in zip(df_inter_highest.characters,
                               df_inter_highest.num_interactions,
                               df_inter_highest.color):
    trace = go.Scatter(
        x=[pair],
        y=[total],
        mode='markers',
        marker=dict(size=10,
                    color=colour,
                    opacity=0.95,
                    ),
        # Hover text: first two members of the "<first>_<second>" key.
        text=' & '.join(pair.split('_')[:2])
    )
    l.append(trace)


layout = go.Layout(
    title='Interactions between characters',
    hovermode='closest',
    xaxis=dict(
        title='Character Pair',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
        tickangle=270,
    ),
    yaxis=dict(
        title='Number of mentions',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False,
    autosize=False,
    width=1000,
    height=800,
    # Large bottom margin leaves room for the vertical tick labels.
    # A plain dict is used instead of the deprecated go.Margin class.
    margin=dict(
        l=70,
        r=50,
        b=250,
        t=50,
        pad=4
    ),
)


fig = go.Figure(data=l, layout=layout)
py.iplot(fig, filename='CharacterMentions')