In [1]:
import os
import gensim
import warnings
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import MDS, TSNE
from scipy.spatial.distance import pdist, squareform
import pyLDAvis.gensim
from bokeh.models import ColumnDataSource, OpenURL, TapTool, HoverTool, CustomJS, Title
from bokeh.models.widgets import Slider
from bokeh.plotting import figure, output_file, output_notebook, save, show, reset_output
from bokeh.layouts import widgetbox, column
output_notebook()

In [2]:
# Restore the saved LDA model together with the dictionary and corpus stored next to it.
corpus = gensim.corpora.MmCorpus('output/ldacorpus')
dictionary = gensim.corpora.Dictionary.load('output/ldadict')
ldamodel = gensim.models.LdaModel.load('output/ldasaved')

# Get the Model


In [3]:
# Load the pickled topic-model bundle; later cells read the keys 'df', 'texts' and 'd_t_df' from it.
# NOTE(review): pickle.load executes arbitrary code from the file - only use a trusted 'output/topic_model.p'.
with open('output/topic_model.p', 'rb') as pickle_file:
    topic_model = pickle.load(pickle_file)

In [4]:
# Take the number of topics from the model itself. show_topics() without arguments
# returns at most 10 topics, so counting its result undercounts larger models.
ntopics = ldamodel.num_topics
ldamodel.show_topics(ntopics, formatted=False)

[(0,
  [('qû[unit]n', 0.05338266),
   ('karānu[vine]n', 0.03162676),
   ('šikaru[beer]n', 0.03048134),
   ('šamnu[oil]n', 0.025941681),
   ('kusāpu[bread]n', 0.025353108),
   ('zamāru[sing]v', 0.021011205),
   ('dišpu[honey]n', 0.018200742),
   ('naqû[pour-(a-libation)]v', 0.01790541),
   ('immeru[sheep]n', 0.017024098),
   ('kaptukkû[two-sūtu-container]n', 0.013798218)]),
 (1,
  [('rabû[big]aj', 0.027726343),
   ('abu[father]n', 0.021949498),
   ('ilu[god]n', 0.021935947),
   ('ahu[brother]n', 0.020562299),
   ('adû[(treaty-)oath]n', 0.015737668),
   ('ridûtu[appropriation]n', 0.015153948),
   ('šarrūtu[kingship]n', 0.012755802),
   ('ṣabātu[seize]v', 0.012666815),
   ('ṭuppu[tablet]n', 0.012434674),
   ('antalû[eclipse]n', 0.0123059)]),
 (2,
  [('šību[witness]n', 0.15905823),
   ('manû[unit]n', 0.046707984),
   ('ṣarpu[silver]n', 0.03615639),
   ('eqlu[field]n', 0.03523169),
   ('dīnu[legal-decision]n', 0.026832364),
   ('dabābu[speak]v', 0.025840916),
   ('imēru[unit]n', 0.022776565

# pyLDAvis
Use pyLDAvis to visualize the topic model. By default, pyLDAvis orders the topics by [prevalence](https://github.com/bmabey/pyLDAvis/issues/59) (topic 1 is the most prevalent topic). That means that the topic numbers in the visualization do not agree with the topic numbers in the LDA model. To prevent this behaviour one may use `sort_topics=False` in the `prepare` command, as is done in the code below. The advantage of ordering the topics by prevalence, however, is that new instances of the LDA model are more comparable (that is, the same topic will receive the same number). Note that pyLDAvis is a port of the R package LDAvis (which renders the visualization with JavaScript); because R counts from 1, the numbering in the visualization begins with 1 (not with 0). The topic numbers in the Document/Topic and Topic/Term matrices below will be adjusted to be compatible with the pyLDAvis visualization.

PyLDAvis needs a large output box. The `%%html` lines below create such a box (for the code see [here](http://stackoverflow.com/questions/18770504/resize-ipython-notebook-output-window)). 

In [5]:
%%html
<style>
/* Let the notebook output area grow with its content, up to max-height. */
.output_wrapper, .output {
    height:auto !important;
    max-height:1000px;  /* your desired max-height here */
}
/* Remove the inset shadow drawn around scrolling output areas. */
.output_scroll {
    box-shadow:none !important;
    -webkit-box-shadow:none !important;  /* vendor prefix needs the leading hyphen */
}
</style>


In [6]:
# Ignore FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)
pyLDAvis.enable_notebook()
# sort_topics=False is passed so that pyLDAvis does not reorder the topics by prevalence.
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
# exist_ok=True creates the output directory only when it is missing,
# without a separate (race-prone) existence check.
os.makedirs('vis', exist_ok=True)
pyLDAvis.save_html(vis, 'vis/lda_terms.html')
pyLDAvis.display(vis)

# Visualize the Documents 1: Using MDS
While pyLDAvis is an excellent tool for exploring the topic/term aspect of a topic model (the words and their probabilities in each topic) it does not provide access to the document/topic aspect (the probability distribution of topics in each document). The visualization below plots all the documents according to their (cosine) distances (using Multi-Dimensional Scaling) in the Document/Term DataFrame. Each document (data point in the visualization) is colored according to the most prevalent topic and the size of the dot represents the probability of the most prevalent topic in that document.

Compute the distances between each of the documents. Use either the Document/Topic Dataframe or the Document/Term Dataframe (constructed below) to measure distance.

Since the data is already in list format, CountVectorizer does not need to preprocess or tokenize. The only way to prevent CountVectorizer from doing so is by creating dummy functions for the preprocessor and the tokenizer. These functions simply return the argument they receive.

In [7]:
df = topic_model['df']
texts = topic_model['texts']
# The preprocessor and tokenizer are identity functions, so CountVectorizer
# counts the items of each text exactly as they are given.
cv = CountVectorizer(analyzer='word', preprocessor=lambda x: x, tokenizer=lambda x: x)
dtm = cv.fit_transform(texts)
# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer get_feature_names_out() and fall back to the old name on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Document/Term DataFrame: one row per document (indexed like df), one column per term.
dtm_df = pd.DataFrame(dtm.toarray(), columns=feature_names, index=df.index.values)
dtm_df.head()

Unnamed: 0,aban-bāšti[(a-stone)]n,aban-lamassi[(a-precious-stone)]n,aban-râmi['love'-stone]n,abati[(meaning-unknown)]n,abašmû[(a-stone)]n,abbušu[(meaning-unknown)]n,abbūtu[fatherhood]n,abiktu[defeat]n,abku[captive]n,ablu[brought]aj,...,ṭēmūtu[of-order]n,ṭīdu[clay]n,ṭīmu[yarn]n,ṭīpu[addition]n,ṭīru[impression]n,ṭūbtu[peace]n,ṭūbu[goodness]n,ṭūbātu[happiness]n,ṭūdu[way]n,ṭūru[opopanax]n
P224378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P224382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P224383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P224386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P224388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Pairwise cosine distances between the rows of dtm_df, expanded from the
# condensed vector returned by pdist into a square matrix.
cosine_condensed = pdist(dtm_df, metric='cosine')
dist = squareform(cosine_condensed)

Compute the position of each document using Multi-Dimensional Scaling. The variable `pos` holds the `x` and `y`  coordinates. Execution of the following cell may take several minutes.

In [9]:
# Fixed seed so that the MDS layout is reproducible.
seed = 15
mds = MDS(
    n_components=2,
    max_iter=3000,
    random_state=seed,
    dissimilarity="precomputed",  # `dist` already holds pairwise distances
    n_jobs=1,
)
pos = mds.fit_transform(dist)

Create lists of x and y values (coordinates).

In [10]:
# First column of `pos` holds the x coordinates, second column the y coordinates.
mds_x = list(pos[:, 0])
mds_y = list(pos[:, 1])

Create lists of the most prevalent topic, the probability of the most prevalent topic, and the text name for each document. These lists are used in the tooltips of the Bokeh visualization.

In [11]:
d_t_df = topic_model['d_t_df']
# Text names, one per row of df.
designation = list(df['designation'])
# Per row of d_t_df: the highest value and the column label where it occurs.
probability = d_t_df.max(axis=1)
prevalent_topic = d_t_df.idxmax(axis=1)

In [12]:
# The three columns must have the same length to be plotted together.
tuple(len(column) for column in (prevalent_topic, probability, designation))

(2997, 2997, 2997)

# Define Colors

Create a colormap. 

In [13]:
# Color names in topic order; the position in the list is the key in `colormap`.
_topic_colors = [
    'grey', "orange", "olive", "firebrick", "gold", "red", "fuchsia",
    "green", "blue", "purple", "aqua", "yellow", "indigo", "blueviolet",
    "beige", "navy", 'chocolate', 'azure', 'coral', 'crimson', 'darkblue',
    'darkkhaki', 'darkseagreen', 'darkturquoise', 'deeppink', 'black',
]
colormap = dict(enumerate(_topic_colors))

Create a dictionary, to be used by Bokeh for drawing the visualization. In the dictionary each key is a feature and each value is a list with the values of that feature for each data point. All lists (all values) should be of equal length (the number of data points). The features include x and y coordinates, color, size, etc. 

In [14]:
# Column data for the Bokeh plot: every value has one entry per document.
d_mds = {
    'x': mds_x,
    'y': mds_y,
    'id_text': list(df.id_text),
    # Scale the marker size so that the largest probability maps to 15.
    'size': probability / max(probability) * 15,
    'probability': probability,
    'topic': prevalent_topic,
    'color': [colormap[topic_number] for topic_number in prevalent_topic],
    'alpha': [0.5 for _ in mds_x],
    'designation': designation,
}

# Usage notes; drawviz adds each line as a title below the plot.
instructions = [
    "Highlight (color) one or two topics by moving the sliders. If both sliders are 0, all topics are colored.",
    "Hover over a data point for more information. Click on a data point to go to the document edition.",
    "Use the toolbar to zoom, pan, reset, or save as .png.",
]

# JavaScript Code
Interactive features in Bokeh, such as sliders, use a callback function that is activated when a certain event takes place. This event can be a mouse movement, a click, or a change in the slider. Custom callback functions need to be written in JavaScript.



In [15]:
# JavaScript body for the slider callback. It expects `source`, `topic1`,
# `topic2` and `cm` to be supplied through the CustomJS `args`.
# All variables are declared with `var`: assigning to undeclared names creates
# implicit globals and raises a ReferenceError when the code runs in strict mode.
# Alpha values are written as numbers (not strings) so the column keeps one type.
code = """
        var data = source.data;
        var topic = data['topic'];
        // Both sliders at 0 means: color every topic.
        var showAll = (topic1.value == 0) && (topic2.value == 0);
        for (var i = 0; i < topic.length; i++) {
            if (showAll || (topic[i] == topic1.value) || (topic[i] == topic2.value)) {
                data.alpha[i] = 0.5;
                data.color[i] = cm[topic[i]];
            } else {
                data.color[i] = 'grey';
                data.alpha[i] = 0.1;
            }
        }
        source.change.emit();
        """

Draw the visualization. The visualization provides various tools for further exploration:
- tooltips (provides topic, probability, text name and URL)
- box zoom
- wheel zoom
- pan
- reset
- link to document edition
- save the visualization

In addition, the visualization has two sliders that allow the user to select two topics.

In [16]:
def drawviz(data, title, outputfile):
    """Draw an interactive document scatter plot, show it and save it as HTML.

    Parameters
    ----------
    data : dict
        Column data for the plot. The glyph, the tooltips and the slider
        callback read the keys 'x', 'y', 'color', 'alpha', 'size',
        'id_text', 'topic', 'probability' and 'designation'.
    title : str
        Title of the figure; also used as the title of the saved HTML page.
    outputfile : str
        Path of the standalone HTML file that is written.

    Notes
    -----
    Relies on the module-level names `ntopics`, `colormap`, `code` and
    `instructions`.

    The file is written with an explicit filename instead of `output_file()`.
    `output_file()` changes Bokeh's global output state, so a later `show()`
    would also write into (and overwrite) the file of the previous plot.
    """
    # Local import: only needed for the standalone HTML export.
    from bokeh.resources import CDN

    source_mds = ColumnDataSource(data=data)
    p = figure(
        plot_width=1000, plot_height=1000,
        tools="tap,pan,wheel_zoom,box_zoom,reset,save",
        title=title)
    p.add_tools(HoverTool(
        tooltips=[
            ("url", "http://oracc.org/" + "@id_text"),
            ("topic, probability", "@topic, @probability"),
            ("designation", "@designation")
        ]
        ))

    p.circle('x',
             'y',
             color='color',
             fill_alpha='alpha',
             size='size',
             source=source_mds
             )
    p.axis.visible = False

    # Slider value 0 means "no selection"; the callback colors all topics
    # when both sliders are 0.
    slider1 = Slider(start=0, end=ntopics, value=0, step=1, title="Topic A")
    slider2 = Slider(start=0, end=ntopics, value=0, step=1, title="Topic B")

    # Both sliders share one callback that recolors the points in the data source.
    callback = CustomJS(args=dict(source=source_mds, topic1=slider1,
                                  topic2=slider2, cm=colormap), code=code)
    slider1.js_on_change('value', callback)
    slider2.js_on_change('value', callback)

    # Clicking a data point opens the URL built from that point's id_text.
    url = "http://oracc.museum.upenn.edu/@id_text"
    taptool = p.select(type=TapTool)
    taptool.callback = OpenURL(url=url)

    for line in instructions:
        p.add_layout(Title(text=line), 'below')

    layout = column(slider1, slider2, p)
    show(layout)
    # Write the standalone HTML file without touching the global output state.
    save(layout, filename=outputfile, resources=CDN, title=title)

In [18]:
# Start from a clean Bokeh output state so the plot is shown in the notebook only.
reset_output()
output_notebook()
title = "Projection with MDS. Size of the circle represents prevalence of the topic."
outputfile = 'vis/mds1.html'
# Pass the variable instead of repeating the path literal.
drawviz(d_mds, title, outputfile)

## Alternative: plotting based on Document/Topic table
The following visualization uses the same approach, but takes the document/topic table as the basis for distance measurements. Documents that share approximately the same distribution of topics will be plotted in the same region. Since the sum of each row in the document/topic table is 1, the distance matrix is computed with Euclidean distance (not cosine).

In [19]:
# Pairwise distances between the rows of d_t_df (pdist's default metric),
# expanded into a square matrix.
dt_condensed = pdist(d_t_df)
dist_dt = squareform(dt_condensed)

In [20]:
# Same MDS settings as before, now fitted on the document/topic distances.
mds = MDS(
    n_components=2,
    max_iter=3000,
    random_state=seed,
    dissimilarity="precomputed",
    n_jobs=1,
)
pos = mds.fit_transform(dist_dt)

In [21]:
# Reuse every column of d_mds; only the x and y coordinates are replaced.
d_mds2 = {
    **d_mds,
    'x': list(pos[:, 0]),
    'y': list(pos[:, 1]),
}

In [22]:
# Reset Bokeh's output state, then direct output to the notebook again.
reset_output()
output_notebook()
outputfile = 'vis/mds2.html'
title = ("Projection with MDS, based on Document/Topic distribution. "
         "Size of the circle represents prevalence of the topic.")
drawviz(d_mds2, title, outputfile)

# Visualize the Documents 2: Using TSNE

# TSNE based on Document/Term Matrix (Cosine distance)

Cosine distances have been computed earlier; the matrix is stored in the variable `dist`.

In [23]:
X = dist
# `dist` is a precomputed distance matrix. A PCA initialization cannot be
# combined with metric="precomputed", and newer scikit-learn releases use PCA
# by default, so the random initialization is requested explicitly.
tsne = TSNE(n_components=2, random_state=0, metric="precomputed", init="random")
X_tsne = tsne.fit_transform(X)

In [24]:
# Reuse every column of d_mds; only the x and y coordinates are replaced.
d_tsne = {
    **d_mds,
    'x': list(X_tsne[:, 0]),
    'y': list(X_tsne[:, 1]),
}

In [25]:
# Clear any output file registered by a previous plot, so that show() does not
# also write this plot into that earlier HTML file.
reset_output()
output_notebook()
title = "Projection with tSNE. Size of the circle represents prevalence of the topic."
outputfile = 'vis/tsne1.html'
drawviz(d_tsne, title, outputfile)

# TSNE based on Document/Topic Matrix

In [26]:
X = dist_dt
# `dist_dt` is a precomputed distance matrix. A PCA initialization cannot be
# combined with metric="precomputed", and newer scikit-learn releases use PCA
# by default, so the random initialization is requested explicitly.
tsne = TSNE(n_components=2, random_state=0, metric="precomputed", init="random")
X_tsne = tsne.fit_transform(X)

In [27]:
# Reuse every column of d_mds; only the x and y coordinates are replaced.
d_tsne2 = {
    **d_mds,
    'x': list(X_tsne[:, 0]),
    'y': list(X_tsne[:, 1]),
}

In [28]:
# Clear any output file registered by a previous plot, so that show() does not
# also write this plot into that earlier HTML file.
reset_output()
output_notebook()
title = "Projection with tSNE, based on Document/Topic distribution. Size of the circle represents prevalence of the topic."
outputfile = 'vis/tsne2.html'
drawviz(d_tsne2, title, outputfile)