In [1]:
import gensim
import numpy as np
from ipywidgets import interact, fixed
import ipywidgets as widgets
from IPython.display import display, clear_output
import pickle
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, LabelSet, Legend, LegendItem, Range1d, OpenURL, TapTool
from bokeh.layouts import column
from bokeh.palettes import all_palettes
from bokeh.palettes import Category20
output_notebook()
# note: more recent bokeh versions require notebook 5 or JupyterLab

# Load Vectors in Gensim
The Gensim package allows one to compute word vectors with word2vec. Instead, we will load the vectors we computed with PMI/SVD (6.2) as keyed vectors. The keyed vector format that we used to save the vectors in 6.2 allows us to do exactly that. 

For the code see https://stackoverflow.com/questions/27139908/load-precomputed-vectors-gensim

In [2]:
# Word vectors computed with PMI/SVD (6.2) and saved in word2vec text format.
model_file = "output/vec_file.txt"
# Load through the public KeyedVectors class, which is available in both
# Gensim 3 and Gensim 4; `Word2VecKeyedVectors` only survives as a legacy alias.
model = gensim.models.KeyedVectors.load_word2vec_format(model_file)

# Most Similar
The advantage of loading the vectors into Gensim is that we may now use the functions and methods available in Gensim, for instance the method `most_similar()`, which will find the vectors with the highest cosine similarity to the target word(s).

In [3]:
def mostsimilar(change):
    """Show the words most similar to the currently selected target word.

    Observer callback for the `dropdown` and `slider` widgets. The current
    values are read from the widgets themselves rather than from `change`,
    so the same function can be registered on both of them.

    Parameters
    ----------
    change
        Change notification passed in by `observe`; not used.

    Returns
    -------
    None
        The result table is rendered inside the `out` output widget.
    """
    lemm = dropdown.value
    topn = slider.value
    with out:
        # wait=True keeps the previous table on screen until the new one is
        # ready, which avoids flicker when the slider is dragged.
        clear_output(wait=True)
        data = model.most_similar(lemm, topn=topn)
        df = pd.DataFrame(data, columns=["lemma", "sim"])
        # display() renders the frame as a rich table instead of plain text.
        display(df)

In [4]:
# Output area that `mostsimilar` writes its result table into.
# NOTE(review): ipywidgets normally takes sizes through `layout=`; a bare
# `width` keyword may be ignored by recent versions - confirm before relying on it.
out = widgets.Output(width = 200)

# Custom sort order for the characters used in the lemmas: letters with
# diacritics sort directly after their base letter, subscript digits after digits.
sortorder = " []'ʾaāâbcdeēêfgŋhiīîjklmnopqrsṣštṭuūûvwxyz0123456789₀₁₂₃₄₅₆₇₈₉ₓ{}[]().-/~?!@×|&'<>"
# Rank of every character. Iterating in reverse lets the first occurrence of a
# repeated character win, which is what `sortorder.index()` would return.
sort_rank = {char: rank for rank, char in reversed(list(enumerate(sortorder)))}
# Characters that are missing from `sortorder` sort after all known characters
# instead of aborting the cell with a ValueError.
unknown_rank = len(sortorder)
word = sorted(
    model.index_to_key,
    key=lambda w: [sort_rank.get(c.casefold(), unknown_rank) for c in w],
)  # use custom sort order

dropdown = widgets.Dropdown(options = word, description = 'Target Word')
slider = widgets.IntSlider(value=5, min=1, max = 25, description = 'Topno')
# Both widgets trigger the same callback; it reads the current value of each.
dropdown.observe(mostsimilar, "value")
slider.observe(mostsimilar, "value")
col1 = widgets.VBox([dropdown, out])
col2 = widgets.VBox([slider])
widgets.HBox([col1, col2])

HBox(children=(VBox(children=(Dropdown(description='Target Word', options=('a[arm]N', 'a[time]N', 'a[water]N',…

In [5]:
def word_categories(model, words, topn=10, max_words=5):
    """Assign a category number to each target word and its most similar words.

    Parameters
    ----------
    model
        Object with a `most_similar(word, topn=...)` method that returns
        (word, similarity) pairs, e.g. Gensim keyed vectors.
    words : iterable of str
        One or more target words from the vocabulary of `model`. Any iterable
        is accepted; only the first `max_words` items are used.
    topn : int, optional
        Number of similar words to retrieve for each target word.
    max_words : int, optional
        Maximum number of target words that are taken into account.

    Returns
    -------
    dict
        Maps every target word and every similar word to an integer category.
        The category is the position of the target word in (the truncated)
        `words`. A word that turns up for more than one target gets the
        category `len(words)` (after truncation), i.e. "mixed".
    """
    # Materialise first so that generators and other non-sequence iterables
    # can be truncated and measured.
    words = list(words)[:max_words]
    mixed = len(words)
    word_d = {}
    for idx, word in enumerate(words):
        similar = [m[0] for m in model.most_similar(word, topn=topn)]
        # The target word itself belongs to its own category as well.
        for item in similar + [word]:
            word_d[item] = mixed if item in word_d else idx
    return word_d

# Dictionary to link word to stable identifier (oid) 

In [6]:
# x2oid maps a word to its stable identifier (oid); the plotting code looks
# words up with `x2oid.get(word, "")`.
# NOTE: pickle.load can execute arbitrary code - only load pickles that come
# from a trusted source.
with open("output/x2oid.p", "rb") as r:
    x2oid = pickle.load(r)

# Project and Visualize Related Words
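Pick a maximum of 5 words. For each of those words, the 15 most similar words are selected. All of these words are then projected onto a two-dimensional plot with t-SNE and colored by the selected word they belong to; words that are similar to more than one selected word are labeled "mixed". Clicking a point opens the corresponding entry in ePSD2 (via its oid).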

In [7]:
def tsne_bokeh2(model, words, fontsize="12pt"):
    """Project the selected words and their nearest neighbours with t-SNE and plot them.

    For every target word the 15 most similar words are collected with
    `word_categories`. The vectors of all these words are reduced to two
    dimensions and drawn as a Bokeh scatter plot. Each point is labeled with
    its word and colored by the target word it belongs to ("mixed" when it
    belongs to more than one). Tapping a point opens the URL that is built
    from the word's oid in the global `x2oid` dictionary.

    Parameters
    ----------
    model
        Keyed vectors; must support `model[word]` and the `most_similar()`
        call made by `word_categories`.
    words : sequence of str
        Target words. Only the first five are used.
    fontsize : str, optional
        Font size of the point labels, e.g. "12pt".

    Returns
    -------
    None
        The plot is displayed with `show()`.
    """
    # word_categories() only uses the first five words and `color_d` below has
    # six entries (five targets plus "mixed"), so truncate here as well.
    # Without this, a selection of more than five words would label the
    # "mixed" points with the name of the sixth word.
    words = list(words)[:5]
    if not words:
        # Nothing selected: there is nothing to project.
        print("Select at least one word.")
        return

    labels, tokens, categories, legend, oid = [], [], [], [], []
    words_d = word_categories(model, words, 15)
    mixed = len(words)  # category number used for words shared between targets
    for word, category in words_d.items():
        tokens.append(model[word])
        labels.append(word)
        categories.append(category)
        oid.append(x2oid.get(word, ""))
        legend.append("mixed" if category == mixed else words[category])

    color_d = {0: "black", 1: "red", 2: "green", 3: "yellow", 4: "brown", 5: "blue"}
    colors = [color_d[category] for category in categories]

    # Scale the perplexity with the number of points so that it always stays
    # below the number of samples.
    perplexity = len(words_d) * 0.55
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`,
    # and Bokeh 3 drops `render_mode`, `plot_width` and `plot_height` -
    # confirm against the installed versions before upgrading.
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    tsne_embedding = tsne.fit_transform(tokens)

    source = ColumnDataSource(
        data=dict(
            x=tsne_embedding[:, 0],
            y=tsne_embedding[:, 1],
            colors=colors,
            labels=labels,
            legend=legend,
            oid=oid,
        )
    )

    l = LabelSet(x='x', y='y', text='labels', level='glyph',
                 x_offset=3, y_offset=3, source=source, render_mode='canvas',
                 text_font_size=fontsize, text_font="CuneiformComposite")
    tools = ["pan", "wheel_zoom", "zoom_in", "zoom_out", "tap", "reset", "box_zoom", "save"]
    plot_tsne = figure(plot_width=800, plot_height=600, tools=tools, toolbar_location='right')
    plot_tsne.add_layout(Legend(), "below")
    plot_tsne.circle('x', 'y', size=7, fill_color='colors',
                     line_alpha=0, line_width=0.01, source=source, legend_group="legend")
    plot_tsne.add_layout(l)
    plot_tsne.legend.location = "bottom_right"
    # "@oid" is filled in from the `oid` column of the tapped point.
    url = "http://oracc.org/epsd2/@oid"
    taptool = plot_tsne.select(type=TapTool)
    taptool.callback = OpenURL(url=url)
    show(plot_tsne)

In [8]:
# Multi-select list of target words, pre-set to two example lemmas.
word_picker = widgets.SelectMultiple(
    options=word,
    description="Word",
    value=["šimgig[tree]N", "hiz[vegetable]N"],
)
# `model` and `fontsize` are fixed; only the word selection is interactive.
interact(tsne_bokeh2, model=fixed(model), words=word_picker, fontsize=fixed('12pt'));

interactive(children=(SelectMultiple(description='Word', index=(5767, 2576), options=('a[arm]N', 'a[time]N', '…