# Load Vectors in Gensim
For the code, see https://stackoverflow.com/questions/27139908/load-precomputed-vectors-gensim
The advantage of loading the precomputed vectors in Gensim is that we can use its existing functions (for example `most_similar`).

In [28]:
import gensim
import numpy as np
import pickle
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, LabelSet, Legend, LegendItem, Range1d, OpenURL, TapTool
from bokeh.layouts import column
from bokeh.palettes import all_palettes
from bokeh.palettes import Category20
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()
# Note: more recent Bokeh versions require Jupyter Notebook 5 or JupyterLab.

In [2]:
# Pre-computed word vectors; load_word2vec_format reads the (non-binary)
# word2vec text format by default.
model_file = "output/vec_file.txt"
# KeyedVectors is the documented gensim entry point for loading vectors and
# resolves to the same class as the version-specific Word2VecKeyedVectors name.
model = gensim.models.KeyedVectors.load_word2vec_format(model_file)

In [8]:
def word_categories(model, words, topn=10):
    """Group the nearest neighbours of one or more target words into categories.

    For each target word the `topn` most similar words are retrieved with
    model.most_similar and collected, together with the target word itself,
    in a dictionary.

    Parameters
    ----------
    model : object with a most_similar(word, topn=...) method that returns
        a sequence whose items have the similar word at index 0.
    words : iterable of target words from the vocabulary of model. Any
        iterable is accepted (including generators); it is consumed once.
    topn : number of similar words to retrieve for each target word.

    Returns
    -------
    dict mapping each word to an integer category: the position of the
    target word it belongs to, or the number of target words if the word
    was encountered more than once (i.e. it belongs to a "mixed" category).
    """
    # Materialise once: the number of targets is needed as the "mixed"
    # sentinel, and generators support neither len() nor re-iteration.
    words = list(words)
    mixed = len(words)
    word_d = {}
    for idx, word in enumerate(words):
        neighbours = [match[0] for match in model.most_similar(word, topn=topn)]
        neighbours.append(word)
        for item in neighbours:
            # A word that was already registered is shared between targets.
            word_d[item] = mixed if item in word_d else idx
    return word_d

In [17]:

# x2oid is looked up by vocabulary word in tsne_bokeh to build the
# oracc.org/epsd2 link that is opened when a dot is tapped.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("output/x2oid.p", "rb") as r:
    x2oid = pickle.load(r)

# Project and Visualize Related Words

In [77]:
def tsne_bokeh(model, word_d, words, fontsize="12pt"):
    """Project word vectors to 2D with t-SNE and show them in a Bokeh plot.

    Every word in word_d is drawn as a labelled dot, coloured and grouped in
    the legend by its category. Tapping a dot opens
    http://oracc.org/epsd2/<oid>, where the oid of a word is looked up in the
    module-level dictionary x2oid (an empty string is used when the word has
    no entry).

    Parameters
    ----------
    model : word-vector model; model[word] must return the vector of word.
    word_d : dict mapping each word to plot to its integer category, as
        returned by word_categories.
    words : sequence of target words; words[i] is the legend entry of
        category i, and category len(words) is shown as "mixed".
    fontsize : font size of the word labels (e.g. "12pt").

    Returns
    -------
    The ColumnDataSource that backs the plot (not the figure). The figure
    itself is displayed with show() before returning.
    """
    mixed = len(words)
    labels = list(word_d)
    tokens = [model[word] for word in labels]
    categories = [word_d[word] for word in labels]
    oid = [x2oid.get(word, "") for word in labels]
    legend = ["mixed" if category == mixed else words[category]
              for category in categories]

    # Hand-picked colours for the first categories; additional categories
    # fall back to the Category20 palette instead of raising KeyError.
    color_d = {0: "black", 1: "red", 2: "green", 3: "yellow", 4: "brown", 5: "blue"}
    fallback = Category20[20]
    colors = [color_d[category] if category in color_d
              else fallback[category % len(fallback)]
              for category in categories]

    # Recent scikit-learn versions require perplexity < number of samples;
    # keep 20 whenever enough words are plotted and shrink it otherwise.
    perplexity = min(20, max(len(tokens) - 1, 1))
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    tsne_embedding = tsne.fit_transform(tokens)

    source = ColumnDataSource(
        data=dict(
            x=tsne_embedding[:, 0],
            y=tsne_embedding[:, 1],
            colors=colors,
            labels=labels,
            legend=legend,
            oid=oid,
        )
    )

    label_set = LabelSet(x='x', y='y', text='labels', level='glyph',
                         x_offset=5, y_offset=5, source=source, render_mode='canvas',
                         text_font_size=fontsize, text_font="CuneiformComposite")
    tools = ["pan", "wheel_zoom", "zoom_in", "zoom_out", "tap", "reset", "box_zoom", "save"]
    plot_tsne = figure(plot_width=600, plot_height=400, tools=tools)
    plot_tsne.circle('x', 'y', size=7, fill_color='colors',
                     line_alpha=0, line_width=0.01, source=source, legend_group="legend")
    plot_tsne.add_layout(label_set)

    plot_tsne.legend.border_line_width = 3
    plot_tsne.legend.border_line_color = "black"
    plot_tsne.legend.border_line_alpha = 1
    plot_tsne.legend.location = "bottom_right"
    plot_tsne.legend.background_fill_color = "beige"
    plot_tsne.legend.background_fill_alpha = 0.5

    # "@oid" is replaced by the value of the "oid" column of the tapped dot.
    url = "http://oracc.org/epsd2/@oid"
    taptool = plot_tsne.select(type=TapTool)
    taptool.callback = OpenURL(url=url)
    show(plot_tsne)
    return source

In [78]:
# Optional: uncomment to direct the plot to an HTML file.
# output_file("graphs/vegetables_perfumes.html")

# Target words whose nearest neighbours are plotted together.
vegetables_perfumes = ["šimgig[tree]N", "hiz[vegetable]N"]
words_d = word_categories(model, vegetables_perfumes, topn=10)

# tsne_bokeh shows the figure itself and returns its ColumnDataSource, so p2
# is the plot data, not a figure that could be passed to save() or show().
p2 = tsne_bokeh(model, words_d, vegetables_perfumes, fontsize="12pt")

In [None]:
# Display the x2oid mapping as the cell output, for inspection.
x2oid

In [52]:
# Minimal TapTool/OpenURL demo: tapping a dot opens a URL built from the
# colour name stored for that dot.
p = figure(
    plot_width=400,
    plot_height=400,
    tools="tap",
    title="Click the Dots",
)

source = ColumnDataSource(
    data={
        "x": [1, 2, 3, 4, 5],
        "y": [2, 5, 8, 2, 7],
        "color": ["navy", "orange", "olive", "firebrick", "gold"],
    }
)

p.circle('x', 'y', color='color', size=20, source=source)

# "@color" is replaced by the value of the "color" column of the tapped dot.
url = "http://www.colors.commutercreative.com/@color/"
taptool = p.select(type=TapTool)
taptool.callback = OpenURL(url=url)

show(p)