# Visualizing Embeddings with t-SNE and Bokeh
The tsne_bokeh function in this notebook looks overly complex with assembling data in a DataFrame and then transforming this same data into a dictionary in ColumnDataSource and then calling this data in plot_tsne.circle. Any suggestions?

Inspired by [*LDA visualized using t-SNE and Bokeh*](https://www.kaggle.com/yohanb/lda-visualized-using-t-sne-and-bokeh) by Yohan, and [*Visualizing Word Vectors with t-SNE*](https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne) by Jeff Delaney.

## Prerequisites
More recent versions of Bokeh are compatible with Jupyter Notebook 5 or JupyterLab, but not with Jupyter Notebook 4.


In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') # this is only relevant in Windows.
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim' ) # warning will disappear in a future version of Gensim
from gensim.models.fasttext import FastText as FT_gensim
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save
from bokeh.models import ColumnDataSource, LabelSet, Legend, LegendItem #, HoverTool, CustomJS, , Slider
output_notebook()
# note: more recent bokeh versions require notebook 5 or JupyterLab

## Load the Sumerian Model

In [None]:
# Load the saved FastText model; the path is relative to the working directory.
model_sux = FT_gensim.load("model/model_lemm.model")

# Select core words
Select between 1 and 5 words for display. Vegetables are luxury food items. The lemma `šimgig[tree]N` appears primarily in deliveries for the production of perfumes.

In [None]:
# Core lemmas to explore; the position of each entry in this list becomes
# its category number in word_categories.
vegetables_perfumes = ["šimgig[tree]N", "hiz[vegetable]N"]

# Find related words for each of the core words

In [None]:
def word_categories(model, words, topn=10):
    """Assign a category number to each core word and its nearest neighbours.

    model: a model whose ``wv.most_similar(word, topn=...)`` returns a sequence
        of ``(word, similarity)`` pairs.
    words: an iterable with one to five words from the vocabulary of model.
    topn: number of similar words to retrieve for each core word.

    For each core word the ``topn`` most similar words, plus the core word
    itself, are stored with the position of the core word in ``words`` (0-4)
    as value. A word that turns up for more than one core word gets the
    value 5 ("mixed").

    Returns a dictionary mapping word -> category number.
    Raises ValueError if more than five core words are supplied, because the
    sixth word would receive category 5 and become indistinguishable from
    "mixed".
    """
    mixed = 5  # category reserved for words shared between core words
    word_d = {}
    for idx, word in enumerate(words):
        # Checked per item (not with len) so that any iterable is accepted.
        if idx >= mixed:
            raise ValueError(
                "word_categories accepts at most %d core words" % mixed)
        similar = [m[0] for m in model.wv.most_similar(word, topn=topn)]
        # The core word goes last, after its neighbours.
        for item in similar + [word]:
            word_d[item] = mixed if item in word_d else idx
    return word_d

In [None]:
# Map every core word and its most similar words (default topn=10) to a category number.
words_d = word_categories(model_sux, vegetables_perfumes)

# Feed the Result to T-SNE and Bokeh

In [None]:
def tsne_bokeh(model, word_d, words, fontsize="12pt"):
    """Reduce the vectors of the words in word_d to 2D with t-SNE and plot them.

    model: a model whose ``wv[word]`` returns the vector of a word.
    word_d: dictionary mapping each word to an integer category; 0-4 is an
        index into ``words``, 5 marks a word shared by several categories
        (shown as "mixed").
    words: sequence of core words; ``words[category]`` is the legend text.
    fontsize: font size of the word labels in the plot.

    Returns the Bokeh figure (not yet displayed).
    """
    color_d = {0: "black", 1: "red", 2: "green", 3: "yellow", 4: "brown", 5: "blue"}

    # One entry per word, all in the iteration order of word_d so that the
    # columns of the data source stay aligned.
    labels = list(word_d)
    tokens = [model.wv[word] for word in labels]
    legend = ["mixed" if word_d[word] == 5 else words[word_d[word]]
              for word in labels]
    colors = [color_d[word_d[word]] for word in labels]

    # scikit-learn requires perplexity < n_samples (recent releases raise a
    # ValueError otherwise). Two core words with ten neighbours each give at
    # most 22 points, so the original value of 40 is capped.
    perplexity = min(40, len(tokens) - 1)
    # NOTE(review): scikit-learn >= 1.5 renames n_iter to max_iter - adjust
    # when upgrading.
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    tsne_embedding = tsne.fit_transform(tokens)

    # ColumnDataSource takes a plain dict of equal-length columns, so the
    # t-SNE output is passed directly without an intermediate DataFrame.
    source = ColumnDataSource(
        data=dict(
            x=tsne_embedding[:, 0],
            y=tsne_embedding[:, 1],
            colors=colors,
            labels=labels,
            legend=legend,
        )
    )

    l = LabelSet(x='x', y='y', text='labels', level='glyph',
                 x_offset=5, y_offset=5, source=source, render_mode='canvas',
                 text_font_size=fontsize, text_font="CuneiformComposite")

    # NOTE(review): Bokeh >= 3.0 renames plot_width/plot_height to
    # width/height, drops render_mode, and replaces legend= with
    # legend_field=/legend_group= - adjust when upgrading.
    plot_tsne = figure(plot_width=900, plot_height=900)
    plot_tsne.circle('x', 'y', size=7, fill_color='colors',
                     line_alpha=0, line_width=0.01, source=source, legend="legend")
    plot_tsne.add_layout(l)
    return plot_tsne

In [None]:
# Build the t-SNE scatter plot for the selected core words and display it.
p2 = tsne_bokeh(model_sux, words_d, vegetables_perfumes)
show(p2)