# Visualizing FastText Model

This notebook develops functions for visualizing semantic relationships preserved in a FastText model of the Sumerian corpus using t-SNE and Bokeh.

Inspired by [*LDA visualized using t-SNE and Bokeh*](https://www.kaggle.com/yohanb/lda-visualized-using-t-sne-and-bokeh) by Yohan, and [*Visualizing Word Vectors with t-SNE*](https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne) by Jeff Delayney.

Note: for visualization Bokeh is preferred over Matplotlib primarily because of the difficulties in using a custom font in Matplotlib. This becomes a major obstacle when trying to represent tokens in cuneiform.

In [1]:
import gensim
import numpy as np
import pickle
from sklearn.manifold import TSNE
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, LabelSet, Legend, LegendItem, Range1d #, HoverTool, CustomJS, , Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
from bokeh.palettes import Category20
output_notebook()
# note: more recent bokeh versions require notebook 5 or JupyterLab

# Load the Models
There are three models: 
- model_cuneiform.model: the Sumerian copus in Unicode cuneiform
- model_tl.model: the Sumerian corpus in transliteration
- model_lemm.model: the Sumerian corpus in lemmatization

In [2]:
model_c = FT_gensim.load("model/model_cuneiform.model")
model_l = FT_gensim.load("model/model_lemm.model")
model_t = FT_gensim.load("model/model_tl.model")

# Cuneify
Create a function that allows input in transliteration, with output in cuneiform.

In [3]:
with open("output/ogsl.p", "rb") as p:
    o = pd.read_pickle(p, compression = None)
signs_d = dict(zip(o["value"], o["utf8"]))

In [4]:
def cun(text): 
    """transform transliterated input into cuneiform. Use unicode subscript numbers and separate all signs with hyphens; separate words with blanks
    Examples: 'ma-an-gi₄'; 'd-en-lil₂ nibru-ki'.
    Transliteration style (sugal₇ vs. sukkal; dug₄ vs. du₁₁; gen vs. ŋen; etc.) and capitalization are unimportant.
    """
    cun_line = []
    words = text.lower().split()
    for word in words: 
        signs = word.lower().split('-')
        seq = [signs_d[s] if s in signs_d else s for s in signs]
        seq = ''.join(seq)
        cun_line.append(seq)
    line = ' '.join(cun_line)
    return(line)

# Create Lists of Semantically Related Words

In [5]:
def word_categories(model, words, topn=10):
    """word_categories takes an iterable with one or more words from the vocabulary of model.
    For each word a dictionary of similar words (with the target word) is construed.
    The value of each word is numerical (integer) and indicates the category to which it belongs.
    The function returns a dictionary."""
    word_d = {}
    for idx, word in enumerate(words):
        w = model.wv.most_similar(word, topn=topn)
        w = [m[0] for m in w]
        w.append(word)
        for item in w:
            if item in word_d:
                word_d[item] = len(words)
            else:
                word_d[item] = idx
    return word_d

# Project and Visualize Related Words

In [6]:
def tsne_bokeh(model, word_d, words, fontsize="12pt"):

    labels = []
    tokens = []
    categories = []
    legend = []
    
    for word in word_d:
        tokens.append(model.wv[word])
        labels.append(word)
        categories.append(word_d[word])
        if word_d[word] == len(words): 
            legend.append("mixed")
        else: 
            legend.append(words[word_d[word]])
    
    color_d = {0: "black", 1: "red", 2: "green", 3: "yellow", 4: "brown", 5: "blue"}
    colors = [color_d[category] for category in categories]
    #c = Category20[len(words) + 1]
    #colors = [c[category] for category in categories]
    
    tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    tsne_embedding = tsne.fit_transform(tokens)
    tsne_embedding = pd.DataFrame(tsne_embedding, columns=['x','y'])
    tsne_embedding["color"] = colors
    tsne_embedding["labels"] = labels
    tsne_embedding["legend"] = legend

    source = ColumnDataSource(
        data=dict(
        x = tsne_embedding.x,
        y = tsne_embedding.y,
        colors = tsne_embedding.color,
        labels=tsne_embedding.labels,
        legend = tsne_embedding.legend
        )
    )

    l = LabelSet(x='x', y='y', text='labels', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas', 
             text_font_size=fontsize, text_font="CuneiformComposite")

    plot_tsne = figure(plot_width=900, plot_height=900) #, tools=tools_tsne, title='Papers')
    plot_tsne.circle('x', 'y', size=7, fill_color='colors', 
                  line_alpha=0, line_width=0.01, source=source, legend_label="legend")
    plot_tsne.add_layout(l)
    plot_tsne.legend.border_line_width = 3
    plot_tsne.legend.border_line_color = "black"
    plot_tsne.legend.border_line_alpha = 1
    plot_tsne.legend.location = "top_right"
    plot_tsne.legend.background_fill_color = "beige"
    plot_tsne.legend.background_fill_alpha = 0.5
    show(plot_tsne)
    return plot_tsne

In [7]:
#output_file("graphs/vegetables_perfumes.html")
vegetables_perfumes = ["šim-gig", "hi{sar}"]
words_d = word_categories(model_l, vegetables_perfumes, 35)
p2 = tsne_bokeh(model_l, words_d, vegetables_perfumes, "12pt")
#save(p2)
#show(p2)

In [8]:
central_bureau = [cun("na-sa₆"), cun("ab-ba-sa₆-ga"), cun("in-ta-e₃-a")]
c_b = word_categories(model_c, central_bureau, 20)
p1 = tsne_bokeh(model_c, c_b, central_bureau, "20pt")
#save(p1)
#show(p1)

In [10]:
output_file("graphs/vegetables_perfumes_l.html")
vegetables_perfumes = ["šimgig[tree]N", "hiz[vegetable]N"]
words_d = word_categories(model_l, vegetables_perfumes, 35)
p2 = tsne_bokeh(model_l, words_d, vegetables_perfumes, "12pt")
save(p2)
show(p2)

FileNotFoundError: [Errno 2] No such file or directory: 'graphs/vegetables_perfumes_l.html'

In [None]:
model_c.wv.most_similar(cun("ab-ba-sa₆-ga"), topn = 20)

In [None]:
naramili = cun("na-ra-am-i₃-li₂")

In [None]:
model_c.wv.most_similar(naramili)

In [None]:
model_c.wv.most_similar(cun("lugal-iti-da"))

In [None]:
babati_abbasaga = [cun("ba-ba-ti"), cun("ab-ba-sa₆-ga")]
words_d = word_categories(model_c, babati_abbasaga, 10)
p2 = tsne_bokeh(model_c, words_d, babati_abbasaga, "20pt")
show(p2)

In [None]:
p_d = ["ab-ba-sa₆-ga", "u₂-ta₂-mi-šar-ra-am", "a-hu-we-er", "tu-ra-am-d-da-gan"]

In [None]:
p_d = ["šu-{d}idim", "šu-kab-ta", "šu-{d}kab₂-ta₂", "šu-{d}kab-ta₂"]

In [None]:
for item in p_d: 
    print(item in model_t.wv.vocab)

In [None]:
pd = [model_c.wv[p] for p in p_d]

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

In [None]:
Z = linkage(pd, 'ward')

In [None]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

In [None]:
c, coph_dists = cophenet(Z, pdist(pd))
c

In [None]:
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

In [None]:
def dendro(model, word, topn=10): 
    words = model.wv.most_similar(word, topn = topn)
    words = [w[0] for w in words]
    words.append(word)
    vectors = [model.wv[p] for p in words]
    l = linkage(vectors, method='complete', metric='seuclidean')

# calculate full dendrogram
    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.ylabel('word')
    plt.xlabel('distance')

    dendrogram(
        l,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=16.,  # font size for the x axis labels
        leaf_label_func=lambda v: str(words[v])
        )
    plt.show()

In [None]:
dendro(model_t, "ha-bu-um")

In [None]:
model_c.wv.most_similar(positive=["𒊺", "𒆬𒄀"], negative=["𒃻"])

In [None]:
nakabtum_a = [cun("a-hu-ni"), cun("d-šul-gi-a-a-mu"), cun("šu-ma-ma"), cun("zu-ba-ga")]

In [None]:
for name in directors: 
    print(name in model_c.wv.vocab)

In [None]:
nakabtum_b = [cun("lu₂-dingir-ra"), cun("a-hu-we-er"), cun("igi-d-en-lil₂-še₃")]

In [None]:
royal_court = [cun("u₂-ta₂-mi-šar-ra-am"), cun("tu-ra-am-d-da-gan"), cun("tah-ša-tal")]

In [None]:
tummal = [cun("en-dingir-mu")]

In [None]:
enlila = [cun("d-en-lil₂-la₂")]

In [None]:
urkununa = [cun("ur-kug-nun-na")]

In [None]:
nalu = [cun("na-lu₅")]

In [None]:
central = [cun("ab-ba-sa₆-ga"), cun("in-ta-e₃-a"), cun("na-sa₆")]

In [None]:
directors = [cun("na-ra-am-i₃-li₂"), cun("lugal-iti-da"), cun("d-šara₂-kam")]

In [None]:
offices = [nakabtum_a, nakabtum_b, enlila, nalu, 
           urkununa, central, directors]
office_names = ["Nakabtum A", "Nakabtum B", "Enlila", "Nalu", "Urkununa", "Central Bureau", "Directors"]
p = figure(plot_width=900, plot_height=900) 

In [None]:
tokens = []
labels = []
categories = []
legend = []
for idx, office in enumerate(offices): 
    for name in office: 
        tokens.append(model_c.wv[name])
        labels.append(name)
        categories.append(idx)
        legend.append(office_names[idx])

In [None]:
from bokeh.palettes import Category20
c = all_palettes["Spectral"][len(offices)]
colors = [c[category] for category in categories]

In [None]:
tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_embedding = tsne.fit_transform(tokens)
tsne_embedding = pd.DataFrame(tsne_embedding, columns=['x','y'])
tsne_embedding["color"] = colors
tsne_embedding["labels"] = labels
tsne_embedding["legend"] = legend

In [None]:
source = ColumnDataSource(
        data=dict(
        x = tsne_embedding.x,
        y = tsne_embedding.y,
        colors = tsne_embedding.color,
        labels=tsne_embedding.labels,
        legend = tsne_embedding.legend
        )
    )

In [None]:
l = LabelSet(x='x', y='y', text='labels', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas', 
             text_font_size="20pt", text_font="CuneiformComposite")

plot_tsne = figure(plot_width=900, plot_height=900) #, tools=tools_tsne, title='Papers')
plot_tsne.circle('x', 'y', size=7, fill_color='colors', 
                  line_alpha=0, line_width=0.01, source=source, legend="legend")
plot_tsne.add_layout(l)
plot_tsne.legend.border_line_width = 3
plot_tsne.legend.border_line_color = "black"
plot_tsne.legend.border_line_alpha = 1
plot_tsne.legend.location = "top_left"
plot_tsne.legend.background_fill_color = "beige"
plot_tsne.legend.background_fill_alpha = 0.5

In [None]:
show(plot_tsne)

In [None]:
all_palettes["Spectral"][6]

In [None]:
with open("output/utf8_lemm_d.p", 'rb') as f: 
    utf8_lemm = pickle.load(f)
utf8_lemm

In [None]:
utf8_lemm[cun("za-bir")]

In [None]:
nakabtum_a

In [None]:
wild_animals = [cun("ur-mah"), cun("muš"), cun("az"), cun("ka₅-a"), cun("šeg₉"), cun("maš-da₃"), cun("šeg₉-bar"), 
                cun("dara₄"), cun("am"), cun("am-si"), cun("am-si-kur-ra")]

In [None]:
domestic_animals = [cun("udu"), cun("gud"), cun("u₈"), cun("ab₂"), cun("anše"), cun("eme₅"), cun("sila₄"), cun("kir₁₁"), cun("amar"), 
                  cun("uz₃"), cun("maš₂") ]

In [None]:
animals = wild_animals.copy()
animals.extend(domestic_animals)

In [None]:
animal_like = model_c.wv.most_similar(animals, topn=20)
animal_like = [word for word, similarity in animal_like]

In [None]:
animals.extend(animal_like)

In [None]:
x = [model_c.wv.similarity(cun("ur-mah"), word) for word in animals]
y = [model_c.wv.similarity(cun("udu"), word) for word in animals]

In [None]:
df = pd.DataFrame({"x" : x, "y": y, "labels" : animals})

In [None]:
source = ColumnDataSource(
        data=dict(
        x = df.x,
        y = df.y,
        labels=df.labels,
        )
    )

In [None]:
lab = LabelSet(x='x', y='y', text='labels', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas', 
             text_font_size="15pt", text_font="CuneiformComposite")
p3 = figure(plot_width=900, plot_height=900) #, tools=tools_tsne, title='Papers')
p3.circle('x', 'y', size=7, fill_color='red', 
                  line_alpha=0, line_width=0.01, source=source)
left, right, bottom, top = .8, 1.01, .8, 1.01
p3.x_range=Range1d(left, right)
p3.y_range=Range1d(bottom, top)
p3.add_layout(lab)
p3.line([0, 1], [0, 1], line_width=2)

In [None]:
show(p3)

In [None]:
model_c

In [None]:
model_c.wv[cun("lugal")]

In [None]:
from bokeh.plotting import figure, show
from bokeh.layouts import row, gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_file
import pandas as pd

feature_groups  = [['ciao'],['bye']]


df = pd.DataFrame.from_dict({'x':[0,1,2,3,4], 'y':[2,3,4,5,6]})
x_test = [0,1,2,3,4]
y_test = [2,3,4,5,6]
source = ColumnDataSource(df)


for features_columns in feature_groups:
    output_file('features_labels' + features_columns[0] +'.html')
    p = []

    for k,f in enumerate(features_columns):
        p_k = figure(title=f)
        p_k.circle(x=f, y='ki', line_width=2, source=source,fill_alpha=0.5,line_alpha=0.5)
        p_k.circle_cross( x=x_test, y=y_test, color='red',fill_alpha=0.5,line_alpha=0.5)
        p_k.circle_cross( x = x_test, y = y_test, color='green',fill_alpha=0.5,line_alpha=0.5)
        p_k.xaxis.axis_label = f
        p_k.yaxis.axis_label = 'ki'
        p.append(p_k)
    grid = gridplot(p, ncols=2)
    show(grid)


In [None]:
from bokeh.plotting import figure, show
from bokeh.layouts import row, gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_file
import pandas as pd

feature_groups  = [['ciao'],['bye']]

df = pd.DataFrame.from_dict({'x':[0,1,2,3,4], 'y':[2,3,4,5,6]})
x_test = [0,1,2,3,4]
y_test = [2,3,4,5,6]

for features_columns in feature_groups:
    output_file('features_labels' + features_columns[0] +'.html')
    p = []
    source = ColumnDataSource(df)

    for k,f in enumerate(features_columns):
        p_k = figure(title=f)
        p_k.circle(x='x', y='y', line_width=2, source=source,fill_alpha=0.5,line_alpha=0.5)
        p_k.circle_cross( x=x_test, y=y_test, color='red',fill_alpha=0.5,line_alpha=0.5)
        p_k.circle_cross( x = x_test, y = y_test, color='green',fill_alpha=0.5,line_alpha=0.5)
        p_k.xaxis.axis_label = f
        p_k.yaxis.axis_label = 'ki'
        p.append(p_k)
    grid = gridplot(p, ncols=2)
    show(grid)