# Sumerian and Akkadian Word Embeddings

This notebook loads Sumerian and Akkadian word embedding models and gives some hints for how to explore these models. Will someone be brave enough to try and align the two models?

In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim' )
import gensim
import pickle
from sklearn.manifold import TSNE
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, LabelSet, Legend, LegendItem #, HoverTool, CustomJS, , Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()
# note: more recent bokeh versions require notebook 5 or JupyterLab

# Load the Models
There is one Akkadian model, three Sumerian models (all three use the same data, but different data representations). If your Sumerian is rusty, the lemmatized model is recommended. 
- model_cuneiform.model: the Sumerian copus in Unicode cuneiform
- model_tl.model: the Sumerian corpus in transliteration
- model_lemm.model: the Sumerian corpus in lemmatization

In [2]:
model_sux = FT_gensim.load("model/model_lemm.model")
model_akk = FT_gensim.load("model/akk_model_lemm.model")

# Vocabulary
Lemmatized tokens have the form CitationForm[GuideWord]POS. To find possible vocabulary items you may inspect the `vocabulary`attribute.

In [6]:
model_sux.wv.vocab

({'1(barig@c)': <gensim.models.keyedvectors.Vocab at 0x4fbb860>,
  'še[barley]N': <gensim.models.keyedvectors.Vocab at 0x4fbb908>,
  'ba-lul': <gensim.models.keyedvectors.Vocab at 0x4fbb978>,
  'nagar[carpenter]N': <gensim.models.keyedvectors.Vocab at 0x4fbb9b0>,
  'niŋdu[appropriate-thing]N': <gensim.models.keyedvectors.Vocab at 0x4fbb9e8>,
  'aŋ[measure]V/t': <gensim.models.keyedvectors.Vocab at 0x4fbba20>,
  'hur-sag-še₃-mah': <gensim.models.keyedvectors.Vocab at 0x4fbba58>,
  'saŋ.DUN₃[recorder]N': <gensim.models.keyedvectors.Vocab at 0x4fbba90>,
  '2(iku@c)': <gensim.models.keyedvectors.Vocab at 0x4fbbac8>,
  'iku[unit]N': <gensim.models.keyedvectors.Vocab at 0x4fbbb00>,
  'har-tu-{d}sud₃': <gensim.models.keyedvectors.Vocab at 0x4fbbb38>,
  'nukirik[gardener]N': <gensim.models.keyedvectors.Vocab at 0x4fbbb70>,
  'me-zi-pa-e₃': <gensim.models.keyedvectors.Vocab at 0x4fbbba8>,
  '1/2(iku@c)': <gensim.models.keyedvectors.Vocab at 0x4fbbbe0>,
  'ša₃-gu₂-ba': <gensim.models.keyedvector

# Similar words

In [7]:
model_akk.wv.most_similar("šarru[king]N", topn = 2), model_sux.wv.most_similar("lugal[king]N", topn=2)

([('bēlu[lord]N', 0.9549878835678101), ('ana[to]PRP', 0.9517405033111572)],
 [('ki[place]N', 0.9306298494338989), ('kalam[land]N', 0.9235943555831909)])

# Out of Vocabulary Words
This is fastext ... You may also use OOV words such as partial matches.

In [10]:
model_akk.wv.most_similar("palace", topn = 2), model_sux.wv.most_similar("[tiger]N", topn=2)

([('Ekallu-eššetu[New-Palace-palace-in-Aššur]ON', 0.7083825469017029),
  ('egalturrû[little-palace]N', 0.6874814033508301)],
 [('uršub[tiger]N', 0.6488356590270996),
  ('uršubkuda[wild-animal]N', 0.5293694138526917)])

# Kings and Queens
Hmmmm

In [11]:
model_akk.wv.most_similar(positive=["šarru[king]N", "sinništu[woman]N"], negative = ["zikaru[male]N"])

[('ana[to]PRP', 0.8608435988426208),
 ('bēlu[lord]N', 0.8355156183242798),
 ('ša[of]DET', 0.8317270874977112),
 ('ša[that]REL', 0.82342529296875),
 ('ardu[slave]N', 0.8218632936477661),
 ('ina[in]PRP', 0.8146045207977295),
 ('māru[son]N', 0.80776047706604),
 ('abu[father]N', 0.8045011758804321),
 ('ēkallu[palace]N', 0.8006858229637146),
 ('muhhu[skull]N', 0.7995380163192749)]

# Oxen and Sheep
More culturally appropriate, perhaps.

In [13]:
model_sux.wv.most_similar(positive=["gud[oxen]N", "sila[lamb]N"], negative= ["amar[calf]N"])

[('mašgal[goat]N', 0.9077289700508118),
 ('ašgar[kid]N', 0.9058449268341064),
 ('u[ewe]N', 0.8994815349578857),
 ('udu[sheep]N', 0.8990205526351929),
 ('maš[goat]N', 0.8890479803085327),
 ('nua[~animal]N', 0.8813501596450806),
 ('niga[fattened]V/i', 0.875199556350708),
 ('gukkal[sheep]N', 0.8608914613723755),
 ('aslum[sheep]N', 0.8605831861495972),
 ('mašda[gazelle]N', 0.8534128665924072)]

# Some Bokeh Fun

In [14]:
def tsne_bokeh(model, word_d, words, fontsize="12pt"):

    labels = []
    tokens = []
    categories = []
    legend = []
    
    for word in word_d:
        tokens.append(model.wv[word])
        labels.append(word)
        categories.append(word_d[word])
        if word_d[word] == 5: 
            legend.append("mixed")
        else: 
            legend.append(words[word_d[word]])
    
    color_d = {0: "black", 1: "red", 2: "green", 3: "yellow", 4: "brown", 5: "blue"}
    colors = [color_d[category] for category in categories]
    
    tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    tsne_embedding = tsne.fit_transform(tokens)
    tsne_embedding = pd.DataFrame(tsne_embedding, columns=['x','y'])
    tsne_embedding["color"] = colors
    tsne_embedding["labels"] = labels
    tsne_embedding["legend"] = legend

    source = ColumnDataSource(
        data=dict(
        x = tsne_embedding.x,
        y = tsne_embedding.y,
        colors = tsne_embedding.color,
        labels=tsne_embedding.labels,
        legend = tsne_embedding.legend
        )
    )

    l = LabelSet(x='x', y='y', text='labels', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas', 
             text_font_size=fontsize, text_font="CuneiformComposite")

    plot_tsne = figure(plot_width=900, plot_height=900) #, tools=tools_tsne, title='Papers')
    plot_tsne.circle('x', 'y', size=7, fill_color='colors', 
                  line_alpha=0, line_width=0.01, source=source, legend="legend")
    plot_tsne.add_layout(l)
    return plot_tsne

In [16]:
def word_categories(model, words, topn=10):
    """word_categories takes an iterable with one or more words from the vocabulary of model.
    For each word a dictionary of similar words (with the target word) is construed.
    The value of each word is numerical (integer) and indicates the category to which it belongs.
    The function returns a dictionary."""
    word_d = {}
    for idx, word in enumerate(words):
        w = model.wv.most_similar(word, topn=topn)
        w = [m[0] for m in w]
        w.append(word)
        for item in w:
            if item in word_d:
                word_d[item] = 5
            else:
                word_d[item] = idx
    return word_d

In [19]:
vegetables_perfumes = ["šimgig[tree]N", "hiz[vegetable]N"]
words_d = word_categories(model_sux, vegetables_perfumes, 35)
p2 = tsne_bokeh(model_sux, words_d, vegetables_perfumes, "20pt")
save(p2)
show(p2)

FileNotFoundError: [Errno 2] No such file or directory: 'graphs/vegetables_perfumes.html'