# Fasttext
Build a fasttext model and do some basic analyses.
Before running this code parse the ogsl data set to produce a dataframe of OGSL sign readings, sign names and utf8 representations, and parse the Ur 3 dataset to produce a text file in cuneiform utf8. 

The code below runs the Python implementation of FastText, which is rather underdeveloped and is hard to install in Windows. Other notebooks in this directory use the Gensim implementation.

In [1]:
import pickle
from fastText import train_unsupervised
from fastText import util
import pandas as pd
import numpy as np

In [2]:
model = train_unsupervised('corpus/sux_tl.txt', model='skipgram') #, ws=10, neg=70, epoch=100)
# increase negative sampling to 100 or 200 (70?)
# increase epochs 1000?

In [3]:
model.save_model("output/suxmodel_tl")

In [4]:
labels = model.get_words(include_freq=True)

In [5]:
df = pd.DataFrame({"label": list(labels[0]), "count":list(labels[1])})

In [6]:
len(df)

25873

In [7]:
df

Unnamed: 0,label,count
0,x,150217
1,1(diš),114830
2,sila₃,95159
3,</s>,88788
4,2(diš),76711
5,mu,71386
6,5(diš),66008
7,1(u),60889
8,gin₂,58827
9,3(diš),55176


In [8]:
class FastTextNN:
    """by Ali Abul Hawa; see https://github.com/facebookresearch/fastText/pull/552"""    
    def __init__(self, ft_model, ft_matrix=None):
        self.ft_model = ft_model        
        self.ft_words = ft_model.get_words()
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i,:] = ft_model.get_word_vector(word)
    
    def find_nearest_neighbor(self, query_word, vectors, n=10,  cossims=None):
        """
        vectors is a 2d numpy array corresponding to the vectors you want to consider

        cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
        returns the index of the closest n matches to query within vectors and the cosine similarity (cosine the angle between the vectors)

        """
        
        query  = self.ft_model.get_word_vector(query_word)
        if cossims is None:
            cossims = np.matmul(vectors, query, out=cossims)

        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims/norms
        if query_word in self.ft_words:
            result_i = np.argpartition(-cossims, range(n+1))[1:n+1]
        else:
            result_i = np.argpartition(-cossims, range(n+1))[0:n]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        result = self.find_nearest_neighbor(word, self.ft_matrix, n=n)
        if word_freq:
            return [(self.ft_words[r[0]], r[1]) for r in result if self.word_frequencies[self.ft_words[r[0]]] >= word_freq]
        else:
            return [(self.ft_words[r[0]], r[1]) for r in result]

# Basic usage of nearest_words

In [9]:
fasttext_nn = FastTextNN(model)
fasttext_nn.nearest_words('kir₁₁')

[('u₈', 0.8293469438666048),
 ('{munus}aš₂-gar₃', 0.7865385896163815),
 ('sila₄-nita₂', 0.7757921927768099),
 ('{munus}as₂-gar₃', 0.7628351001648012),
 ('ud₅', 0.7582369094984474),
 ('dara₄-nita₂', 0.7374769295860861),
 ('{munus}si-as₂-gar₃', 0.7277550289147642),
 ('maš₂-gal', 0.7236572334900637),
 ('šimašgi₂', 0.7202870804817777),
 ('{munus}aš₂-gar₃-še₃', 0.7160837278921374)]

# Allow input in transliteration

In [10]:
# create conversion dictionary value to utf8
import pickle
with open("output/ogsl.p", "rb") as p:
    o = pickle.load(p)
d = dict(zip(o["value"], o["utf8"]))

In [11]:
def nearestcun(text, n=10, word_freq=None):
    """input transliteration of a word form, signs separated by hyphens.
    Examples: "ma-an-gi₄", "d-en-lil₂", or "urim₅-ki".
    Transliteration style (gu₄ vs. gud; gen vs. ŋen) is unimportant
    The result is fed to nearest_words.
    n is the number of nearest neigbors that nearest_words will return
    word_freq is the minimum word frequency for a neighbor to be considered"""
    signs = text.lower().split('-')
    seq = [d[s] if s in d else s for s in signs]
    seq = ''.join(seq)
    print(seq), print(text)
    return fasttext_nn.nearest_words(seq, n, word_freq)

In [12]:
nearestcun("udu", n=15, word_freq=10)

𒇻
udu


[('lugal-{d}en-lil₂', 0.34422988045901964),
 ('ur-{d}nagar', 0.3226141831505778),
 ('lugal-ušur₃-ra', 0.32199120250670055),
 ('ur-{d}nu-muš-da', 0.319528987428491),
 ('lugal-ušur₄', 0.31901621352498827),
 ('nig₂', 0.31340217694430345),
 ('ur-e₂-nun-gal', 0.3066736712740029),
 ('ur-{d}nusku', 0.3056915158032068),
 ('aš-še₃', 0.3051245135023469),
 ('muhaldim-še₃', 0.30468321109948465),
 ('lugal-ušur₄-mu', 0.3045496127231272),
 ('ur-{d}nun-gal', 0.3037544105525011),
 ('ar-ha', 0.3032255832933541)]

# TODO
Transliterate output by finding the word in the corpus

In [13]:
with open("output/sux.p", "rb") as p:
    sux = pickle.load(p)

In [14]:
sux_2 = sux.drop_duplicates(["transliteration"])

In [15]:
sux_2[sux_2["utf-8"]=="𒊺"]

Unnamed: 0,transliteration,words,names,utf-8,lemm
2,še,[še],[ŠE],𒊺,še[barley]N
38770,ŠE,[ŠE],[ŠE],𒊺,X
43486,niga,[niga],[ŠE],𒊺,niga[fattened]


In [None]:
d2 = {}
for item in sux_2["utf-8"].unique():
    d[item] = list(sux_2[sux_2['utf-8']==item]["transliteration"])
    

In [None]:
d2 = {item : d2[item] for item in d2 if not item[:5] == "Start"}

In [None]:
with open("output/translit_to_signs.p", "wb") as w:
    pickle.dump(d2, w)