# Classification with FastText

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import gensim
import pickle
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim

import re
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from numpy import array
from sklearn.cluster import KMeans

## Load / Build File With Only Drehem P Numbers

In [3]:
try:
    data_drehem = LineSentence('corpus/sux_lemm_drehem.txt')
    with open('corpus/sux_lemm_drehem.txt', 'r') as f:
        drehem_lines = f.readlines()
except:
    file = "corpus/sux_lemm.txt"
    with open(file) as f:
        lines = f.readlines()

    with open('BDTNS-CDLI-Drehem.txt', 'r') as f:
        drehem = f.read()
    drehem_P = re.findall(r'P[0-9]+', drehem)

    dct = {}
    for i in range(len(lines)):
        found = re.findall(r'P[0-9]+', lines[i])
        if len(found) == 1:
            dct[found[0][1:]] = i

    drehem_lines = []
    for _ in drehem_P:
        if _[1:] in dct:
            drehem_lines.append(lines[dct[_[1:]]])

    with open('corpus/sux_lemm_drehem.txt', 'w') as f:
        f.write(''.join(drehem_lines))

## Load / Compute All-Sumerian Model

In [4]:
model_all_file = 'model/sux_model_lemm_1.model'

try:
    model_all = FT_gensim.load(model_all_file)
except:
    file = "corpus/sux_lemm.txt"
    data = LineSentence(file)
    model_all = FT_gensim(data, min_count=5, window=10, size=100, negative=20, sorted_vocab=1, min_n=1, 
                          max_n=6, sg=1, iter=500)
    model_all.save(model_all_file)

## Load / Compute Drehem Only Model

In [5]:
model_drehem_file = 'model/sux_model_lemm_drehem_500.model'

try:
    model_drehem = FT_gensim.load(model_drehem_file)
except:
    model_drehem = FT_gensim(data_drehem, min_count=5, window=10, size=100, negative=20, sorted_vocab=1, min_n=1, 
                             max_n=6, sg=1, iter=500)
    model_drehem.save(model_drehem_file)

## Clustering With Weighed Vectors

In [6]:
model_1 = model_drehem
model_2 = model_all
model_1_weight = 0.2
model_2_weight = 1 - model_1_weight
word = 'lu₂-dingir-ra'

vs = []
for i in range(len(drehem_lines)):
    v = [0 for _ in range(100)]
    for w in drehem_lines[i][:-1].split(' ')[1:]:
        v += model_1.wv[w] * model_1_weight + model_2.wv[w] * model_2_weight
    vs.append(v)

ldr_ids = []
for i in range(len(drehem_lines)):
    if len(re.findall(word, drehem_lines[i])) > 0:
        ldr_ids.append(i)

vsn = [0 for i in range(len(vs))]
for i in range(len(vs)):
    vsn[i] = (normalize([vs[i]])[0])

ldr_ids = array(ldr_ids)
ldr = array(vsn)[ldr_ids]
drehem_lines = array(drehem_lines)

kmeans = KMeans(n_clusters=2, random_state=42).fit(ldr)

## Line Numbers of One Cluster

In [7]:
ldr_ids[kmeans.labels_ == 1]
# For content:
# drehem_lines[ldr_ids[kmeans.labels_ == 1]]

array([    4,    47,   116,   157,   168,   201,   202,   283,   293,
         302,   308,   459,   486,   510,   525,   559,   570,   593,
         668,   669,   700,   806,   816,   841,   845,   878,   972,
         990,  1073,  1184,  1205,  1278,  1464,  1549,  1712,  1878,
        2303,  2328,  2368,  2423,  2544,  2577,  2588,  2633,  2785,
        2798,  2881,  2882,  2972,  3006,  3040,  3099,  3128,  3148,
        3167,  3211,  3245,  3293,  3326,  3330,  3417,  3563,  3585,
        3617,  3631,  3632,  3634,  3636,  3645,  3663,  3675,  3684,
        3685,  3687,  3692,  3702,  3707,  3714,  3719,  3731,  3732,
        3749,  3763,  3768,  3776,  3777,  3779,  3785,  3792,  3820,
        3823,  3832,  3836,  3841,  3846,  3847,  3851,  3855,  3856,
        3859,  3869,  3872,  3888,  3947,  3996,  3997,  3998,  4047,
        4054,  4165,  4192,  4209,  4216,  4222,  4238,  4247,  4250,
        4262,  4269,  4466,  4500,  4536,  4538,  4558,  4620,  4625,
        4627,  4629,

## Line Numbers of the Other Cluster

In [8]:
ldr_ids[kmeans.labels_ == 0]
# For text:
# drehem_lines[ldr_ids[kmeans.labels_ == 0]]

array([   38,    66,    71,    91,    93,   130,   132,   135,   154,
         156,   192,   236,   269,   282,   285,   350,   356,   357,
         359,   457,   505,   514,   518,   561,   584,   618,   675,
         712,   732,   761,   794,   796,   839,   842,   846,   849,
         873,   875,   935,   945,  1008,  1034,  1040,  1041,  1049,
        1056,  1063,  1077,  1078,  1082,  1108,  1126,  1165,  1186,
        1191,  1221,  1226,  1229,  1236,  1243,  1248,  1297,  1308,
        1316,  1337,  1360,  1369,  1376,  1393,  1400,  1408,  1418,
        1486,  1505,  1508,  1515,  1524,  1547,  1574,  1579,  1596,
        1599,  1615,  1623,  1625,  1631,  1637,  1638,  1640,  1642,
        1643,  1650,  1655,  1673,  1696,  1731,  1753,  1754,  1771,
        1780,  1789,  1856,  1866,  1868,  1876,  1882,  1895,  1918,
        2026,  2085,  2169,  2170,  2264,  2302,  2327,  2349,  2360,
        2405,  2504,  2515,  2547,  2557,  2558,  2580,  2590,  2601,
        2615,  2689,