<a href="https://colab.research.google.com/github/mstekel/gate-theory/blob/main/akk_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install third-party dependencies for this notebook.
# gensim is imported below and is the only package pinned to a version;
# text-fabric and jsonpath-ng are presumably needed by the corpus modules
# imported later (TODO confirm) and resolve to whatever version is current.
!pip3 install text-fabric
!pip3 install gensim==4.1.2 --user
!pip3 install jsonpath-ng

In [None]:
%load_ext autoreload
%autoreload 2

import os

# Export a fixed hash seed for reproducibility.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment likely affects only processes spawned from this kernel, not the
# kernel itself — confirm whether the seed needs to be set before launch.
os.environ["PYTHONHASHSEED"] = "777"
import logging  # Logging is configured below so gensim's progress messages are visible

# Emit INFO-level records as "<LEVEL> - <HH:MM:SS>: <message>".
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

In [None]:
from gensim import models
from datetime import datetime
from pathlib import Path
from os import path
import os
import platform
import numpy as np
from collections import defaultdict
import pandas as pd


import sys

# Pick the Google Drive root for the current platform: the synced
# "Google Drive" folder under the user profile on Windows, the mounted
# Drive path otherwise. The value is exported through the environment too.
os.environ["DRIVE_ROOT"] = (
    str(Path(os.environ["USERPROFILE"]) / "Google Drive")
    if platform.system() == 'Windows'
    else '/content/drive/MyDrive'
)
DRIVE_ROOT = Path(os.environ["DRIVE_ROOT"])

# Make the project's helper packages (stored on Drive) importable.
sys.path.append(str(DRIVE_ROOT / 'Colab Notebooks/akk_word2vec'))

from bible import config as bible_config
#from oracc import config as oracc_config
#from simpsons import config as simpsons_config

# Select the active corpus configuration: exactly one `config = ...` line
# should be uncommented (its module must also be imported above).
#config = simpsons_config
config = bible_config
#config = oracc_config

# Hyper-parameters read from the selected configuration; they are also used
# below to name the saved model file.
window = config['word2vec_args']['window']
epochs = config['epochs']

# Tags passed to the corpus constructor further down.
# presumably part-of-speech filters, with [] meaning "no filter" — TODO confirm
pos = ['verb','subs','advb','adjv','nmpr']
# pos = []

# Where the trained model is stored on Drive.
# NOTE: `{pos}` interpolates the list's repr (brackets, quotes, commas and
# spaces) into the file name, so the name changes whenever `pos` changes.
model_path = str(
    DRIVE_ROOT
    / f"models/word2vec/{config['corpus']}_{window}_{epochs}_{pos}.model"
)

# Materialise the corpus as a DataFrame.
# assumes config["corpus_ctor"](pos) yields five-field records matching the
# column names below — TODO confirm against the corpus modules.
df = pd.DataFrame(
    config["corpus_ctor"](pos),
    columns=['origin', 'context', 'clean', 'sense', 'genre'],
)

display(df)

# Set to True to reuse the model previously saved at `model_path` instead of
# retraining from scratch.
LOAD_SAVED_MODEL = False

if LOAD_SAVED_MODEL:
    model = models.Word2Vec.load(model_path)
else:
    print(f"Model build started: {datetime.now().time()}")

    # Create the target folder up front so a missing directory is caught
    # before training rather than when saving afterwards.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)

    # Build the training sentences once and reuse them for both passes.
    sentences = df['clean'].tolist()

    model = models.Word2Vec(**config["word2vec_args"])
    model.build_vocab(sentences, progress_per=10000)
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=epochs,
        report_delay=1,
    )

    # Rescale every word vector to unit length before saving.
    for k in model.wv.key_to_index:
        model.wv[k] = model.wv[k] / np.linalg.norm(model.wv[k])

    model.save(model_path)
    print(f"Model build finished: {datetime.now().time()}")

In [None]:
import numpy as np
# Sanity check: print the vector norm of the first six vocabulary entries.
for k in list(model.wv.key_to_index)[:6]:
    print(np.linalg.norm(model.wv[k]))

In [None]:
from html import escape

# Lemma under study; the rest of the notebook analyses this word.
# NOTE(review): the lemma must be present in the trained vocabulary for the
# lookup below to succeed — confirm it matches the active corpus config.
lemma = "galû"
#lemma = "kayyānu"
print(lemma)
# Query the model for the lemma's neighbours (topn=30); the bare expression on
# the last line makes the notebook display the result.
lemma_span = model.wv.most_similar(positive=[lemma], topn=30)
lemma_span

In [None]:
import numpy as np
import heapq

def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiating, which leaves the result
    mathematically unchanged but prevents ``exp`` from overflowing to ``inf``
    (and the quotient from becoming ``nan``) for large inputs.

    Parameters
    ----------
    x : array_like
        Scores to normalise.

    Returns
    -------
    numpy.ndarray
        Non-negative weights with the same shape as ``x`` that sum to 1.
        An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

def get_sim_vector(s, t):
    """Return the cosine similarity between ``t`` and every vector in ``s``.

    ``s`` is an iterable of vectors and ``t`` a single vector; the result is
    a 1-D array with one similarity per element of ``s``, in order.
    """
    similarities = []
    for candidate in s:
        scale = np.linalg.norm(t) * np.linalg.norm(candidate)
        similarities.append(np.dot(t, candidate) / scale)
    return np.array(similarities)

def pay_attention(lemma_sentences, lemma):
    """Attach attention-weighted context features to ``lemma_sentences``.

    For every row, the tokens in the row's ``'context'`` entry (excluding
    ``lemma`` itself and tokens missing from the model vocabulary) are
    compared with the lemma's vector. The similarities are passed through
    ``softmax`` to obtain attention weights, and the context vectors are
    summed using those weights.

    The frame is modified in place; five columns are added:

    * ``similarity_vector`` – cosine similarity of each context token to the lemma
    * ``attention_vector`` – softmax of the similarities
    * ``attention_highlight_indices`` – positions of the (up to) three largest weights
    * ``attention_highlight`` – the context tokens at those positions
    * ``attention`` – attention-weighted sum of the context vectors

    Rows with no usable context token get ``None`` in all five columns.

    Parameters
    ----------
    lemma_sentences : pandas.DataFrame
        Frame with a ``'context'`` column holding an iterable of tokens per row.
    lemma : str
        Target word; looked up in the module-level ``model``.

    Returns
    -------
    None
    """
    vec = model.wv[lemma]
    # Insertion order of this dict determines the order of the added columns.
    columns = {
        'similarity_vector': [],
        'attention_vector': [],
        'attention_highlight_indices': [],
        'attention_highlight': [],
        'attention': [],
    }

    for _, s in lemma_sentences.iterrows():
        context = [w for w in s['context'] if w != lemma and w in model.wv.key_to_index]
        if not context:
            # No in-vocabulary neighbour: keep the row aligned with placeholders.
            for collected in columns.values():
                collected.append(None)
            continue

        # Stack into a (tokens x dims) numeric matrix.
        context_vec = np.asarray([model.wv[w] for w in context])
        sim_vec = get_sim_vector(context_vec, vec)
        att_vec = softmax(sim_vec)
        assert len(context_vec) == len(att_vec), "context vector and attention vector must have same length"

        attention_highlight_indices = [
            position
            for position, _ in heapq.nlargest(3, enumerate(att_vec), key=lambda x: x[1])
        ]

        columns['similarity_vector'].append(sim_vec)
        columns['attention_vector'].append(att_vec)
        columns['attention_highlight_indices'].append(attention_highlight_indices)
        columns['attention_highlight'].append(list(np.array(context)[attention_highlight_indices]))
        # Weighted sum of the context vectors, kept as a plain numeric array
        # (the previous dtype=tuple silently produced object-dtype arrays).
        columns['attention'].append(att_vec @ context_vec)

    for column_name, collected in columns.items():
        lemma_sentences[column_name] = collected

# Keep only the sentences whose token list contains the lemma. The explicit
# copy makes this an independent frame, so the column assignments below do
# not write into a slice of `df` (avoiding chained-assignment warnings).
lemma_sentences = df[df['clean'].apply(lambda tokens: lemma in tokens)].copy()

# Number of tokens kept on each side of the lemma's first occurrence.
context_window = 5
context = []
for tokens in lemma_sentences['clean']:
    hit = tokens.index(lemma)
    # Slicing clamps the upper bound to the sentence length automatically.
    context.append(tokens[max(0, hit - context_window) : hit + context_window + 1])
lemma_sentences['context'] = context

pay_attention(lemma_sentences, lemma)

# Discard rows containing missing values (e.g. sentences for which no
# attention vector could be computed).
lemma_sentences = lemma_sentences.dropna()
display(lemma_sentences[['origin', 'sense', 'attention_highlight']])


In [None]:
!pip install kneed
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from scipy.spatial.distance import cosine
from kneed import KneeLocator
from sklearn import metrics
import matplotlib.pyplot as plt

# Show complete tables: no row limit and no truncation of long cell text.
# `None` is the supported "unlimited" value for max_colwidth; the legacy -1
# is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Neighbourhood size used by get_optimal_eps_knee.
n_neighbors = 5
min_samples = 5

def get_optimal_eps_knee(values, metric='minkowski'):
    """Estimate a DBSCAN ``eps`` from the knee of the sorted k-distance curve.

    Fits a nearest-neighbour index on ``values``, takes each point's distance
    in the last of the ``n_neighbors`` returned columns, sorts those distances
    and locates the knee of the resulting increasing curve with ``KneeLocator``.
    The curve is also plotted.

    Parameters
    ----------
    values : array_like
        Sample vectors, one per row.
    metric : str, optional
        Distance metric handed to ``NearestNeighbors``. The default keeps the
        previous behaviour; pass ``'cosine'`` when the resulting eps is meant
        for a DBSCAN run that uses ``metric='cosine'``.

    Returns
    -------
    float
        The k-distance at the detected knee.

    Raises
    ------
    ValueError
        If no knee can be located on the curve.
    """
    # Uses the module-level `n_neighbors` constant.
    neighbors = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(values)
    distances, _ = neighbors.kneighbors(values)
    distances = np.sort(distances[:, n_neighbors - 1], axis=0)
    i = np.arange(len(distances))
    knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')
    if knee.knee is None:
        # Indexing with None would silently return a reshaped array, not an eps.
        raise ValueError("No knee point found in the k-distance curve; cannot derive eps")
    # plot_knee opens its own figure, so the size is passed to it directly
    # instead of creating a separate (and otherwise empty) figure first.
    knee.plot_knee(figsize=(5, 5))
    plt.xlabel("Points")
    plt.ylabel("Distance")
    return distances[knee.knee]

def get_optimal_eps_heuristic(values):
    """Search for the smallest eps that keeps DBSCAN noise at or below 50%.

    Candidate eps values run from 0.90 downwards in steps of 0.05. Each one
    is tried with a cosine-metric DBSCAN, and the search stops at the first
    candidate for which more than half of the samples are labelled noise.
    The last accepted candidate is returned (0.95 if none was accepted).
    Progress is printed for every candidate.
    """
    best_eps = 0.95
    for percent in range(90, 0, -5):
        eps = percent / 100
        labels = DBSCAN(eps=eps, metric='cosine').fit(values).labels_
        noise = np.count_nonzero(labels == -1) / len(labels)
        print(f'Portion of noise: {noise}')
        print(f'Current eps: {eps}')
        if noise > 0.5:
            break
        best_eps = eps
    print(f'Best eps: {best_eps}')
    return best_eps


# Cluster the attention-weighted context vectors with cosine-metric DBSCAN.
values = lemma_sentences['attention'].tolist()
# Hand-picked eps; get_optimal_eps_heuristic(values) can be used to search for one.
best_eps = 0.29
clustering = DBSCAN(eps=best_eps, metric='cosine').fit(values)
lemma_sentences['cluster'] = clustering.labels_

# All distinct labels, including DBSCAN's noise label -1 when present, so the
# count printed below includes the noise group.
clusters = sorted(set(clustering.labels_))

# Up to 100 sentences per cluster, ordered by cluster label.
result = (
    lemma_sentences[['origin', 'sense', 'attention_highlight', 'cluster']]
    .sort_values(by=['cluster'])
    .groupby('cluster')
    .head(100)
)

# The former `result.style.hide_index()` call was removed: its return value
# was discarded, so it never affected the output, and the method is no longer
# available in newer pandas releases.
display(result)
print(f'Best eps: {best_eps}')
print(f'Number of clusters: {len(clusters)}')
print(f'Portion of noise: {np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_)}')