# Imports

In [None]:
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import torch
import pickle
from src.pytorch_word2vec import neg_skipgram

from src.dataset import build_corpus

# Configuration

In [2]:
# Size parameter passed to build_corpus; it also selects the KL file
# (data/kl/kl-{SIZE}.txt) loaded further down.
SIZE = 50000
# File stem of the saved gensim model; save_log() reuses it to name the log.
MODEL_NAME = 'model_50k_300_8_8_1'
# CSV read by the analogy evaluation when GENERATE_LOG is enabled.
RELATIONS_CSV = 'data/filtered-questions-words.csv'

# Set to True to run the word-analogy evaluation and write its log file.
GENERATE_LOG = False

# Saved Word2Vec model; the cells below query it through model.wv.
model = Word2Vec.load(f'data/gensim/models/{MODEL_NAME}.model')

# Word analogies

In [3]:
def get_embedding(word):
    """Look up the vector stored for ``word`` in the loaded model.

    Indexes the ``wv`` attribute of the module-level ``model``. Any lookup
    error raised for an unknown word propagates to the caller.
    """
    keyed_vectors = model.wv
    return keyed_vectors[word]
    
def get_similar(vector, n):
    """Return the ``n`` vocabulary entries closest to ``vector``.

    Delegates to ``model.wv.similar_by_vector`` with ``topn=n``. Callers in
    this file consume the result as a sequence of ``(word, similarity)``
    pairs.
    """
    neighbours = model.wv.similar_by_vector(vector, topn=n)
    return neighbours

In [4]:
def check_relation(row):
    """Evaluate one analogy row of the form one : two :: three : four.

    Builds the vector ``two - one + three`` and asks the model for its five
    nearest words.

    Args:
        row: Mapping-like object with the keys ``word_one``, ``word_two``,
            ``word_three`` and ``word_four``.

    Returns:
        A tuple ``(hit, best_word, top5)`` where ``hit`` tells whether
        ``row['word_four']`` is among the five nearest words, ``best_word``
        is the nearest one and ``top5`` lists all five. If a ``KeyError`` is
        raised anywhere along the way (a missing row key or a failed
        embedding lookup), ``(None, None, None)`` is returned instead.
    """
    try:
        first, second, third, expected = [
            row[field]
            for field in ('word_one', 'word_two', 'word_three', 'word_four')
        ]

        # Embeddings are fetched in the order two, one, three.
        vec_second = get_embedding(second)
        vec_first = get_embedding(first)
        vec_third = get_embedding(third)

        target = vec_second - vec_first + vec_third

        neighbours = get_similar(target, n=5)
        candidates = [word for word, _ in neighbours]

        return expected in candidates, neighbours[0][0], candidates
    except KeyError:
        return None, None, None

def process_relations(df):
    """Run ``check_relation`` over every row of ``df`` and collect the results.

    Rows for which ``check_relation`` reports no predicted word (``None``)
    are left out of the output.

    Args:
        df: DataFrame providing ``row_id``, ``category`` and the four
            ``word_*`` columns for each analogy.

    Returns:
        A new DataFrame with the copied input columns plus ``is_correct``,
        ``predicted_word`` and ``top5``; empty when no row was kept.
    """
    copied_fields = (
        'row_id', 'category',
        'word_one', 'word_two', 'word_three', 'word_four',
    )
    records = []

    for _, row in df.iterrows():
        is_correct, predicted_word, top_words = check_relation(row)
        if predicted_word is None:
            continue

        record = {field: row[field] for field in copied_fields}
        record['is_correct'] = is_correct
        record['predicted_word'] = predicted_word
        record['top5'] = top_words
        records.append(record)

    return pd.DataFrame(records)

def save_log():
    """Write the word-analogy summary for ``MODEL_NAME`` to a text log.

    Takes no arguments; it reads the module-level names ``MODEL_NAME``,
    ``word_analogy_acc``, ``df`` and ``csv_df``, so those must be assigned
    before it is called (the ``GENERATE_LOG`` block does this).

    Side effects: creates ``data/gensim/log/`` if needed, prints that
    directory, and overwrites ``log-<MODEL_NAME>.txt`` inside it.
    """
    path = 'data/gensim/log/'
    model_name = MODEL_NAME

    os.makedirs(path, exist_ok=True)
    print(path)

    # `path` already ends with a separator; os.path.join avoids the doubled
    # '//' that plain string concatenation would put into the file name.
    log_file = os.path.join(path, f'log-{model_name}.txt')

    # Explicit encoding so the log does not depend on the platform default.
    with open(log_file, 'w', encoding='utf-8') as f:
        f.write(f"*{model_name}*\n")
        f.write(f"Word analogies accuracy: {word_analogy_acc:.2%}, {df['is_correct'].astype(int).sum()}/{len(df)}\n")
        f.write(f"Analogies CSV total len: {len(csv_df)}\n")

# Run the word-analogy evaluation only when GENERATE_LOG is enabled.
if GENERATE_LOG:
    # csv_df, df and word_analogy_acc are assigned at module level on purpose:
    # save_log() takes no arguments and reads them as globals.
    csv_df = pd.read_csv(RELATIONS_CSV)
    df = process_relations(csv_df)

    # Share of the rows kept by process_relations that are flagged correct.
    word_analogy_acc = df['is_correct'].astype(int).sum()/len(df)
    
    save_log()

# KL divergence

In [5]:
# Load the KL divergences stored for this SIZE; later code indexes the result
# by word (kl_divergences[word]).
# NOTE(review): eval() executes whatever the file contains, so this is only
# acceptable for a trusted file (presumably one generated by this project —
# confirm). If the file holds a plain Python literal, ast.literal_eval would
# be the safer choice; check the file format before switching.
with open(f'data/kl/kl-{SIZE}.txt', 'r') as arq:
    kl_divergences = eval(arq.read())

In [None]:
# Request only the fields needed below from build_corpus.
data = build_corpus(
    SIZE,
    return_fields=['corpus', 'word2idx', 'idx2word', 'word_count'],
    load=True,
)
corpus, w2idx, idx2w, wc = (
    data[field] for field in ('corpus', 'word2idx', 'idx2word', 'word_count')
)

# Keep the words whose count is at least 10.
filtered_words = {word: count for word, count in wc.items() if count >= 10}

## Visualizing the relation between vector norm and KL divergence

In [7]:
def norm(embedding):
    """Return ``np.linalg.norm(embedding)`` using NumPy's default settings."""
    magnitude = np.linalg.norm(embedding)
    return magnitude

# One (norm, KL divergence) pair per word in filtered_words; a norm of 0
# marks a word that is not present in model.wv.
data = [
    (norm(model.wv[word]) if word in model.wv else 0, kl_divergences[word])
    for word in filtered_words
]

# Keep only the pairs with a non-zero norm: x holds the norms and y the
# matching KL divergences.
x = [vector_norm for vector_norm, _ in data if vector_norm != 0]
y = [divergence for vector_norm, divergence in data if vector_norm != 0]

In [None]:
import scipy.stats
from matplotlib import rcParams

# Scatter plot of vector norm (x) against KL divergence (y), coloured by local
# point density, with a least-squares regression line on top.

# NOTE(review): this expects an 'Ubuntu' font to be available to matplotlib.
rcParams['font.family'] = 'Ubuntu'

# Bin the points on a grid_size x grid_size grid; the count of a point's bin
# is used as its density.
grid_size = 50
hist, x_edges, y_edges = np.histogram2d(x, y, bins=grid_size)

# Map every point to its bin index. searchsorted(..., side='right') - 1 gives
# the bin whose left edge is <= the value; the clip keeps the maximum value
# (which sits on the last edge) inside the last bin.
x_idx = np.clip(np.searchsorted(x_edges, x, side='right') - 1, 0, hist.shape[0] - 1)
y_idx = np.clip(np.searchsorted(y_edges, y, side='right') - 1, 0, hist.shape[1] - 1)

density = hist[x_idx, y_idx]

fig, ax = plt.subplots(figsize=(10, 6))

scatter = plt.scatter(x, y, c=density, cmap='viridis', alpha=0.8, s=50)

cbar = plt.colorbar(scatter)
cbar.set_label('Density', fontsize=12)

# Linear fit y = m * x + b over all plotted points.
m, b, r_value, p_value, std_err = scipy.stats.linregress(x, y)
ax.plot(x, m * np.array(x) + b, color='black', label='Regression Line')
ax.legend()

# Despite the names, these are hand-scaled means used only as the anchor
# position of the two text annotations.
x_mean, y_mean = np.mean(x)*2.08, np.mean(y)*0.18
ax.annotate(f'r²: {r_value**2:.2f}', xy=(x_mean, y_mean + 1 * y_mean), fontsize=11)
ax.annotate(f'formula: {m:.2f}x + {b:.2f}', xy=(x_mean, y_mean), fontsize=11)

ax.set_title("Norm x KL Divergence - Regular model", fontsize=14)
ax.set_xlabel("Vector norm", fontsize=12)
ax.set_ylabel("KL Divergence", fontsize=12)

ax.set_xlim(-1, 11)
ax.set_ylim(-1, 13)

# savefig does not create missing directories; make sure 'plots/' exists so
# the cell does not fail on a fresh checkout.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/kl_norm_gensim_50k_8e_8w.png')