# Imports

In [None]:
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import torch
import pickle
from src.pytorch_word2vec import neg_skipgram

from src.dataset import build_corpus

# Configuration

In [None]:
# Corpus-size tag; later cells use it to locate the KL-divergence file and to
# build the corpus.
SIZE = 50000
# Name of the saved gensim model (also used to name the gensim log file).
MODEL_NAME = 'model_50k_300_8_10_1'
# CSV holding the word-analogy rows that are evaluated below.
RELATIONS_CSV = 'data/filtered-questions-words.csv'

# Gensim Word2Vec model loaded from disk.
model = Word2Vec.load(f'data/gensim/models/{MODEL_NAME}.model')

# PyTorch model restored from a saved state dict.
# NOTE(review): the constructor arguments (161333, 300, 0.01) are presumably
# vocabulary size, embedding dimension and a training hyper-parameter --
# confirm against src.pytorch_word2vec.neg_skipgram; they have to be
# compatible with the checkpoint loaded on the next line.
pytorch_model = neg_skipgram(161333, 300, 0.01)
pytorch_model.load_state_dict(torch.load('data/pytorch_model/default/24_w2v.pt'))

# Word analogies

In [3]:
def get_embedding(word, is_pytorch=True):
    """Return the embedding of ``word`` from the selected model.

    Args:
        word: Token to look up.
        is_pytorch: When truthy, query the module-level ``pytorch_model`` via
            its ``embed`` method; otherwise index the gensim model's ``wv``.

    Returns:
        Whatever the selected backend returns for ``word``.
    """
    if not is_pytorch:
        return model.wv[word]
    return pytorch_model.embed(word)
    
def get_similar(vector, n, is_pytorch=True):
    """Return the ``n`` nearest entries to ``vector`` in the selected model.

    Args:
        vector: Query vector.
        n: Number of neighbours to request.
        is_pytorch: When truthy, delegate to ``pytorch_model.get_similar``;
            otherwise use gensim's ``similar_by_vector`` with ``topn=n``.

    Returns:
        The neighbour collection produced by the selected backend.
    """
    if not is_pytorch:
        return model.wv.similar_by_vector(vector, topn=n)
    return pytorch_model.get_similar(vector, n)

In [4]:
def check_relation(row, is_pytorch):
    """Evaluate one analogy row: ``word_two - word_one + word_three``.

    The five nearest neighbours of the resulting vector are fetched and the
    row counts as correct when ``word_four`` is among them.

    Args:
        row: Mapping with the keys ``word_one``, ``word_two``,
            ``word_three`` and ``word_four``.
        is_pytorch: Backend selector forwarded to ``get_embedding`` and
            ``get_similar``.

    Returns:
        A tuple ``(is_correct, top_word, candidate_words)``, or
        ``(None, None, None)`` when any step raises ``KeyError``.

    NOTE(review): the query words are not removed from the candidates in this
    function; whether the backends exclude them is not visible here.
    """
    try:
        first, second, third, expected = (
            row[column]
            for column in ('word_one', 'word_two', 'word_three', 'word_four')
        )

        # Lookups happen left to right: word_two, word_one, word_three.
        target = (
            get_embedding(second, is_pytorch)
            - get_embedding(first, is_pytorch)
            + get_embedding(third, is_pytorch)
        )

        neighbours = get_similar(target, n=5, is_pytorch=is_pytorch)
        candidates = [candidate for candidate, _ in neighbours]

        return expected in candidates, neighbours[0][0], candidates
    except KeyError:
        return None, None, None

def process_relations(df, is_pytorch):
    """Run ``check_relation`` over every row and collect the usable results.

    Args:
        df: DataFrame with the columns ``row_id``, ``category``,
            ``word_one``, ``word_two``, ``word_three`` and ``word_four``.
        is_pytorch: Backend selector forwarded to ``check_relation``.

    Returns:
        A DataFrame with the copied input columns plus ``is_correct``,
        ``predicted_word`` and ``top5``. Rows for which ``check_relation``
        returned no predicted word are left out.
    """
    copied_columns = (
        'row_id', 'category',
        'word_one', 'word_two', 'word_three', 'word_four',
    )
    records = []

    for _, row in df.iterrows():
        is_correct, predicted_word, words_only = check_relation(row, is_pytorch)
        if predicted_word is None:
            # No prediction for this row; skip it.
            continue

        record = {column: row[column] for column in copied_columns}
        record['is_correct'] = is_correct
        record['predicted_word'] = predicted_word
        record['top5'] = words_only
        records.append(record)

    return pd.DataFrame(records)

# Load the analogy rows and evaluate them with the PyTorch model (is_pytorch=True).
csv_df = pd.read_csv(RELATIONS_CSV)
df = process_relations(csv_df, True)

# Fraction of evaluated rows whose expected word is among the returned
# candidates. Rows skipped by process_relations are not in the denominator.
word_analogy_acc = df['is_correct'].astype(int).sum()/len(df)

In [None]:
# Notebook cell output: show the analogy results table.
df

In [None]:
def save_log(is_pytorch):
    """Write the word-analogy summary to a per-model log file.

    The log directory is created if needed, its path is printed, and the
    file ``log-<model_name>.txt`` inside it is overwritten.

    Args:
        is_pytorch: When truthy, log under the PyTorch model directory with
            the name ``pytorch``; otherwise under the gensim directory with
            ``MODEL_NAME``.

    NOTE(review): the figures come from the module-level ``df``,
    ``word_analogy_acc`` and ``csv_df``; ``is_pytorch`` only selects the
    destination and label, it does not recompute anything.
    """
    if is_pytorch:
        path = 'data/pytorch_model/default/log/'
        model_name = 'pytorch'
    else:
        path = 'data/gensim/log/'
        model_name = MODEL_NAME

    os.makedirs(path, exist_ok=True)
    print(path)

    # Read the metrics before opening the file so a failure here does not
    # leave a truncated log behind.
    correct = df['is_correct'].astype(int).sum()
    total = len(df)

    # os.path.join avoids the doubled slash that f'{path}/...' produced,
    # since `path` already ends with a separator.
    log_file = os.path.join(path, f'log-{model_name}.txt')
    with open(log_file, 'w', encoding='utf-8') as f:
        f.write(f"*{model_name}*\n")
        f.write(f"Word analogies accuracy: {word_analogy_acc:.2%}, {correct}/{total}\n")
        f.write(f"Analogies CSV total len: {len(csv_df)}\n")
        
# Write the log file for the PyTorch model.
save_log(True)

In [None]:
# Notebook cell output: show the analogy results table again.
df

In [None]:
# Notebook cell output: show the word-analogy accuracy computed above.
word_analogy_acc

# KL divergence

In [8]:
# Read the stored KL divergences for this corpus size. The context manager
# closes the file even if reading fails.
with open(f'data/kl/kl-{SIZE}.txt', 'r') as arq:
    content = arq.read()

# NOTE(review): eval() executes whatever the file contains. Keep this file
# trusted; if it only ever holds a plain Python literal, ast.literal_eval
# would be the safe replacement -- confirm the file format first.
kl_divergences = eval(content)

In [None]:
# Load the corpus artefacts for this corpus size.
data = build_corpus(
    SIZE,
    return_fields=['corpus', 'word2idx', 'idx2word', 'word_count'],
    load=True,
)
corpus = data['corpus']
w2idx = data['word2idx']
idx2w = data['idx2word']
wc = data['word_count']
# Keep only the entries whose word_count value is at least 10.
filtered_words = {word: count for word, count in wc.items() if count >= 10}

## Visualizing the relation (not adapted to PyTorch)

In [10]:
def norm(embedding):
    """Return ``np.linalg.norm`` of ``embedding`` with NumPy's default order.

    For a 1-D vector this is the L2 (Euclidean) length.
    """
    magnitude = np.linalg.norm(embedding)
    return magnitude

# One (embedding norm, KL divergence) pair per filtered word. Words missing
# from the gensim model get the placeholder norm 0.
data = [
    (norm(model.wv[word]) if word in model.wv else 0, kl_divergences[word])
    for word in filtered_words
]

# Plot coordinates: drop the pairs that carry the placeholder norm.
x = [vector_norm for vector_norm, _ in data if vector_norm != 0]
y = [kl_value for vector_norm, kl_value in data if vector_norm != 0]

In [None]:
# Bin the (norm, KL) points on a grid_size x grid_size grid to count how many
# points fall in each cell.
grid_size = 50
hist, x_edges, y_edges = np.histogram2d(x, y, bins=grid_size)

# Map every point to the index of the bin it falls in; the clip keeps points
# lying on the outermost edge inside the valid index range.
x_idx = np.clip(np.searchsorted(x_edges, x, side='right') - 1, 0, hist.shape[0] - 1)
y_idx = np.clip(np.searchsorted(y_edges, y, side='right') - 1, 0, hist.shape[1] - 1)

# Per-point density: the count of the bin each point belongs to.
density = hist[x_idx, y_idx]

plt.figure(figsize=(10, 6))

# Scatter of norm (x) against KL divergence (y), coloured by bin density.
scatter = plt.scatter(x, y, c=density, cmap='viridis', alpha=0.8, edgecolors='k', s=50)

# Title and axis labels are Portuguese for "Norm x KL Divergence",
# "Vector Norm" and "KL Divergence".
plt.title("Norma x Divergência KL", fontsize=14)
plt.xlabel("Norma do Vetor", fontsize=12)
plt.ylabel("Divergência KL", fontsize=12)

cbar = plt.colorbar(scatter)
# Show only norms in [0, 10]; points outside this range are not displayed.
plt.xlim(0, 10)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))

# Same scatter as before, but coloured by the KL divergence itself.
scatter = plt.scatter(x, y, c=y, cmap='viridis', alpha=0.8, edgecolors='k', s=50)

# Title and axis labels are Portuguese for "Norm x KL Divergence",
# "Vector Norm" and "KL Divergence".
plt.title("Norma x Divergência KL", fontsize=14)
plt.xlabel("Norma do Vetor", fontsize=12)
plt.ylabel("Divergência KL", fontsize=12)

cbar = plt.colorbar(scatter)
cbar.set_label("Divergência KL", fontsize=12)

plt.grid(True, linestyle='--', alpha=0.6)
# Show only norms in [0, 10]; points outside this range are not displayed.
plt.xlim(0, 10)

plt.show()

## Regression

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# Regression inputs: embedding norm (X) against KL divergence (y).
# Pairs whose norm is the placeholder 0 (words absent from the gensim model)
# are excluded, so the fit uses the same points as the scatter plots instead
# of spurious (0, KL) entries, and `y` keeps the same length as `x`.
X = [item[0] for item in data if item[0] != 0]  # norms of words present in the model
y = [item[1] for item in data if item[0] != 0]  # matching KL divergences

In [None]:
# Fit a one-feature linear regression; scikit-learn expects a 2-D feature
# matrix, hence the reshape to a single column.
regression_model = LinearRegression()
regression_model.fit(np.reshape(X, (-1, 1)), y)

In [None]:
# Print the fitted coefficient(s); the label is Portuguese for "Coefficient".
print("Coeficiente:", regression_model.coef_)