In [1]:
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd
import numpy as np

In [2]:
# Load pre-trained word vectors stored in the plain-text word2vec format
# (binary=False). The path is relative to the notebook's working directory.
# NOTE(review): the file name suggests 100-dimensional CBOW vectors trained on
# NKJP + Wikipedia -- confirm against the model's source.
model_path = 'nkjp+wiki-forms-all-100-cbow-hs.txt'
model = KeyedVectors.load_word2vec_format(model_path, binary=False)


In [3]:
# Read the annotated word list; the preview printed below shows a `word`
# column and a `sentiment` label column.
df = pd.read_csv('../annotations/words/all_words.csv')
# Plain-text preview of the first rows.
print(df.head())

        word sentiment
0  wszystkie   Neutral
1    luzacki  Positive
2      nudne  Negative
3  wporzadku  Positive
4     groźny  Negative


In [8]:
def get_word_embedding(word):
    """Look up the embedding of ``word`` in the global ``model``.

    Args:
        word: Key to look up in ``model``.

    Returns:
        Whatever ``model[word]`` yields when the lookup succeeds; otherwise a
        NumPy array of ``model.vector_size`` zeros, used as a placeholder for
        words the model raises ``KeyError`` on.
    """
    try:
        vector = model[word]
    except KeyError:
        # Unknown word: substitute an all-zero vector of the model's width.
        vector = np.zeros(model.vector_size)
    return vector

# Attach one vector per row by calling get_word_embedding on every entry of
# the `word` column (words it cannot find get its all-zero fallback).
df['embedding'] = df['word'].apply(get_word_embedding)

df.head()


Unnamed: 0,word,sentiment,embedding
0,wszystkie,Neutral,"[-2.297204, 1.9608, -0.525082, 2.372554, -0.15..."
1,luzacki,Positive,"[-0.152554, -0.560976, 0.308668, 0.333914, -0...."
2,nudne,Negative,"[0.64328, -1.358317, 1.081548, 1.442256, -0.99..."
3,wporzadku,Positive,"[1.384761, 0.213649, 0.392168, 1.123134, -0.52..."
4,groźny,Negative,"[0.519366, -0.409377, -0.176712, 0.629131, 0.6..."


In [5]:
# Stack the per-word vectors into one matrix, one row per word.
embeddings = np.vstack(list(df['embedding']))

# Reduce to two components; random_state is fixed so the layout is repeatable.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Plotting frame: 2-D coordinates plus the word and its sentiment label.
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y']).assign(
    word=df['word'],
    sentiment=df['sentiment'],
)

# Points are coloured by sentiment; hovering shows the word and its label
# while the raw coordinates are hidden.
fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    color='sentiment',
    hover_data=dict(word=True, sentiment=True, x=False, y=False),
    title='t-SNE Visualization of Adjectives',
)

fig.update_traces(textposition='top center')
fig.update_layout(
    xaxis_title='t-SNE Component 1',
    yaxis_title='t-SNE Component 2',
)

fig.show()

In [9]:
def get_most_similar_words(word, top_k=5):
    """Return the ``top_k`` nearest neighbours of ``word`` in the global ``model``.

    The result always has exactly ``top_k`` entries so callers can index it
    positionally: missing neighbours are represented by ``None``.

    Args:
        word: Key passed to ``model.most_similar``.
        top_k: Number of neighbours requested (forwarded as ``topn``).

    Returns:
        A list of length ``top_k`` holding the neighbour words in the order
        ``model.most_similar`` returned them, padded with ``None``. If the
        lookup raises ``KeyError`` the list is all ``None``.
    """
    try:
        neighbours = model.most_similar(word, topn=top_k)
    except KeyError:
        return [None] * top_k

    words = [neighbour for neighbour, _similarity in neighbours]
    # The model may return fewer than top_k neighbours (e.g. a very small
    # vocabulary); pad so positional indexing by the caller cannot fail.
    words.extend([None] * (top_k - len(words)))
    return words

top_k = 5
similar_columns = [f'similar_word_{i}' for i in range(1, top_k + 1)]

# Query the model once per word and collect every neighbour list before
# touching `df`: writing into a frame while iterating it with iterrows() is
# not guaranteed to work, and per-cell `.at` writes are slow.
neighbour_lists = [get_most_similar_words(word, top_k) for word in df['word']]
similar_df = pd.DataFrame(
    neighbour_lists,
    index=df.index,
    columns=similar_columns,
    dtype=object,
)

# Assign column by column so re-running the cell overwrites the existing
# similar_word_* columns instead of duplicating them.
for column in similar_columns:
    df[column] = similar_df[column]

df.head()

Unnamed: 0,word,sentiment,embedding,similar_word_1,similar_word_2,similar_word_3,similar_word_4,similar_word_5
0,wszystkie,Neutral,"[-2.297204, 1.9608, -0.525082, 2.372554, -0.15...",te,obydwa,oba,wszyskie,poszczególne
1,luzacki,Positive,"[-0.152554, -0.560976, 0.308668, 0.333914, -0....",zabawny,egzaltowany,przyjacielski,staroświecki,wysublimowany
2,nudne,Negative,"[0.64328, -1.358317, 1.081548, 1.442256, -0.99...",śmieszne,monotonne,beznadziejne,zabawne,męczące
3,wporzadku,Positive,"[1.384761, 0.213649, 0.392168, 1.123134, -0.52...",smieszne,szczesliwy,szczesliwa,wazne,wporządku
4,groźny,Negative,"[0.519366, -0.409377, -0.176712, 0.629131, 0.6...",niegroźny,niebezpieczny,silny,mocny,obiecujący
