In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [None]:
# Load the saved Word2Vec model and keep only its `.wv` attribute (the word
# vectors); the rest of the loaded model object is not retained.
w2v_model_path = "../preprocessing_and_embeddings/word2vec.model"
word_vectors = Word2Vec.load(w2v_model_path).wv

In [None]:
# Split the embedding space into two clusters with K-means.
# - The seed is written as the integer 1. A bare `True` is the same value
#   (bool is an int subclass, True == 1), but reads like an on/off flag rather
#   than a seed, so the integer makes the intent explicit.
# - The vectors are cast to float64 ('double') before fitting.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype(np.float64))

In [None]:
# Show the 10 vocabulary entries most similar to the centroid of cluster 1,
# searching the whole vocabulary (restrict_vocab=None).
inspected_center = model.cluster_centers_[1]
word_vectors.similar_by_vector(inspected_center, topn=10, restrict_vocab=None)

[('chorizo_rioja', 0.7952917814254761),
 ('shaved_ricotta', 0.7886500954627991),
 ('de_mariscos', 0.776299238204956),
 ('sald', 0.7753915786743164),
 ('piperade', 0.7741570472717285),
 ('chestnut_fettucine', 0.7724645733833313),
 ('pickled_raisins', 0.7724539041519165),
 ('elegantly_presented', 0.7710554599761963),
 ('crispy_pancetta', 0.7686232328414917),
 ('mince', 0.7669898271560669)]

In [None]:
# Cluster 1 is treated as the "positive" cluster; with exactly two clusters the
# other index (1 - positive) is therefore the "negative" one.
positive_cluster_index = 1
negative_cluster_index = 1 - positive_cluster_index
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[positive_cluster_index],
    model.cluster_centers_[negative_cluster_index],
)

In [None]:
# Build one row per vocabulary entry: the token, its embedding vector and the
# K-means cluster that vector is assigned to.
words = pd.DataFrame({'words': list(word_vectors.index_to_key)})
words['vectors'] = words['words'].apply(lambda token: word_vectors[token])

# Label the whole vocabulary with ONE batched predict() call instead of one
# call per row. The vectors are stacked into a 2-D float64 matrix so the input
# dtype matches the float64 data the model was fitted on.
embedding_matrix = np.stack(words['vectors'].tolist()).astype(np.float64)
words['cluster'] = model.predict(embedding_matrix)

In [None]:
# Sign of the score: +1 for words in the positive cluster, -1 otherwise.
words['cluster_value'] = np.where(words['cluster'] == positive_cluster_index, 1, -1)

# model.transform() returns, per row, the distance to every cluster centre;
# the row minimum is the distance to the nearest centre. Computing it for the
# whole vocabulary in one batched call avoids a transform() call per row.
embedding_matrix = np.stack(words['vectors'].tolist()).astype(np.float64)
nearest_center_distance = model.transform(embedding_matrix).min(axis=1)

# Closeness is the inverse of that distance (closer to a centre => larger
# score); the signed product is the final sentiment coefficient.
words['closeness_score'] = 1 / nearest_center_distance
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [None]:
# Preview the first ten rows of the scored vocabulary table.
words.head(n=10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,the,"[-0.025811557, -0.06777937, 0.03592337, 0.1531...",1,1,0.956277,0.956277
1,and,"[0.04758844, -0.02318652, 0.008652871, 0.07262...",1,1,0.964933,0.964933
2,i,"[0.0046501057, -0.09439859, 0.023876388, 0.101...",1,1,0.917241,0.917241
3,a,"[-0.03135359, -0.008680656, 0.020439783, 0.154...",1,1,0.929204,0.929204
4,to,"[-0.0114356, -0.086332746, -0.00014871405, 0.1...",0,-1,0.893186,-0.893186
5,was,"[-0.07617827, -0.09099883, 0.086886376, -0.015...",1,1,0.957693,0.957693
6,it,"[0.04747942, -0.12630408, 0.04054854, 0.084705...",1,1,0.950745,0.950745
7,of,"[-0.008903565, -0.006358539, 0.009355352, 0.11...",1,1,0.98102,0.98102
8,is,"[0.040766094, 0.0009671062, -0.016327435, 0.02...",1,1,0.947611,0.947611
9,!,"[0.01586833, -0.002405933, -0.017865812, -0.04...",1,1,0.9766,0.9766


In [None]:
# Persist the sentiment dictionary: only the token and its signed score are
# written, without the DataFrame index.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)