In [77]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [78]:
# Load the previously trained Word2Vec model from disk and keep only its
# keyed vectors (`.wv`); the rest of the notebook works on the embeddings,
# not on the trainable model.
word_vectors = Word2Vec.load(
    "../processing/word2vec.model"
).wv

In [80]:
# Split the word embeddings into two clusters with k-means.
#
# The seed is written as the integer 1 instead of the boolean `True` that was
# passed before: `True` is only accepted because bool is an int subclass, and
# it reads like an on/off switch rather than a seed. The integer value is the
# same, so the clustering is expected to be unchanged.
#
# The embedding matrix is cast to float64 ('double') before fitting; any later
# predict/transform call should be given the same dtype.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=60,
).fit(X=word_vectors.vectors.astype('double'))

In [81]:
# Show the ten vocabulary entries closest to the centroid of cluster 1, as a
# manual sanity check of what that cluster contains.
cluster_1_centroid = model.cluster_centers_[1]
word_vectors.similar_by_vector(
    cluster_1_centroid,
    topn=10,
    restrict_vocab=None,
)

[('attempt', 0.999849259853363),
 ('being_connected', 0.9998491406440735),
 ('wonderful_evening', 0.9998481869697571),
 ('nothing_like', 0.9998468160629272),
 ('embodiment', 0.9998446702957153),
 ('i_chatted', 0.9998445510864258),
 ('protective_equipment', 0.9998411536216736),
 ('chabua', 0.9998408555984497),
 ('elephant', 0.9998398423194885),
 ('engage_with', 0.9998378157615662)]

In [82]:
# Hand-picked index of the cluster that later receives cluster_value +1
# (presumably the "positive" sentiment cluster, chosen by inspecting the
# centroid's neighbours - confirm after any re-fit, since k-means labels
# are arbitrary).
p_cluster_index = 1

# With exactly two clusters, the other centroid sits at index 1 - p_cluster_index.
cluster_centers = model.cluster_centers_
p_cluster_center = cluster_centers[p_cluster_index]
n_cluster_center = cluster_centers[1 - p_cluster_index]

In [83]:
# Build a one-row-per-token table holding the token, its embedding and the
# k-means cluster it falls into.
words = pd.DataFrame({'words': word_vectors.index_to_key})

# Look each embedding up by token so the column does not depend on the row
# order of the underlying embedding matrix.
words['vectors'] = words['words'].map(lambda token: word_vectors[token])

# Predict every cluster label in a single batched call instead of invoking
# model.predict once per word (and then unwrapping one-element arrays).
# The matrix is cast to 'double' to match the dtype the model was fitted on.
embedding_matrix = np.stack(words['vectors'].to_numpy()).astype('double')
words['cluster'] = model.predict(embedding_matrix)

In [84]:
# Score every word: sign comes from the cluster it belongs to, magnitude from
# how close it sits to its nearest centroid.

# +1 for words in the cluster selected by p_cluster_index, -1 for the other one.
words['cluster_value'] = np.where(words['cluster'] == p_cluster_index, 1, -1)

# model.transform returns the distance of each sample to every centroid.
# Compute it once for the whole vocabulary rather than once per row; the
# row-wise minimum is the distance to the nearest centroid.
embedding_matrix = np.stack(words['vectors'].to_numpy()).astype('double')
nearest_centroid_distance = model.transform(embedding_matrix).min(axis=1)

# Inverse distance: the closer to a centroid, the larger the score.
# NOTE: a word lying exactly on a centroid (distance 0) yields inf here.
words['closeness_score'] = 1 / nearest_centroid_distance

words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [85]:
words.head(20)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,the,"[-0.0032557384, 0.014657231, 0.044013068, 0.01...",1,1,4.886942,4.886942
1,to,"[-0.023520468, 0.054787725, 0.033953186, 0.022...",1,1,5.927927,5.927927
2,and,"[-0.019750055, 0.043906864, 0.03623085, 0.0173...",1,1,7.065258,7.065258
3,of,"[0.0027759434, 0.014083014, 0.04596995, 0.0122...",1,1,4.435699,4.435699
4,in,"[-0.0070596556, 0.018992692, 0.042874303, 0.00...",1,1,7.980787,7.980787
5,a,"[-0.027137583, 0.043569054, 0.04206904, 0.0163...",1,1,11.548194,11.548194
6,for,"[-0.0183038, 0.043328803, 0.03902246, 0.014244...",1,1,12.321323,12.321323
7,we,"[-0.017228326, 0.0498171, 0.037371643, 0.01505...",1,1,6.645021,6.645021
8,is,"[-0.019295039, 0.035370808, 0.044744648, 0.011...",1,1,10.826874,10.826874
9,our,"[-0.024284206, 0.051811915, 0.032582283, 0.017...",1,1,7.177469,7.177469


In [86]:
# Export the token -> sentiment coefficient table as a CSV file in the
# current working directory, without the DataFrame index column.
sentiment_dictionary = words[['words', 'sentiment_coeff']]
sentiment_dictionary.to_csv('sentiment_dictionary.csv', index=False)