In [1]:
%config IPCompleter.greedy=True
%load_ext memory_profiler
%matplotlib inline
%run DSF_FINAL_PROJECT_Helper.ipynb
%run DSF_FINAL_PROJECT_Helper_General.ipynb
%run DSF_FINAL_PROJECT_Plot.ipynb

### Description

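Implementing a KMeans clustering model as part of the "transfer learning" methodology: the vocabulary of the pre-trained Word2Vec model is split into two clusters, and each word receives a sentiment coefficient derived from its cluster and its distance to the nearest cluster center.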

<h3 style="background-color:powderblue;">Loading Pre-trained Word2Vec Model</h3>

In [2]:
# Load the previously trained Word2Vec model from "word2vec.model" and keep
# only its keyed word vectors (`.wv`), which the cells below work with.
word_vectors = Word2Vec.load("word2vec.model").wv

INFO - 14:42:05: loading Word2Vec object from word2vec.model
INFO - 14:42:05: loading wv recursively from word2vec.model.wv.* with mmap=None
INFO - 14:42:05: setting ignored attribute vectors_norm to None
INFO - 14:42:05: loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
INFO - 14:42:05: loading trainables recursively from word2vec.model.trainables.* with mmap=None
INFO - 14:42:05: setting ignored attribute cum_table to None
INFO - 14:42:05: loaded word2vec.model


<h3 style="background-color:powderblue;">Training KMeans Model</h3>

In [3]:
# Display the embedding matrix cast to double precision. The cast result is
# only shown here, not stored; the KMeans fit below performs the cast again.
word_vectors.vectors.astype('double')

array([[ 0.07540338,  0.01074704, -0.06582195, ..., -0.02623373,
         0.00852013,  0.04378984],
       [ 0.01639329, -0.04721879,  0.01917083, ...,  0.04354559,
         0.08490365, -0.0501828 ],
       [ 0.07874706,  0.06687038,  0.03743139, ..., -0.02950877,
        -0.08559278, -0.02190159],
       ...,
       [ 0.09668011, -0.00933639, -0.01882755, ..., -0.04592704,
        -0.07622263, -0.04073007],
       [ 0.04689763, -0.05270163,  0.03878045, ...,  0.0181615 ,
         0.09702074, -0.10023054],
       [ 0.05065498, -0.06486151,  0.01897204, ...,  0.00619927,
        -0.02360089,  0.01596161]])

In [4]:
# Since we need to classify words as positive or negative, we are going to need two clusters.
# Fixed integer seed (equivalent to the previous `random_state=True`, which was
# coerced to the integer 1) keeps the clustering reproducible across runs.
KMeansModel = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [5]:
# Take a look at the 10 words most similar to the center of cluster 1, to help
# decide whether that cluster is the positive or the negative one.
word_vectors.similar_by_vector(KMeansModel.cluster_centers_[1], topn=10, restrict_vocab=None)

INFO - 14:42:06: precomputing L2-norms of word weight vectors


[('symptoms', 0.38584989309310913),
 ('days', 0.22586709260940552),
 ('jake_laturner', 0.21476401388645172),
 ('story', 0.21410855650901794),
 ('daily_coronavirus', 0.2125411033630371),
 ('🦠', 0.210493266582489),
 ('sa', 0.21039935946464539),
 ('public', 0.20879510045051575),
 ('covid_comfortable', 0.20693394541740417),
 ('friends', 0.20372441411018372)]

In [6]:
#Defining the two cluster centers we need for classifying.
#Our sentiment rating is positive or negative only (the neutral rating "0" is presumably left out -- TODO confirm).
# NOTE(review): which KMeans label counts as "positive" is chosen by hand here;
# it has to be re-checked whenever the model is refitted with different settings.
positive_cluster_index = 1
positive_cluster_center = KMeansModel.cluster_centers_[positive_cluster_index]
# With two clusters (labels 0 and 1) the other label is 1 - positive_cluster_index.
negative_cluster_center = KMeansModel.cluster_centers_[1-positive_cluster_index]

In [7]:
#positive_cluster_center

In [8]:
#negative_cluster_center

In [9]:
#Calculating clusters and vectors
# One row per vocabulary word, together with its embedding vector.
words = pd.DataFrame({'words': list(word_vectors.vocab.keys())})
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])
# Predict every cluster label with a single batched call instead of one
# KMeansModel.predict call per word. The vectors are stacked into a
# (n_words, n_dims) matrix; the labels are cast to int64 so the column keeps
# the dtype it had with the per-row version.
words['cluster'] = KMeansModel.predict(np.vstack(words['vectors'].tolist())).astype('int64')

In [10]:
# Sign of the sentiment: +1 for words in the positive cluster, -1 otherwise.
words['cluster_value'] = words['cluster'].eq(positive_cluster_index).map({True: 1, False: -1})
# Closeness is the inverse of the distance to the nearest cluster center.
# KMeansModel.transform returns one distance per (word, center); it is called
# once on the stacked vectors rather than once per row, and min(axis=1) picks
# the nearest center for each word.
words['closeness_score'] = 1 / KMeansModel.transform(np.vstack(words['vectors'].tolist())).min(axis=1)
# Signed score: magnitude from the closeness, sign from the cluster.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [11]:
# Preview the first rows of the per-word sentiment table.
# NOTE(review): the saved output below lists 10 rows although this call asks
# for 5, so that output is stale -- re-run the cell to refresh it.
words.head(5)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,4000,"[-0.063920654, 0.040959604, 0.044367883, 0.088...",1,1,1.012727,1.012727
1,u.s.,"[0.08878458, -0.042585414, -0.087773584, 0.081...",0,-1,1.003965,-1.003965
2,covid-19_deaths,"[-0.0157652, 0.04773497, -0.084967054, -0.0418...",0,-1,1.005249,-1.005249
3,single_day,"[-0.01626454, 0.015796168, 0.08612141, 0.05212...",1,1,1.011506,1.011506
4,--,"[-0.0072290683, -0.0060554845, 0.06939764, 0.0...",1,1,1.00445,1.00445
5,day,"[0.063255735, 0.025054563, -0.10043011, 0.0748...",0,-1,1.002889,-1.002889
6,mob,"[0.089262724, -0.09727907, -0.033364583, 0.068...",1,1,1.000289,1.000289
7,stop_spending,"[-0.09153232, 0.043836623, 0.07361388, -0.0985...",0,-1,1.009398,-1.009398
8,worried_—,"[0.071984805, -0.0006059831, 0.066567436, -0.0...",0,-1,0.999431,-0.999431
9,freshman_gop,"[-0.043518055, 0.045704123, -0.023872497, -0.0...",1,1,1.012325,1.012325


In [39]:
# Summarize column dtypes, non-null counts and memory usage of `words`.
words.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   words            187 non-null    object 
 1   vectors          187 non-null    object 
 2   cluster          187 non-null    int64  
 3   cluster_value    187 non-null    int64  
 4   closeness_score  187 non-null    float64
 5   sentiment_coeff  187 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 8.9+ KB


<h3 style="background-color:powderblue;">Saving the sentiment KMeans Model</h3>

In [50]:
#Connecting to sqlite3 database
# NOTE(review): Sqlite3Db is not defined in this notebook; presumably it comes
# from one of the helper notebooks loaded with %run -- confirm.
dbConn = Sqlite3Db('social_network.db')

In [51]:
# When CLEAN_TRAINING is True, drop any existing TweetsSentAnalysis table so
# that it is recreated from scratch. The same flag also gates the cell that
# inserts the sentiment rows.
CLEAN_TRAINING = True
if CLEAN_TRAINING:
    dbConn.query('''DROP TABLE IF EXISTS TweetsSentAnalysis; ''')

In [52]:
#Creating the TweetsSentAnalysis table (a word and its sentiment coefficient) if it does not exist yet
dbConn.query('''CREATE TABLE IF NOT EXISTS TweetsSentAnalysis(words TEXT, sentiment_coeff REAL)''')

In [53]:
#Cleaning the TweetsSentAnalysis table and storing the sentiment dictionary in it
# NOTE(review): the insert sits inside the CLEAN_TRAINING guard as well, so
# nothing is written to the table when the flag is False.
if CLEAN_TRAINING:
    # Remove any rows already present before appending the new ones.
    del_stm_str = """DELETE FROM TweetsSentAnalysis;"""
    dbConn.query(sqlStm=del_stm_str)
    # Append only the word and its sentiment coefficient; the DataFrame index is not stored.
    words[['words', 'sentiment_coeff']].to_sql('TweetsSentAnalysis', con=dbConn.conn, if_exists='append', index=False)

In [54]:
# Read the stored (words, sentiment_coeff) rows back from the database.
dbConn.query('''SELECT words, sentiment_coeff FROM TweetsSentAnalysis; ''')

# Fetch all rows of the result from the connection wrapper's cursor
# (presumably the cursor the query above was executed on -- confirm).
sqlStmSARes = dbConn.cursor.fetchall()
#print(type(sqlStmSARes))
#print(sqlStmSARes)
# Rebuild a DataFrame using the same two column names as the table.
df_Tweets_sent_analysis = pd.DataFrame(sqlStmSARes, columns =['words', 'sentiment_coeff'])

In [55]:
# Preview the first three rows read back from the database.
df_Tweets_sent_analysis.head(3)

Unnamed: 0,words,sentiment_coeff
0,4000,1.012727
1,u.s.,-1.003965
2,covid-19_deaths,-1.005249


In [56]:
# Summarize the frame read back from the database, to compare its row count
# and dtypes against `words`.
df_Tweets_sent_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   words            187 non-null    object 
 1   sentiment_coeff  187 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.0+ KB


In [57]:
#Close the database connection now that the sentiment rows have been written and read back
dbConn.close()

In [40]:
#words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [None]:
#Saving the generated sentiment dictionary to our sql database