# Sentiment Analysis Training

## Import dependencies

In [49]:
# System
import re
import multiprocessing

# Data manipulation
import pandas as pd
import numpy as np

# Graphing
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
from jupyterthemes import jtplot
jtplot.style(theme='onedork')

In [50]:
# ML

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score, get_scorer, f1_score,roc_auc_score,precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

from joblib import dump, load

In [51]:
# NLP
from wordcloud import WordCloud

from xgboost import XGBClassifier

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Prep nltk library
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Load & inspect

In [52]:
# Load data
file_path = '../data/raw/slava_ukraini_tweets.csv'
raw_tweets_df = pd.read_csv(file_path)
raw_tweets_df.head(10)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1498446801376002051,Rob Smith üá®üá¶ üá∫üá¶,"Ontario, Canada","Proud Canadian üá®üá¶, ‚ù§Ô∏è baseball ‚öæÔ∏è, ‚ù§Ô∏è history,...",2013-02-18 18:05:01,395,1898,4137,False,2022-02-28 23:55:31,@kiraincongress Photos of rally in support of ...,,Twitter for iPhone,0,0,False
1,1498446755947499524,Frags,Chicago,ay yo video games\nMarried To My Best Friend @...,2013-05-18 16:53:43,10523,269,3051,False,2022-02-28 23:55:21,#Russia propaganda on #Twitter attempting to d...,"['Russia', 'Twitter', 'Ukraine']",Twitter for iPhone,0,1,False
2,1498446700096245760,Mark Malahosky üá∫üá¶,"Webster, NY",Pharmacist practicing at a Federally Qualified...,2015-07-14 12:38:43,184,164,38434,False,2022-02-28 23:55:07,@1Arnold_Friend @MaryEmBern @13WHAM Putin lack...,['SlavaUkraini'],Twitter for iPhone,0,1,False
3,1498446601710448647,oz,"Hlavn√≠ mƒõsto Praha, ƒåesk√° repu",foodandfilms,2013-07-21 11:38:47,17,93,7907,False,2022-02-28 23:54:44,"@profgalloway Dear prof, I have been following...",,Twitter for Android,0,1,False
4,1498446536820224000,Sydfish üá∫üá¶üá∫üá¶üá∫üá¶üá∫üá¶,"California, USA",Soccer mom with a punk rock heart!!!!üõëNO LISTS...,2008-12-20 22:41:51,9136,8070,167748,False,2022-02-28 23:54:28,I‚Äôve done more than 100 in St. Petersburg. \n\...,,Twitter for iPhone,0,0,False
5,1498446504528207872,Kwolikowa üá∫üá¶üí™üáµüá±,The East Bay,working cat mom. lover of freedom. hater of di...,2011-12-20 02:32:24,31,295,635,False,2022-02-28 23:54:21,@lesiavasylenko Speak your truth!! #SlavaUkraini,['SlavaUkraini'],Twitter for iPhone,0,1,False
6,1498446476493701123,Kas,"Guadalajara, Jalisco",Looking through the glass of rainbow,2009-12-14 22:55:37,675,1269,28334,False,2022-02-28 23:54:14,Heroes\n#SlavaUkraini https://t.co/gl7n6LgEWF,['SlavaUkraini'],Twitter for iPad,0,0,False
7,1498446442406588419,Jinx Spidox ‚û°Ô∏è Gdakon,"Glasgow, Scotland","Railway Electrician, Lanky Wuff, Skoda driver,...",2012-08-23 14:06:58,758,774,40027,False,2022-02-28 23:54:06,#SlavaUkraini good night #Ukraine keep up the ...,"['SlavaUkraini', 'Ukraine']",Twitter Web App,0,4,False
8,1498446410227851265,Jackie Blue üá∫üá∏ üá®üá¶ üá∫üá¶ üåª,üá∫üá∏ Wrong Side of the 49th,"Well, now on to the next emergency. (And more ...",2022-02-02 05:14:11,254,221,3880,False,2022-02-28 23:53:58,@anagin40 @NATO It sadly took Ukraine being a ...,,Twitter Web App,0,2,False
9,1498446402267095040,Lydia B. üóùüêâ,,it was his hat mr krabs. he was number one!\n\...,2013-10-04 20:52:31,208,915,4401,False,2022-02-28 23:53:56,@MrBeast @AndreaRussett @RosannaPansino @Jacks...,,Twitter Web App,0,1,False


In [53]:
# Get general overview
raw_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24468 entries, 0 to 24467
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                24468 non-null  int64 
 1   user_name         24468 non-null  object
 2   user_location     16461 non-null  object
 3   user_description  21781 non-null  object
 4   user_created      24468 non-null  object
 5   user_followers    24468 non-null  int64 
 6   user_friends      24468 non-null  int64 
 7   user_favourites   24468 non-null  int64 
 8   user_verified     24468 non-null  bool  
 9   date              24468 non-null  object
 10  text              24468 non-null  object
 11  hashtags          15902 non-null  object
 12  source            24468 non-null  object
 13  retweets          24468 non-null  int64 
 14  favorites         24468 non-null  int64 
 15  is_retweet        24468 non-null  bool  
dtypes: bool(2), int64(6), object(8)
memory usage: 2.7+ MB


In [54]:
# Get unique values
raw_tweets_df.nunique()

id                  24468
user_name           10219
user_location        3995
user_description     9025
user_created        10093
user_followers       3973
user_friends         4020
user_favourites     10503
user_verified           2
date                23663
text                24417
hashtags             5889
source                 38
retweets              125
favorites             268
is_retweet              1
dtype: int64

In [55]:
# Check locations (may imply language)
raw_tweets_df['user_location'].value_counts()

United States          328
Bracknell UK           201
Canada                 178
London, England        174
London                 171
                      ... 
Denmark / Ukraine        1
Cambridgeshire           1
Alpharetta, Georgia      1
Beverley, England        1
Inverness, Scotland      1
Name: user_location, Length: 3995, dtype: int64

## Cleaning

In [56]:
# Select relevant columns
col_rename_map = {
    'date': 'date',
    'user_name': 'username',
    'user_description': 'description',
    'user_location': 'location',
    'text': 'tweet',
    'hashtags': 'hashtags'
}

tweets_df = raw_tweets_df[list(col_rename_map.keys())].rename(columns=col_rename_map)

tweets_df.shape

(24468, 6)

In [57]:
# Check duplicate tweets
tweets_df['tweet'].duplicated(keep='first').sum()

51

In [58]:
# Drop duplicate tweets
tweets_df = tweets_df.drop_duplicates(subset='tweet', keep='first')
tweets_df.shape

(24417, 6)

In [59]:
# Initialize Lemmatizer and stopwords
lemma = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [60]:
# Define cleaning functions

def cleanTweet(tweet):

    tweet = tweet.lower()
    tweet = re.sub('https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*', ' ', tweet)
    tweet = re.sub('\$[a-zA-Z0-9]*', ' ', tweet)
    tweet = re.sub('\@[a-zA-Z0-9]*', ' ', tweet)
    tweet = re.sub('[^a-zA-Z\']', ' ', tweet)
    tweet = ' '.join( [w for w in tweet.split() if len(w)>1] )
    
    lem_stopwords = [lemma.lemmatize(x) for x in nltk.wordpunct_tokenize(tweet) 
                     if x not in stop_words]
    tweet = ' '.join(lem_stopwords)
    
    return [lemma.lemmatize(x, nltk.corpus.reader.wordnet.VERB) for x in nltk.wordpunct_tokenize(tweet) 
             if x not in stop_words]


def cleanHashtags(hashtags):

    if hashtags:
        hashtags = hashtags.lower()
        hashtags = re.sub('\$[a-zA-Z0-9]*', ' ', hashtags)
        hashtags = re.sub('[^a-zA-Z]', ' ', hashtags)
        hashtags=hashtags.strip() 
    return hashtags

In [61]:
# Clean tweets
tweets_df['clean_tweet'] = tweets_df['tweet'].apply(lambda x: cleanTweet(x))
tweets_df['cleaned_tweet'] = tweets_df['clean_tweet'].apply(lambda x:' '.join(x))

In [62]:
# Clean hashtags
tweets_df["hashtags"] = tweets_df["hashtags"].astype(str)
tweets_df["hashtags"] = tweets_df["hashtags"].apply(lambda x: cleanHashtags(x))

In [63]:
tweets_df.head()

Unnamed: 0,date,username,description,location,tweet,hashtags,clean_tweet,cleaned_tweet
0,2022-02-28 23:55:31,Rob Smith üá®üá¶ üá∫üá¶,"Proud Canadian üá®üá¶, ‚ù§Ô∏è baseball ‚öæÔ∏è, ‚ù§Ô∏è history,...","Ontario, Canada",@kiraincongress Photos of rally in support of ...,,"[photo, rally, support, ukraine, toronto, cana...",photo rally support ukraine toronto canada sun...
1,2022-02-28 23:55:21,Frags,ay yo video games\nMarried To My Best Friend @...,Chicago,#Russia propaganda on #Twitter attempting to d...,russia twitter ukraine,"[russia, propaganda, twitter, attempt, diminis...",russia propaganda twitter attempt diminish eve...
2,2022-02-28 23:55:07,Mark Malahosky üá∫üá¶,Pharmacist practicing at a Federally Qualified...,"Webster, NY",@1Arnold_Friend @MaryEmBern @13WHAM Putin lack...,slavaukraini,"[friend, putin, lackey, slavaukraini]",friend putin lackey slavaukraini
3,2022-02-28 23:54:44,oz,foodandfilms,"Hlavn√≠ mƒõsto Praha, ƒåesk√° repu","@profgalloway Dear prof, I have been following...",,"[dear, prof, follow, podcast, time, think, sor...",dear prof follow podcast time think sort under...
4,2022-02-28 23:54:28,Sydfish üá∫üá¶üá∫üá¶üá∫üá¶üá∫üá¶,Soccer mom with a punk rock heart!!!!üõëNO LISTS...,"California, USA",I‚Äôve done more than 100 in St. Petersburg. \n\...,,"[do, st, petersburg, dm, would, like, blurb, r...",do st petersburg dm would like blurb russian t...


In [64]:
# Convert date to datetime and extract month/year
tweets_df['date'] = pd.to_datetime(tweets_df['date'])
tweets_df['month'] = tweets_df['date'].dt.month
tweets_df['year'] = tweets_df['date'].dt.year

In [65]:
tweets_df.tail()

Unnamed: 0,date,username,description,location,tweet,hashtags,clean_tweet,cleaned_tweet,month,year
24463,2022-03-13 05:33:56,Joel at Seldon Crisis üéô,"Lover of Earth, Mars, and all of Sol's childre...","Montara, CA (near SF)",@Tazerface16 You went and followed me and now ...,slavaukraini,"[go, follow, ', bring, unfollow, besides, ', g...",go follow ' bring unfollow besides ' get slava...,3,2022
24464,2022-03-13 05:32:32,P,Scottish. British. Proud. Believer in democrac...,"Inverness, Scotland",@dkaleniuk @McFaul This photo was shared a cou...,,"[photo, share, couple, day, ago, lpr, alledged...",photo share couple day ago lpr alledgedly capt...,3,2022
24465,2022-03-13 05:32:04,"T0CM, Back to Buy and Hodl. 8.01. Slava Ukraini!",APES TOGETHER STRONG!,Launch Pad Waiting Room,#SlavaUkraini #FuckPutin #Russiangofuckyourse...,slavaukraini fuckputin russiangofuckyour...,"[slavaukraini, fuckputin, russiangofuckyoursel...",slavaukraini fuckputin russiangofuckyourselves...,3,2022
24466,2022-03-13 05:32:00,JoCroft,#nipolitics üá¨üáß\n#Irishpolitics üáÆüá™ üçÄ \n#perazzi...,Northern Ireland,@nexta_tv The #KingRat #russianoligarch himsel...,kingrat russianoligarch romanabramovich ...,"[tv, kingrat, russianoligarch, romanabramovich...",tv kingrat russianoligarch romanabramovich sla...,3,2022
24467,2022-03-13 05:31:04,‚ùÑRadical Liberal Snowflake‚ùÑ üá∫üá∏ üóΩüá∫üá¶üåª,"This is my mainly political account, if you're...","Washington State, USA",@lapatina_ Mentality sending all the extra str...,slavaukra ni,"[mentality, send, extra, strength, support, ev...",mentality send extra strength support every uk...,3,2022


In [66]:
# Inspect sample of tweets
filter_cond = (tweets_df['year']==2022) & (tweets_df['month']==2)
list(tweets_df['cleaned_tweet'][filter_cond][:100])

['photo rally support ukraine toronto canada sunday ukraine alone',
 'russia propaganda twitter attempt diminish everything ukraine fight spread lie mi',
 'friend putin lackey slavaukraini',
 'dear prof follow podcast time think sort understand',
 'do st petersburg dm would like blurb russian translation',
 'speak truth slavaukraini',
 'hero slavaukraini',
 'slavaukraini good night ukraine keep good fight tonight may saint javelin protector keep',
 'sadly take ukraine sacrificial lamb world wake mount unify assault',
 'please speak ukraine',
 'starlink arrive pretty quickly ukraine slavaukraini ukrainerussiawar',
 'glad hear kid okay slavaukraini slavaukraine',
 'saw news footage family child separate father ukraine upset',
 'swear art form ukraine slavaukraini fckptn',
 'much love southern highland nsw australia slavaukraini',
 'minute silence lisbon ukraine ukrainerussiawar slavaukraini',
 'like laugh johnsonout hoyleout slavaukraini',
 'new video ukraine invasion explain get please 

In [67]:
# Remove all tweets which do not have the words "ukraine" or "russia"
target_tweets_df = tweets_df.copy()

filter_words = ['ukraine', 'russia']
target_tweets_df = target_tweets_df[target_tweets_df["cleaned_tweet"].str.contains('|'.join(filter_words))]
target_tweets_df.shape

(11693, 10)

## Embeddings

In [68]:
# Converting the "clean_text" column in the format supported by embeddings.
sent = [row for row in tweets_df["clean_tweet"]]

# Automatically detect common phrases (bigrams) from a list of sentences.
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]

['russia',
 'propaganda',
 'twitter',
 'attempt',
 'diminish',
 'everything',
 'ukraine',
 'fight',
 'spread',
 'lie',
 'mi']

In [69]:
# Initialize vector model

w2v_model = Word2Vec(min_count=4,
                     window=5,
                     vector_size =300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     seed= 42,
                     workers=multiprocessing.cpu_count()-1)

In [70]:
# Build vocab of the word2vec model from the custom data
w2v_model.build_vocab(sentences, progress_per=50000)

In [71]:
# Train the model
w2v_model.train(sentences, 
                total_examples=w2v_model.corpus_count, 
                epochs=30, 
                report_delay=1)

(976813, 5197770)

In [72]:
# Check similar words to war in the corpus
w2v_model.wv.most_similar(positive=["war"])

[('fa', 0.9996999502182007),
 ('complete', 0.9996972680091858),
 ('johnson', 0.9996960163116455),
 ('organize', 0.9996914267539978),
 ('table', 0.9996911287307739),
 ('range', 0.9996908903121948),
 ('doctor', 0.9996902942657471),
 ('happy_birthday', 0.9996885657310486),
 ('become', 0.9996870756149292),
 ('heart', 0.9996864795684814)]

In [73]:
# Save the word2vec model
# w2v_model.save("../models/word2vec.model")

In [74]:
# Load the word2vec model
# word_vectors = Word2Vec.load("../models/word2vec.model").wv
word_vectors = w2v_model.wv

## Clustering model

In [75]:
# Feed the embeddings to a KMeans model to cluster words into positive, negative, and neutral clusters
cluster_model = KMeans(n_clusters=3, 
               max_iter=1000, 
               random_state=42, 
               n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [29]:
word_vectors.most_similar(negative=['bad', 'suffer', 'bomb'])

[('neverwherdle', 0.10658860951662064),
 ('russia_lapresse', 0.024935027584433556),
 ('ukranian_woman', -0.9974691867828369),
 ('amp_iranian', -0.9975388050079346),
 ('day_brave', -0.9978143572807312),
 ('woman_woman', -0.9978271722793579),
 ('happy_international', -0.9980980157852173),
 ('actually_make', -0.9984676837921143),
 ('like_duck', -0.9985438585281372),
 ('nobel_peace', -0.9986076354980469)]

In [46]:
cluster_values = zip(word_vectors.similar_by_vector(cluster_model.cluster_centers_[0], 
                                                        topn=20, 
                                                        restrict_vocab=None),
    word_vectors.similar_by_vector(cluster_model.cluster_centers_[1], 
                                                        topn=20, 
                                                        restrict_vocab=None),
    word_vectors.similar_by_vector(cluster_model.cluster_centers_[2], 
                                                        topn=20, 
                                                        restrict_vocab=None))

print('Cluster 0\t\t\tCluster 1\t\t\tCluster 2')
print('---------\t\t\t---------\t\t\t---------')
for i, (c1, c2, c3) in enumerate(cluster_values):
    print(f'{i}: {c1[0]} - ({str(c1[1])[:7]})\t\t{c2[0]} - ({str(c2[1])[:7]})\t\t{c3[0]} - ({str(c3[1])[:7]})')


Cluster 0			Cluster 1			Cluster 2
---------			---------			---------
0: ancient - (0.99986)		ancient - (0.99986)		ancient - (0.99986)
1: later - (0.99986)		later - (0.99986)		thousand - (0.99986)
2: hospital - (0.99985)		heartbreaking - (0.99985)		later - (0.99986)
3: thousand - (0.99985)		thousand - (0.99985)		several - (0.99985)
4: heartbreaking - (0.99985)		sound - (0.99985)		hospital - (0.99985)
5: several - (0.99985)		hospital - (0.99985)		sound - (0.99985)
6: sound - (0.99985)		several - (0.99985)		effort - (0.99985)
7: aslund - (0.99985)		aslund - (0.99985)		heartbreaking - (0.99985)
8: effort - (0.99985)		effort - (0.99985)		face - (0.99985)
9: street - (0.99985)		abc_news - (0.99985)		abc_news - (0.99985)
10: face - (0.99985)		street - (0.99985)		street - (0.99985)
11: abc_news - (0.99985)		face - (0.99985)		aslund - (0.99985)
12: swear - (0.99985)		warn - (0.99985)		coward - (0.99985)
13: warn - (0.99985)		swear - (0.99985)		swear - (0.99985)
14: coward - (0.99985)		coward - (

In [48]:
test = list(cluster_values)

In [36]:
# Check each cluster to label the clusters
for i, (word, vector) in enumerate(word_vectors.similar_by_vector(cluster_model.cluster_centers_[0], 
                                                        topn=20, 
                                                        restrict_vocab=None)):

    print(f'{i}: {word} - ({vector})')

print()
print()
print(word_vectors.similar_by_vector(cluster_model.cluster_centers_[1], 
                               topn=20, 
                               restrict_vocab=None))
print()
print(word_vectors.similar_by_vector(cluster_model.cluster_centers_[2], 
                               topn=20, 
                               restrict_vocab=None))

0: ancient - (0.9998633861541748)
1: later - (0.9998631477355957)
2: hospital - (0.9998594522476196)
3: thousand - (0.9998593330383301)
4: heartbreaking - (0.9998591542243958)
5: several - (0.999858021736145)
6: sound - (0.9998573660850525)
7: aslund - (0.9998570680618286)
8: effort - (0.9998566508293152)
9: street - (0.9998552203178406)
10: face - (0.9998547434806824)
11: abc_news - (0.9998543858528137)
12: swear - (0.9998542666435242)
13: warn - (0.9998539686203003)
14: coward - (0.9998530745506287)
15: announce - (0.9998522996902466)
16: mute - (0.9998515248298645)
17: sa - (0.9998512864112854)
18: kosovo - (0.9998509287834167)
19: ball - (0.9998506307601929)


[('ancient', 0.9998639822006226), ('later', 0.9998628497123718), ('heartbreaking', 0.9998595714569092), ('thousand', 0.9998592734336853), ('sound', 0.9998583197593689), ('hospital', 0.9998580813407898), ('several', 0.9998574256896973), ('aslund', 0.9998572468757629), ('effort', 0.9998562932014465), ('abc_news', 0.999855101108

In [76]:
temp = word_vectors.similar_by_vector(cluster_model.cluster_centers_[0], 
                               topn=10, 
                               restrict_vocab=None)

print(temp)

[('toy', 0.9998560547828674), ('along', 0.9998544454574585), ('capital', 0.9998512864112854), ('russianwarcrimes', 0.999850869178772), ('table', 0.9998483657836914), ('farmer', 0.9998480677604675), ('brigade', 0.9998468160629272), ('answer', 0.9998465776443481), ('within', 0.999846339225769), ('regardless', 0.9998459219932556)]


In [83]:
word_vectors.most_similar(positive=['slava', 'slavaukraine'], restrict_vocab=False)

[('russianaggression', 0.9997856020927429),
 ('pound', 0.9997782111167908),
 ('warrior', 0.9997779130935669),
 ('survival', 0.9997776746749878),
 ('manage', 0.9997773170471191),
 ('nuclear', 0.9997752904891968),
 ('game', 0.999774158000946),
 ('early', 0.9997735023498535),
 ('whatever', 0.9997727870941162),
 ('ta', 0.9997723698616028)]

In [86]:
word_vectors.similar_by_word('suffer', restrict_vocab=False)

[('coffee', 0.9997217655181885),
 ('wa', 0.9997190237045288),
 ('mum', 0.9997183680534363),
 ('within', 0.9997137188911438),
 ('escalation', 0.9997112154960632),
 ('foot', 0.9997074604034424),
 ('forever', 0.9997039437294006),
 ('meanwhile', 0.9997033476829529),
 ('starmer', 0.9997005462646484),
 ('georgian', 0.9997005462646484)]

In [82]:
word_vectors.most_similar(negative=['death', 'suffer'], positive=['freedom'], restrict_vocab=False)

[('russia_lapresse', -0.0788167417049408),
 ('neverwherdle', -0.9909428358078003),
 ('woman_woman', -0.9941052198410034),
 ('day_brave', -0.9941832423210144),
 ('amp_iranian', -0.9942304491996765),
 ('happy_international', -0.9944518208503723),
 ('ukranian_woman', -0.9947688579559326),
 ('nobel_peace', -0.9948393702507019),
 ('ronald_reagan', -0.9956508874893188),
 ('world_depend', -0.9956626892089844)]

In [None]:
# Label the clusters based on the type of words they carry
positive_cluster_center = cluster_model.cluster_centers_[2]
negative_cluster_center = cluster_model.cluster_centers_[1]
neutral_cluster_center= cluster_model.cluster_centers_[0]

In [None]:
# Create a DataFrame of words with their embeddings and cluster values

words = pd.DataFrame(word_vectors.index_to_key)
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: cluster_model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [None]:
# Assign 1 to positive values, 0 to neutral and -1 for negative values
words['cluster_value'] = [1 if i==2 else 0 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(cluster_model.transform([x.vectors]).min()), axis=1)

In [None]:
# Save word embeddings to file
words.to_csv('../embeddings/words.csv')

In [None]:
# Load word embeddings
# words = pd.read_csv('../embeddings/words.csv')

## Analysis

In [None]:
# Display negative values
words[words["cluster_value"]==-1].sort_values("closeness_score")

In [None]:
# Plot pie chart of Sentiment Distribution of words
emotion = {
        0: "neutral",
        1: "positive",
        -1: "negative"
}

words["sentiments"] = words["cluster_value"].map(emotion)

fig = plt.gcf()
fig.set_size_inches(7,7)
colors = ["cyan","pink","yellow"]

words_df_pie = words["sentiments"].value_counts().reset_index()

plt.pie(words_df_pie["sentiments"],
        labels=words_df_pie["index"],
        radius=2,
        colors=colors,
        autopct="%1.1f%%")

plt.axis('equal')
plt.title("Sentiment Distribution of Words", fontsize=20)
plt.show()
words_df_pie

In [None]:
# Define a function to get the sentiment for the entire tweet
def getSentiments(row, sentiment_dict):

    total=0
    count=0
    test = row["clean_tweet"]
    for t in test:
        if words_cluster_dict.get(t):
            total+=int(sentiment_dict.get(t))
        count+=1 
        
    avg = total / count
    return -1 if (avg < -0.15) else 1 if (avg > 0.15) else 0

In [None]:
# Create a dictionary of the word and its cluster value
words_cluster_dict = dict(zip(words.words, words.cluster_value))

In [None]:
# Add sentiment column
target_tweets_df["sentiment"] = target_tweets_df.apply(getSentiments,
                                                         args=(words_cluster_dict,),
                                                         axis=1)

In [None]:
# Check the value counts of each sentiment
target_tweets_df["sentiment"].value_counts()

In [None]:
# Plotting pie chart of Sentiment Distribution of tweets
emotion = {
        0: "neutral",
        1: "positive",
        -1: "negative"
}

target_tweets_df["sentiments_val"] = target_tweets_df["sentiment"].map(emotion)

fig = plt.gcf()
fig.set_size_inches(7,7)
colors = ["yellow","cyan","pink"]

tweets_df_pie = target_tweets_df["sentiments_val"].value_counts().reset_index()

plt.pie(tweets_df_pie["sentiments_val"],
        labels=tweets_df_pie["index"],
        radius=2,
        autopct="%1.1f%%",
        colors=colors)

plt.axis('equal')
plt.title("Sentiment Distribution of Tweets ", fontsize=20)
plt.show()
tweets_df_pie

In [None]:
# Save sentiment df
target_tweets_df.to_csv('../data/slava_ukraini_tweets_sentiment.csv')

In [None]:
# Extract negative sentiments
negative_tweets_df = target_tweets_df[target_tweets_df["sentiment"]==-1]

In [None]:
# Inspect the cause of negative tweets
filter_cond = (target_tweets_df['year']==2022) & (target_tweets_df['month']==2)
list(negative_tweets_df['cleaned_tweet'][filter_cond])

In [None]:
# Trim for sample of negative tweets
list(negative_tweets_df["cleaned_tweet"][300:330])

## Visualization

In [None]:
# Inspect keyword sentiment
selections = ['slava', 'zelensky', 'volodimir']

keyword_sent_df = target_tweets_df[(target_tweets_df["cleaned_tweet"].str.contains('|'.join(selections)))]
sns.countplot(x=keyword_sent_df["sentiments_val"]);

In [None]:
# Inspect ukraine sentiment
selections = ["ukraine"]

keyword_sent_df = target_tweets_df[(target_tweets_df["cleaned_tweet"].str.contains('|'.join(selections)))]
sns.countplot(x = keyword_sent_df["sentiments_val"]);

In [None]:
# Inspect russia sentiment
selections = ["russia"]

keyword_sent_df = target_tweets_df[(target_tweets_df["cleaned_tweet"].str.contains('|'.join(selections)))]
sns.countplot(x=keyword_sent_df["sentiments_val"]);

In [None]:
# Tweet counts

plt.subplots(figsize = (10,8))

chart = sns.countplot(x="month", data=target_tweets_df, palette="Set2");
chart.set_xticklabels(chart.get_xticklabels())

plt.title("Tweets per Month ", fontsize=20)
plt.show()

In [None]:
# Tweet sentiments for each month
plt.subplots(figsize = (10,8))

chart = sns.countplot(x="month", data=target_tweets_df, palette="Set2", hue="sentiments_val");
chart.set_xticklabels(chart.get_xticklabels())

plt.title("Tweets Sentiments' per month ", fontsize=20)
plt.show();

In [None]:
# Top 10 highest tweeting usernames
plt.subplots(figsize = (10,8))

plt.title("Top 10 highest tweeting usernames", fontsize=20)

chart = sns.countplot(x="username",
                    hue="sentiments_val",
                    data=target_tweets_df,
                    palette="Set2",
                    order= target_tweets_df["username"].value_counts().iloc[:10].index);

chart.set_xticklabels(chart.get_xticklabels(), 
                      rotation=30, 
                      horizontalalignment='right');

In [None]:
# Top 10 most used hashtags
plt.subplots(figsize = (15,10))
plt.title("Top 10 hashtags", fontsize=20)

chart=sns.countplot(x="hashtags",
                    hue="sentiments_val",
                    data=target_tweets_df,
                    palette="Set2",
                    order=target_tweets_df["hashtags"].value_counts().iloc[1:10].index);

chart.set_xticklabels(chart.get_xticklabels(), 
                      rotation=30, 
                      horizontalalignment='right');

### WordCloud

In [None]:
def generateWordcloud(text):
    words=' '.join([words for words in text])
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

In [None]:
# Wordcloud for all tweets
generateWordcloud(target_tweets_df["cleaned_tweet"].values)

In [None]:
# Wordcloud for positive tweets
generateWordcloud(target_tweets_df[target_tweets_df["sentiment"]==1]["cleaned_tweet"].values)

In [None]:
# Wordcloud for negative tweets
generateWordcloud(target_tweets_df[target_tweets_df["sentiment"]==-1]["cleaned_tweet"].values)

In [None]:
# Wordcloud for neutral tweets
generateWordcloud(target_tweets_df[target_tweets_df["sentiment"]==0]["cleaned_tweet"].values)

## Prediction modelling

In [None]:
target_tweets_df.head()

In [None]:
# Convert each sentiment to df (no need to worry about memory crash, small dataset)
pos_df = target_tweets_df[target_tweets_df["sentiments_val"]=="positive"]
neg_df = target_tweets_df[target_tweets_df["sentiments_val"]=="negative"]
neu_df = target_tweets_df[target_tweets_df["sentiments_val"]=="neutral"]

In [None]:
# Combine all sentiments in one df
sentiments_df_list = [pos_df, neg_df, neu_df] 
agg_sentiment_df = pd.concat(sentiments_df_list)

In [None]:
# Split the data to training, testing, and validation data 
train_test_df, valid_df = train_test_split(agg_sentiment_df, test_size=0.2, random_state=10)

In [None]:
X = train_test_df['cleaned_tweet']
y = train_test_df['sentiment']

# Split the dataset set int0 training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Vectorization

In [None]:
# Instantiate TfidfVectorizer 
vectorizer = TfidfVectorizer(min_df=3,
                             sublinear_tf=True,
#                              encoding="latin-1",
                             ngram_range=(1,2),
                             stop_words='english')

In [None]:
# Fit vectorizer
X_train_tf = vectorizer.fit_transform(X_train.reset_index()["cleaned_tweet"]).toarray()
X_test_tf = vectorizer.transform(X_test.reset_index()["cleaned_tweet"]).toarray()

In [None]:
X_train_tf.shape

In [None]:
feature_names = vectorizer.get_feature_names_out() 

### Build models

In [None]:
# Evaluate various models

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    XGBClassifier()
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, 
                                 X_train_tf, 
                                 y_train, 
                                 scoring='accuracy', 
                                 cv=CV)
    
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

In [None]:
# Aggregate validation scorese
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Plot model performances
plt.subplots(figsize = (10,7))
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', 
              y='accuracy', 
              data=cv_df, 
              size=10,
              jitter=True, 
              edgecolor="gray", 
              linewidth=2)

plt.show()

In [None]:
# Check mean accuracy for each model
cv_df.groupby('model_name').accuracy.mean()

In [None]:
# Above shows linearSVC has highest mean accuracy

## LinearSVC

In [None]:
# Create a dict of Sentiment_val: sentiments to use with the confusion matrix
sentiment_id_df = agg_sentiment_df[['sentiments_val', 'sentiment']].drop_duplicates().sort_values('sentiment')
sentiment_to_id = dict(sentiment_id_df.values)

In [None]:
# Instantiate the model
linearSVC = LinearSVC(random_state=0)

In [None]:
# Fit the model
linearSVC.fit(X_train_tf, y_train)

In [None]:
# Predict
svc_y_pred = linearSVC.predict(X_test_tf)

In [None]:
# Plot confusion matrix to evaluate the model results
svc_conf_mat = confusion_matrix(y_test, svc_y_pred)

fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(svc_conf_mat, 
            annot=True, 
            fmt='d',
            xticklabels=sentiment_id_df.sentiments_val.values, 
            yticklabels=sentiment_id_df.sentiment.values)

plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Get classification report
print(metrics.classification_report(y_test, svc_y_pred))

In [None]:
# Use score method to get accuracy of model
svc_score = linearSVC.score(X_test_tf, y_test)
print(svc_score)

In [None]:
# Feature importance

def plotCoefficients(classification, feature_names, top_features=20):
    size = len(feature_names)
    coef_neg = classification.coef_[-1][:size]
    coef_pos = classification.coef_[1][:size]
    top_positive_coefficients = np.argsort(coef_pos[coef_pos>0])[-top_features:]
    top_negative_coefficients = np.argsort(coef_pos[coef_neg<0])[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    coef=np.hstack([coef_neg,coef_pos])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
    title="Positive and Negative Labels"
    plt.title(title)
    plt.show()

In [None]:
coefs_with_fns = sorted(zip(linearSVC.coef_[0], feature_names)) 
df_feat = pd.DataFrame(coefs_with_fns)
df_feat.columns = 'coefficient','word'
df_feat.sort_values(by='coefficient')

plotCoefficients(linearSVC, feature_names)

In [None]:
# Save linearSVC model
dump(linearSVC, '../models/linearSVC.joblib')

## MultinomialNB Model

In [None]:
# Instantiate the model
multiNB = MultinomialNB()

In [None]:
# Fit the model
multiNB.fit(X_train_tf, y_train)

In [None]:
# predict
nb_y_pred = multiNB.predict(X_test_tf)

In [None]:
# Plot confusion matrix to evaluate the model results
conf_mat = confusion_matrix(y_test, nb_y_pred)
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, 
            annot=True, 
            fmt='d',
            xticklabels=sentiment_id_df.sentiments_val.values, 
            yticklabels=sentiment_id_df.sentiment.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Print classification report
print(metrics.classification_report(y_test, nb_y_pred))

In [None]:
# Use score method to get accuracy of model
nb_score = multiNB.score(X_test_tf, y_test)
print(nb_score)

In [None]:
# Save linearSVC model
dump(multiNB, '../models/multinomialNB.joblib')

## Comparison: LinearSVC vs. MultinomialNB

In [None]:
# Format validation set
valid_df = valid_df.reset_index()
valid_df.tail()

In [None]:
# Create new df contains the sentiment and its encoded value, and create two dictionaries
sentiment_id_df = target_tweets_df[['sentiments_val', 'sentiment']].drop_duplicates().sort_values('sentiment')
sentiment_to_id = dict(sentiment_id_df.values)

id_to_sentiment = dict(sentiment_id_df[['sentiment', 'sentiments_val']].values)

print("sentiment_to_id: ",sentiment_to_id)
print("id_to_sentiment: ",id_to_sentiment)

In [None]:
# Select "random" tweet
tweet = target_tweets_df["tweet"][202]
tweet

In [None]:
# test the data
clean = vectorizer.transform([tweet]) # apply TFIDF
pred = linearSVC.predict(clean) # predict the tweet using out model
pred = id_to_sentiment[pred[0]]
print(pred)

## Validation

In [None]:
# Define our X and y
X_val = valid_df['cleaned_text']
y_val = valid_df['sentiment']

In [None]:
# Vectorize the model
vectorizer = TfidfVectorizer(min_df=3,
                             sublinear_tf=True,
#                              encoding="latin-1", 
                             ngram_range=(1,2),
                             stop_words='english')

In [None]:
# Fit vectors
X_val_tf = vectorizer.fit_transform(X_val.reset_index()['cleaned_text']).toarray()

In [None]:
# Fit multinomialNB
multiNB.fit(X_val_tf, y_val)

In [None]:
# Plot confusion matrix to evaluate the model results
conf_mat = confusion_matrix(y_val, multiNB.predict(X_val_tf))

fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, 
            annot=True, 
            fmt='d',
            cmap='viridis',
            xticklabels=sentiment_id_df.sentiments_val.values, 
            yticklabels=sentiment_id_df.sentiment.values)

plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Print classification report
print(metrics.classification_report(y_val, multiNB.predict(X_val_tf)))