# Sentiment Analysis Prediction

## Import dependencies

In [1]:
# System
import re
import multiprocessing

# Data manipulation
import pandas as pd
import numpy as np

# Graphing
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
# from jupyterthemes import jtplot
# jtplot.style(theme='onedork')

In [2]:
# ML

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score, get_scorer, f1_score,roc_auc_score,precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

from joblib import dump, load

In [3]:
# NLP
from wordcloud import WordCloud

from xgboost import XGBClassifier

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Prep nltk library
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

  from pandas import MultiIndex, Int64Index
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/petergish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Load & inspect

In [5]:
# Load data
# file_path = 'https://www.kaggle.com/towhidultonmoy/russia-vs-ukraine-tweets-datasetdaily-updated?select=filename.csv'
# file_path = 'https://www.kaggle.com/towhidultonmoy/russia-vs-ukraine-tweets-datasetdaily-updated/download'
# raw_tweets_df = pd.read_csv(file_path, encoding='latin-1', error_bad_lines=False, engine ='python', sep=',')
raw_tweets_df = pd.read_csv('../../data/russia_vs_ukraine_tweets.csv')
raw_tweets_df.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1517715552302686209,1517634680404598784,2022-04-23 04:02:40 UTC,2022-04-23,04:02:40,0,870355689545371648,voidbourn,VoidBourn IGG 🇺🇲 🇷🇺,,...,,,,,,"[{'screen_name': 'El_Was_Taken', 'name': 'Elli...",,,,
1,1517715545575297025,1517666965464264706,2022-04-23 04:02:38 UTC,2022-04-23,04:02:38,0,338911267,applekappa1337,Applegaku,,...,,,,,,"[{'screen_name': 'fedtanyl', 'name': 'Fed For ...",,,,
2,1517715539925561344,1517569434956804103,2022-04-23 04:02:37 UTC,2022-04-23,04:02:37,0,703180594914570240,mbw955,"Mal, just another ប្រឆាំងហ្វាស៊ីស amongst many.",,...,,,,,,"[{'screen_name': 'pl4ma', 'name': 'plama', 'id...",,,,
3,1517715531574489094,1517533955959967746,2022-04-23 04:02:35 UTC,2022-04-23,04:02:35,0,1411455047263670273,shodanette,Shodan🔮|,,...,,,,,,"[{'screen_name': 'Rimlee18', 'name': 'Rimlee',...",,,,
4,1517715528697143296,1517491994922213379,2022-04-23 04:02:34 UTC,2022-04-23,04:02:34,0,1181952375399092225,chilberg11,Carl Hilberg,,...,,,,,,"[{'screen_name': 'InnaSovsun', 'name': 'Inna S...",,,,


In [6]:
# Get basic summary
raw_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               10001 non-null  int64  
 1   conversation_id  10001 non-null  int64  
 2   created_at       10001 non-null  object 
 3   date             10001 non-null  object 
 4   time             10001 non-null  object 
 5   timezone         10001 non-null  int64  
 6   user_id          10001 non-null  int64  
 7   username         10001 non-null  object 
 8   name             10000 non-null  object 
 9   place            2 non-null      object 
 10  tweet            10001 non-null  object 
 11  language         10001 non-null  object 
 12  mentions         10001 non-null  object 
 13  urls             10001 non-null  object 
 14  photos           10001 non-null  object 
 15  replies_count    10001 non-null  int64  
 16  retweets_count   10001 non-null  int64  
 17  likes_count 

In [7]:
# Get unique values
raw_tweets_df.nunique()

id                 10001
conversation_id     7211
created_at          6510
date                   1
time                6510
timezone               1
user_id             7169
username            7169
name                7082
place                  2
tweet               9912
language              39
mentions             422
urls                2122
photos               944
replies_count         32
retweets_count        54
likes_count          104
hashtags            1171
cashtags              15
link               10001
retweet                1
quote_url            835
video                  2
thumbnail           1096
near                   0
geo                    0
source                 0
user_rt_id             0
user_rt                0
retweet_id             0
reply_to            3467
retweet_date           0
translate              0
trans_src              0
trans_dest             0
dtype: int64

In [8]:
# View all columns
raw_tweets_df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

## Cleaning

In [9]:
# Select relevant columns
col_rename_map = {
    'date': 'date',
    'username': 'username',
    # 'language': 'language'
    'retweets_count': 'retweets',
    'tweet': 'tweet',
    'hashtags': 'hashtags'
}

tweets_df = raw_tweets_df[list(col_rename_map.keys())].rename(columns=col_rename_map)
tweets_df.shape

(10001, 5)

In [None]:
# Filter out non-english tweets
# tweets_df = tweets_df[tweets_df["language"]=="en"]
# tweets_df.shape

In [10]:
# Check duplicate tweets
tweets_df['tweet'].duplicated(keep='first').sum()

89

In [11]:
# Drop duplicate tweets
tweets_df = tweets_df.drop_duplicates(subset='tweet', keep='first')
tweets_df.shape

(9912, 5)

In [12]:
# Initialize Lemmatizer and stopwords
lemma = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [13]:
# Define cleaning functions

def cleanText(tweet):

    tweet = tweet.lower()
    tweet = re.sub('https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*', ' ', tweet)
    tweet = re.sub('\$[a-zA-Z0-9]*', ' ', tweet)
    tweet = re.sub('\@[a-zA-Z0-9]*', ' ', tweet)
    tweet = re.sub('[^a-zA-Z\']', ' ', tweet)
    tweet = ' '.join( [w for w in tweet.split() if len(w)>1] )
    
    lem_stopwords = [lemma.lemmatize(x) for x in nltk.wordpunct_tokenize(tweet) 
                     if x not in stop_words]
    tweet = ' '.join(lem_stopwords)
    
    return [lemma.lemmatize(x, nltk.corpus.reader.wordnet.VERB) for x in nltk.wordpunct_tokenize(tweet) 
             if x not in stop_words]


def cleanHashtags(hashtags):

    if hashtags:
        hashtags = hashtags.lower()
        hashtags = re.sub('\$[a-zA-Z0-9]*', ' ', hashtags)
        hashtags = re.sub('[^a-zA-Z]', ' ', hashtags)
        hashtags=hashtags.strip() 
    return hashtags

In [14]:
# Clean text
tweets_df['clean_tweet'] = tweets_df['tweet'].apply(lambda x: cleanText(x))
tweets_df['cleaned_tweet'] = tweets_df['clean_tweet'].apply(lambda x:' '.join(x))

In [15]:
# Clean hashtags
tweets_df["hashtags"] = tweets_df["hashtags"].astype(str)
tweets_df["hashtags"] = tweets_df["hashtags"].apply(lambda x: cleanHashtags(x))

In [16]:
tweets_df.head()

Unnamed: 0,date,username,retweets,tweet,hashtags,clean_tweet,cleaned_tweet
0,2022-04-23,voidbourn,0,@El_Was_Taken @mariya_GuO @jacksonhinklle This...,,"[take, guo, russia, usa, permanent, seat, unit...",take guo russia usa permanent seat unite natio...
1,2022-04-23,applekappa1337,0,"@fedtanyl Thomas Friedman sucks, but the artic...",,"[thomas, friedman, suck, article, simp, author...",thomas friedman suck article simp authoritaria...
2,2022-04-23,mbw955,0,@pl4ma @TKensingtonian @freedomrideblog Not do...,,"[downplay, nazi, russian, aggression, greater,...",downplay nazi russian aggression greater russi...
3,2022-04-23,shodanette,0,@Rimlee18 @_Chosokaba @gadhi_minosh @KittBarte...,,"[chosokaba, minosh, trade, agreement, equal, h...",chosokaba minosh trade agreement equal hence u...
4,2022-04-23,chilberg11,0,@InnaSovsun Russia won't stop at Transnistria....,,"[russia, ', stop, transnistria, putin, claim, ...",russia ' stop transnistria putin claim moldova...


In [17]:
# Convert date to datetime and extract month/year
tweets_df['date'] = pd.to_datetime(tweets_df['date'])
tweets_df['month'] = tweets_df['date'].dt.month
tweets_df['year'] = tweets_df['date'].dt.year

In [None]:
tweets_df.head()

In [25]:
# Inspect sample of tweets
filter_cond = (tweets_df['year']==2022) & (tweets_df['month']==4)
list(tweets_df['cleaned_tweet'][filter_cond][:10])

['take guo russia usa permanent seat unite nation security council nothing repeat nothing say negate downplay fact sacrifice permanent none people come back nation primarily help',
 'thomas friedman suck article simp authoritarianism basically say despite issue american democracy china russia fail provide alternative due incompetence whether ukraine shanghai covid',
 "downplay nazi russian aggression greater russia denazification also tweet country like u settler country like australia canada ' issue genocide discus usual ignore",
 'chosokaba minosh trade agreement equal hence uk russia conflict agreement gas electric equate imperialism simple modern economics',
 "russia ' stop transnistria putin claim moldova ukraine baltic nation always part greater russia want moldova moldova military strength alliance fend invasion easy grab",
 'geopolitical pilgrimage global leader india continue one return stronger relation trade india understand india mean india position russia well know change 

In [None]:
# Remove all tweets which do not have the words "ukraine" or "russia"
# country_tweets_df = tweets_df.copy()

# filter_words = ['ukraine', 'russia']
# country_tweets_df = country_tweets_df[country_tweets_df["cleaned_tweet"].str.contains('|'.join(filter_words))]
# country_tweets_df.shape

## Embeddings

In [26]:
# Converting the "clean_text" column in the format supported by embeddings.
sent = [row for row in tweets_df["clean_tweet"]]

# Automatically detect common phrases (bigrams) from a list of sentences.
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]

['thomas',
 'friedman',
 'suck',
 'article',
 'simp',
 'authoritarianism',
 'basically_say',
 'despite',
 'issue',
 'american',
 'democracy',
 'china',
 'russia',
 'fail',
 'provide',
 'alternative',
 'due',
 'incompetence',
 'whether',
 'ukraine',
 'shanghai',
 'covid']

In [27]:
# Initialize model

w2v_model = Word2Vec(min_count=4,
                     window=5,
                     vector_size =300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     seed= 42,
                     workers=multiprocessing.cpu_count()-1)


# Build vocab of the word2vec model from the custom data
w2v_model.build_vocab(sentences, progress_per=50000)

In [28]:
# Training the model
w2v_model.train(sentences, 
                total_examples=w2v_model.corpus_count, 
                epochs=30, 
                report_delay=1)

(721155, 4092300)

In [29]:
# Check similar words to war in the corpus
w2v_model.wv.most_similar(positive=["war"])

[('meet', 0.9997609257698059),
 ('advantage', 0.9997460246086121),
 ('cage', 0.9997429847717285),
 ('grow', 0.9997410774230957),
 ('consequence', 0.9997397661209106),
 ('wow', 0.9997391700744629),
 ('murder', 0.9997386932373047),
 ('stoprussianaggression', 0.9997381567955017),
 ('want', 0.9997380375862122),
 ('international', 0.999737560749054)]

In [None]:
# Save the word2vec model
# w2v_model.save("models/word2vec.model")

In [None]:
# Load the word2vec model
# word_vectors = Word2Vec.load("word2vec.model").wv
word_vectors = w2v_model.wv

## Vectorization

In [30]:
# Instantiate TfidfVectorizer 
vectorizer = TfidfVectorizer(min_df=3,
                             sublinear_tf=True,
#                              encoding="latin-1",
                             ngram_range=(1,2),
                             stop_words='english')

In [31]:
# Fit vectorizer
# X_train_tf = vectorizer.fit_transform(X_train.reset_index()["cleaned_text"]).toarray()
# X_test_tf = vectorizer.transform(X_test.reset_index()["cleaned_text"]).toarray()
X_vectors = vectorizer.transform(tweets_df["cleaned_tweet"].reset_index()['cleaned_tweet']).toarray()

NotFittedError: The TF-IDF vectorizer is not fitted

In [None]:
X_vectors.shape

In [None]:
feature_names = vectorizer.get_feature_names_out() 

# Import model

In [67]:
# Load vectorizer
# vectorizer = load('./models/vectorizer/russia_ukraine_vectorizer.joblib')
vectorizer = load('../pipeline_1/models/vectorizer/slava_vectorizer.joblib')

In [82]:
x = pd.read_csv('./data/transformed/russia_ukraine_sentiment.csv')
x.head()

Unnamed: 0.1,Unnamed: 0,date,username,retweets,tweet,hashtags,clean_tweet_words,clean_tweet,day,month,sentiment_val,sentiment
0,0,4/23/22,voidbourn,0.0,@El_Was_Taken @mariya_GuO @jacksonhinklle This...,,"['take', 'guo', 'russia', 'usa', 'permanent', ...",take guo russia usa permanent seat unite natio...,23.0,4.0,1.0,positive
1,1,4/23/22,applekappa1337,0.0,"@fedtanyl Thomas Friedman sucks, but the artic...",,"['thomas', 'friedman', 'suck', 'article', 'sim...",thomas friedman suck article simp authoritaria...,23.0,4.0,1.0,positive
2,2,4/23/22,mbw955,0.0,@pl4ma @TKensingtonian @freedomrideblog Not do...,,"['downplay', 'nazi', 'russian', 'aggression', ...",downplay nazi russian aggression greater russi...,23.0,4.0,1.0,positive
3,3,4/23/22,shodanette,0.0,@Rimlee18 @_Chosokaba @gadhi_minosh @KittBarte...,,"['chosokaba', 'minosh', 'trade', 'agreement', ...",chosokaba minosh trade agreement equal hence u...,23.0,4.0,1.0,positive
4,4,4/23/22,chilberg11,0.0,@InnaSovsun Russia won't stop at Transnistria....,,"['russia', 'stop', 'transnistria', 'putin', 'c...",russia stop transnistria putin claim moldova u...,23.0,4.0,1.0,positive


In [73]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131090 entries, 0 to 131089
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         65554 non-null  object 
 1   date               37733 non-null  object 
 2   username           37733 non-null  object 
 3   retweets           37733 non-null  object 
 4   tweet              37733 non-null  object 
 5   hashtags           1717 non-null   object 
 6   clean_tweet_words  9912 non-null   object 
 7   clean_tweet        9888 non-null   object 
 8   day                9911 non-null   float64
 9   month              9911 non-null   float64
 10  sentiment_val      9911 non-null   float64
 11  sentiment          9911 non-null   object 
dtypes: float64(3), object(9)
memory usage: 12.0+ MB


In [83]:
x[~x['clean_tweet'].notna()]

Unnamed: 0.1,Unnamed: 0,date,username,retweets,tweet,hashtags,clean_tweet_words,clean_tweet,day,month,sentiment_val,sentiment
9870,9982,4/23/22,tompainetoday,0.0,Trending Now:,,,,,,,
9871,#Ukraine | #Russia | #After | #Russian | #Mcca...,,,,,,,,,,,


In [74]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9912 entries, 0 to 10000
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           9912 non-null   datetime64[ns]
 1   username       9912 non-null   object        
 2   retweets       9912 non-null   int64         
 3   tweet          9912 non-null   object        
 4   hashtags       9912 non-null   object        
 5   clean_tweet    9912 non-null   object        
 6   cleaned_tweet  9912 non-null   object        
 7   month          9912 non-null   int64         
 8   year           9912 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 774.4+ KB


In [70]:
len(vectorizer.idf_)

3648

In [46]:
# Transform text using vectorizer
X_vectors = vectorizer.transform(tweets_df["cleaned_tweet"].reset_index()['cleaned_tweet']).toarray()

In [47]:
X_vectors.shape

(9912, 3648)

In [None]:
feature_names = vectorizer.get_feature_names_out() 

In [48]:
# Load linearSVC
linearSVC_model = load('../pipeline_1/models/linear_svc/slava_linearSVC.joblib')

In [58]:
linearSVC_model.n_features_in_

3648

In [54]:
# Load linearSVC
multiNB_model = load('../pipeline_1/models/multi_nb/slava_multinomialNB.joblib')

In [57]:
multiNB_model.n_features_in_

3648

In [71]:
param_idx, param_name = next(((i, x) for i, x in enumerate(['n_features_in_']) if hasattr(multiNB_model, x)), None)
model_param = getattr(multiNB_model, param_name)
if callable(model_param):
    model_shape = model_param()
else:
    model_shape = model_param
param_idx, model_shape

(0, 3648)

In [64]:
getattr(multiNB_model, 'get_params')()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [49]:
# Generate prediction
svc_predict = linearSVC_model.predict(X_vectors)

In [50]:
# connect predictions with outputs
for i in range(10):
	print(tweets_df["cleaned_tweet"].iloc[i], svc_predict[i])

take guo russia usa permanent seat unite nation security council nothing repeat nothing say negate downplay fact sacrifice permanent none people come back nation primarily help -1
thomas friedman suck article simp authoritarianism basically say despite issue american democracy china russia fail provide alternative due incompetence whether ukraine shanghai covid -1
downplay nazi russian aggression greater russia denazification also tweet country like u settler country like australia canada ' issue genocide discus usual ignore -1
chosokaba minosh trade agreement equal hence uk russia conflict agreement gas electric equate imperialism simple modern economics -1
russia ' stop transnistria putin claim moldova ukraine baltic nation always part greater russia want moldova moldova military strength alliance fend invasion easy grab -1
geopolitical pilgrimage global leader india continue one return stronger relation trade india understand india mean india position russia well know change pm bori

In [None]:
# Map sentiment encodings

emotion = { 0: "neutral", 1: "positive", -1: "negative" }

k = np.array(list(emotion.keys()))
v = np.array(list(emotion.values()))

print(k)
print(v)

mapping_arr = np.zeros(k.max()+1, dtype=v.dtype) #k,v from approach #1
print(mapping_arr)
mapping_arr[k] = v

sent_predictions = mapping_arr[svc_predict]

In [None]:
# Build df from predictions

tweet_sentiments = pd.DataFrame(zip(tweets_df['cleaned_tweet'], sent_predictions), columns=['tweet', 'sentiments_val'])

In [None]:
tweet_sentiments.head()

In [None]:
# Plot pie chart of Sentiment Distribution of words
fig = plt.gcf()
fig.set_size_inches(7,7)
colors = ["cyan","pink","yellow"]

pie_df = tweet_sentiments['sentiments_val'].value_counts().reset_index()

plt.pie(pie_df['sentiments_val'],
        labels=pie_df["index"],
        radius=2,
        colors=colors,
        autopct="%1.1f%%")

plt.axis('equal')
plt.title('Sentiment Distribution of Tweets', fontsize=20)
plt.show()
pie_df

In [None]:
# Inspect keyword sentiment
keywords=['russia']
pattern = '|'.join(keywords)
keyword_sent_df = tweet_sentiments[(tweet_sentiments["tweet"].str.contains(pattern))]
sns.countplot(x=keyword_sent_df["sentiments_val"]);
