## SENTIMENT & EXPLORATORY DATA ANALYSIS OF TWEETS ON COVID-19 VACCINES

## INSY 5377- 001 : Web And Social Analytics

## Group-5

### The code below was inspired by the following sources:
- https://www.kaggle.com/datasets/gpreda/all-covid19-vaccines-tweets/code?resource=download (Dataset)
- https://www.kaggle.com/code/hassanhshah/covid-vaccine-sentiment-and-time-series-analysis (Time Series Analysis VADER)
- https://www.kaggle.com/code/yutotsubaki/sentiment-analysis-with-textblob-analyze-in-time (Textblob)

In [None]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import matplotlib.pyplot as plt
plt.rc('figure',figsize=(17,13))
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
from plotly.subplots import make_subplots
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud,STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import datetime
import warnings
warnings.filterwarnings("ignore")
print("Library Setup Complete.")

In [None]:
# Reading Data
# NOTE(review): absolute, machine-specific Windows path — adjust it before
# running this notebook on another machine.
vaccine_filepath = r'C:\Users\dundu\OneDrive\Desktop\Web_Social_Project\MAIN_MOHIT.csv'
# Load the tweets CSV into a DataFrame.
vaccine_data = pd.read_csv(vaccine_filepath)
print("Read Complete.")

In [None]:
# Show the DataFrame shape together with a readable tweet count.
tweet_total = vaccine_data.shape[0]
display(vaccine_data.shape, f"{tweet_total} tweets in dataset")

In [None]:
# Dropping duplicate tweets: rows are compared on the 'text' column only.
# The surviving rows keep their original index labels (no reset here).
vaccine_data = vaccine_data.drop_duplicates(subset='text')
vaccine_data.shape

In [None]:
# Examining Data: preview the first rows of the DataFrame
vaccine_data.head()

In [None]:
# Examining Data: preview the last rows of the DataFrame
vaccine_data.tail()

In [None]:
# Examining statistics: summary statistics as produced by DataFrame.describe()
vaccine_data.describe()

In [None]:
# Determining data types: the dtype pandas assigned to each column
vaccine_data.dtypes

In [None]:
# Looking for unfilled values: number of missing (null) entries per column
vaccine_data.isnull().sum()

# TEXT PREPROCESSING

In [None]:
# Lowercase: normalise every tweet to lower case
lowered_text = vaccine_data['text'].str.lower()
vaccine_data['text'] = lowered_text
vaccine_data['text']

In [None]:
# URL Removal: delete everything from "http" up to the next whitespace
url_pattern = re.compile(r"http\S+")
vaccine_data['text'] = vaccine_data['text'].apply(lambda tweet: url_pattern.sub("", tweet))
vaccine_data["text"]

In [None]:
# Punctuation Removal
punctuation_removal = string.punctuation


def remove_punctuation(text):
    """Return *text* with every character listed in ``punctuation_removal`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(punctuation_removal))
    return text.translate(deletion_table)
# Strip punctuation from every tweet, then show the column.
cleaned_text = vaccine_data["text"].apply(remove_punctuation)
vaccine_data["text"] = cleaned_text
vaccine_data["text"]

In [None]:
# Single character and double space removal
# Drop stand-alone single letters. The letter is removed together with the
# whitespace in front of it; the whitespace behind it is only looked at, not
# consumed, so the neighbouring words stay separated ("is a test" -> "is test")
# and consecutive single letters are all caught.
vaccine_data["text"] = vaccine_data["text"].apply(lambda x: re.sub(r'\s+[a-zA-Z](?=\s)', '', x))
# Collapse every run of whitespace into a single space.
vaccine_data["text"] = vaccine_data["text"].apply(lambda x: re.sub(r'\s+', ' ', x))
vaccine_data["text"]
# Snapshot of the partially cleaned data (relative path: written to the working directory).
vaccine_data.to_csv("1.Before_clean.csv")
# NOTE(review): the snapshot is written with a relative path but read back from
# an absolute, machine-specific one — these are the same file only when the
# working directory is that project folder. Confirm.
vaccine_filepath1 = r'C:\Users\dundu\OneDrive\Desktop\Web_Social_Project\1.Before_clean.csv'
vaccine_data1 = pd.read_csv(vaccine_filepath1)

In [None]:
# Stopword Removal
# English stop-word list from NLTK, held as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return *text* without the whitespace-separated words found in ``STOPWORDS``."""
    kept_words = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_words)


vaccine_data["text"] = vaccine_data["text"].apply(remove_stopwords)
vaccine_data["text"]

In [None]:
# Emoji Removal
# Built once here instead of on every call of remove_emoji.
# NOTE(review): several ranges are far wider than "emoji" — in particular
# U+24C2..U+1F251 also covers CJK and most other characters above U+24C2, so
# text in those scripts is stripped as well.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # very wide catch-all range (see note above)
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"  # every character outside the Basic Multilingual Plane
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # zero width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return *string* with every run of characters matched by the emoji pattern removed.

    Characters outside the listed ranges (for example plain ASCII text) are
    returned unchanged.
    """
    return _EMOJI_PATTERN.sub(r'', string)
# Coerce every entry to str first, then strip emoji from each tweet.
text_as_str = vaccine_data["text"].apply(str)
vaccine_data["text"] = text_as_str
vaccine_data["text"] = vaccine_data["text"].apply(lambda tweet: remove_emoji(tweet))
vaccine_data["text"]

In [None]:
# Single character and double space removal
# Drop stand-alone single letters. The letter is removed together with the
# whitespace in front of it; the whitespace behind it is only looked at, not
# consumed, so the neighbouring words stay separated ("is a test" -> "is test")
# and consecutive single letters are all caught.
vaccine_data["text"] = vaccine_data["text"].apply(lambda x: re.sub(r'\s+[a-zA-Z](?=\s)', '', x))
# Collapse every run of whitespace into a single space.
vaccine_data["text"] = vaccine_data["text"].apply(lambda x: re.sub(r'\s+', ' ', x))
vaccine_data["text"]

In [None]:
# Most common words
from collections import Counter

# Tally every whitespace-separated token across all tweets.
cnt = Counter(
    token
    for tweet in vaccine_data["text"].values
    for token in tweet.split()
)

cnt.most_common(10)

# Sentiment Analysis

In [None]:
# Sentiment analysis
sid = SIA()
# Polarity scores per tweet, computed on the lower-cased word characters of the
# text; each entry is a dict read below through its 'pos', 'neu', 'neg' and
# 'compound' keys.
vaccine_data['sentiments'] = vaccine_data["text"].apply(lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+',str(x).lower()))))
vaccine_data['Positive Sentiment'] = vaccine_data['sentiments'].apply(lambda x: x['pos'])
vaccine_data['Neutral Sentiment'] = vaccine_data['sentiments'].apply(lambda x: x['neu'])
vaccine_data['Negative Sentiment'] = vaccine_data['sentiments'].apply(lambda x: x['neg'])

vaccine_data['Compound'] = vaccine_data['sentiments'].apply(lambda x: x['compound'])

# Label each tweet from its compound score: >= 0.05 is Positive, <= -0.05 is
# Negative, anything in between is Neutral.
scores = vaccine_data['Compound'].tolist()
sentiment = [
    'Positive' if score >= 0.05 else 'Negative' if score <= -0.05 else 'Neutral'
    for score in scores
]
# Assign the plain list so the labels are placed by position. A pd.Series built
# from the list would carry a fresh 0..n-1 index and be aligned on index labels,
# which do not match vaccine_data's index if rows were dropped earlier — labels
# would land on the wrong tweets or become NaN.
vaccine_data['sentiment'] = sentiment

vaccine_data.head()

In [None]:
# NOTE(review): plain assignment, not a copy — from here on vaccine_data1 and
# vaccine_data are the same DataFrame object, so every column added through
# vaccine_data1 also appears on vaccine_data. Confirm this is intended.
vaccine_data1 = vaccine_data

from textblob import TextBlob


def getSubjectivity(text):
    """Return the TextBlob subjectivity score of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity


def getPolarity(text):
    """Return the TextBlob polarity score of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# One new column per TextBlob score, computed tweet by tweet.
for column_name, scorer in (
    ('TextBlob_Subjectivity', getSubjectivity),
    ('TextBlob_Polarity', getPolarity),
):
    vaccine_data1[column_name] = vaccine_data1['text'].apply(scorer)
def getAnalysis(score):
    """Map a polarity score to a label.

    Scores below zero give 'Negative', exactly zero gives 'Neutral', and
    everything else gives 'Positive'.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'
# Label every tweet from its TextBlob polarity, save the frame, then preview it.
textblob_labels = vaccine_data1['TextBlob_Polarity'].map(getAnalysis)
vaccine_data1['TextBlob_Analysis'] = textblob_labels
vaccine_data1.to_csv("TEXTBLOB.CSV")
vaccine_data1.head()

In [None]:
# Sentiment analysis
sid = SIA()
# Polarity scores per tweet, computed on the lower-cased word characters of the text.
vaccine_data1['sentiments'] = vaccine_data1["text"].apply(
    lambda tweet: sid.polarity_scores(' '.join(re.findall(r'\w+', str(tweet).lower())))
)
# Spread three of the score-dict entries into their own columns.
for column_name, score_key in (
    ('Positive Sentiment', 'pos'),
    ('Neutral Sentiment', 'neu'),
    ('Negative Sentiment', 'neg'),
):
    vaccine_data1[column_name] = vaccine_data1['sentiments'].apply(
        lambda score_dict, key=score_key: score_dict[key]
    )
vaccine_data1.head()
vaccine_data1.to_csv("1.Unclean_test_sent.csv")

In [None]:
# Write the current state of vaccine_data to CSV (relative path, so the file
# lands in the working directory).
vaccine_data.to_csv("1.After_Sent_Analysis.csv")

# Feature Engineering

In [None]:
# Number of Words
# split() without an argument splits on any run of whitespace and ignores
# leading/trailing whitespace, so empty strings are never counted as words
# (split(' ') yields '' for every leading, trailing or doubled space).
vaccine_data['Number_Of_Words'] = vaccine_data['text'].apply(lambda x: len(x.split()))


def _mean_word_length(text):
    """Return the mean length of the words in *text*, rounded to 2 decimals (0.0 if there are none)."""
    lengths = [len(w) for w in text.split()]
    # Guard the empty case: np.mean([]) would give nan.
    return np.round(np.mean(lengths), 2) if lengths else 0.0


# Average Word Length
vaccine_data['Mean_Word_Length'] = vaccine_data['text'].apply(_mean_word_length)
vaccine_data.head()


In [None]:
# Tokenization and lemmatization
def tokenization(text):
    """Split *text* into a list of word tokens.

    The text is split on runs of non-word characters. The empty strings that
    ``re.split`` emits when the text starts or ends with a separator are
    dropped, so they cannot show up as "words" in later counts.
    """
    return [token for token in re.split(r'\W+', text) if token]
# Tokenize the lower-cased text of every tweet.
vaccine_data['tokenized'] = vaccine_data['text'].apply(lambda tweet: tokenization(tweet.lower()))
wn = nltk.WordNetLemmatizer()


def lemmatizer(text):
    """Return a list holding ``wn.lemmatize(token)`` for every token in *text*."""
    return list(map(wn.lemmatize, text))


vaccine_data['lemmatized'] = vaccine_data['tokenized'].apply(lemmatizer)
vaccine_data.head()
vaccine_data.tail()
vaccine_data.to_csv("1.Token_Lemm.csv")

# Overall Analysis

In [None]:
# Word cloud built from the text of all tweets joined into one string
tweet_All = " ".join(vaccine_data["text"])

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))

# Configure the cloud first, then generate it from the combined text.
cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color="white")
wordcloud_ALL = cloud_builder.generate(tweet_All)

# Draw the generated image without axis decorations.
ax.imshow(wordcloud_ALL, interpolation='bilinear')
ax.axis('off')

In [None]:
# Sentiment Distribution
# Upper panel: density estimate of each per-tweet score.
# Lower panel: average of each score over all tweets.
sentiment_columns = ['Negative Sentiment', 'Positive Sentiment', 'Neutral Sentiment']
sentiment_colors = ['red', 'green', 'blue']

plt.subplot(2,1,1)
plt.title('Distribution Of Sentiments Across Tweets',fontsize=19,fontweight='bold')
for column, color in zip(sentiment_columns, sentiment_colors):
    sns.kdeplot(vaccine_data[column], color=color)
plt.xlabel(' ')
plt.legend(sentiment_columns)

plt.subplot(2,1,2)
plt.title('Average Sentiments Across Tweets',fontsize=19,fontweight='bold')
tweet_count = len(vaccine_data.index)
neg_total_avg = vaccine_data['Negative Sentiment'].sum() / tweet_count
print(neg_total_avg)
pos_total_avg = vaccine_data['Positive Sentiment'].sum() / tweet_count
print(pos_total_avg)
neu_total_avg = vaccine_data['Neutral Sentiment'].sum() / tweet_count
print(neu_total_avg)
sentiment_type = ['Negative','Positive','Neutral']
sentiment_total_avg = [neg_total_avg, pos_total_avg, neu_total_avg]
plt.bar(sentiment_type, sentiment_total_avg, color=sentiment_colors)
plt.ylabel('Average Sentiment Per Tweet',fontsize=19)
plt.xlabel('Sentiment Type',fontsize=19)
plt.show()

In [None]:
# Sentiment Distribution: average of each score column over all tweets
tweet_count = len(vaccine_data.index)

neg_total_avg = vaccine_data['Negative Sentiment'].sum() / tweet_count
print("Average Density of Negative Tweets:", neg_total_avg)

pos_total_avg = vaccine_data['Positive Sentiment'].sum() / tweet_count
print("Average Density of Positive Tweets: ",pos_total_avg)

neu_total_avg = vaccine_data['Neutral Sentiment'].sum() / tweet_count
print("Average Density of Neutral Tweets: ",neu_total_avg)

In [None]:
# Stop words are treated here as words that do not change the polarity of a tweet.
# Show how many English stop words NLTK lists, plus the first ten of them.
stop_words = stopwords.words('english')
(len(stop_words), stop_words[:10])

# Time Series Analysis

In [None]:
# Daily tweet volume over time
ft_data = vaccine_data.copy()
ft_data['date'] = pd.to_datetime(ft_data['date']).dt.date
ft_data['year'] = pd.DatetimeIndex(ft_data['date']).year
# count() gives, per day, the number of non-null values in every column; the
# count of the 'id' column is used as the number of tweets for that day.
b_date_count = ft_data.groupby(by='date').count().reset_index()
b_date_count = b_date_count.rename(columns={'id': 'Tweets Per Day'})
fig = ex.line(b_date_count, x='date', y='Tweets Per Day')

# Horizontal reference line at the mean of the plotted series. It is read from
# 'Tweets Per Day' itself rather than from the count of an unrelated column,
# which only matches when that column has no missing values.
mean_tweets_per_day = b_date_count['Tweets Per Day'].mean()
fig.add_shape(
    type="line",
    x0=b_date_count['date'].values[0], y0=mean_tweets_per_day,
    x1=b_date_count['date'].values[-1], y1=mean_tweets_per_day,
    line=dict(
        color="Red",
        width=2,
        dash="dashdot",
    ),
    name='Mean',
)

fig.update_traces(mode="markers+lines")
fig.update_layout(hovermode="x unified")

# Annotations: index the daily counts by timestamp so the height of each
# annotated day can be looked up.
b_date_count.date = pd.to_datetime(b_date_count.date)
b_date_count_dt = b_date_count.set_index('date')

# (day, label, extra placement options) for each annotated event.
annotated_events = (
    (datetime.datetime(2021, 3, 1), r"J&J authorization", {}),
    (datetime.datetime(2021, 4, 21),
     r"Fear of supply outstripping demand & CDC discussion of J&J bloodclots",
     {'yshift': 5}),
    (datetime.datetime(2021, 6, 29),
     r"Discussion of vaccine protection against delta variant",
     {'yshift': 5, 'ay': -30}),
)
for event_day, label, placement in annotated_events:
    event_ts = pd.Timestamp(event_day)
    # Skip events on days without any tweets in the data instead of failing
    # with a KeyError on the lookup.
    if event_ts not in b_date_count_dt.index:
        continue
    fig.add_annotation(
        x=event_day,
        y=b_date_count_dt.loc[event_ts, 'Tweets Per Day'],
        text=label,
        showarrow=True,
        arrowhead=3,
        bordercolor="#c7c7c7",
        **placement,
    )

fig.update_layout(title='<b>Daily Tweets</b>',width=1000)
fig.show()

# Positive Negative Neutral Analysis

In [None]:
# Assigning sentiment: one subset per score column, keeping the tweets whose
# score in that column is at least 0.5. Each subset is built independently.
strongly_positive = vaccine_data['Positive Sentiment'] >= 0.5
strongly_negative = vaccine_data['Negative Sentiment'] >= 0.5
strongly_neutral = vaccine_data['Neutral Sentiment'] >= 0.5

Positive_tweet = vaccine_data.loc[strongly_positive].reset_index()
Negative_tweet = vaccine_data.loc[strongly_negative].reset_index()
Neutral_tweet = vaccine_data.loc[strongly_neutral].reset_index()

In [None]:
# 50 most common positive words

# Flatten the per-tweet token lists into one long list of words.
positive_tokens = [
    token
    for token_list in Positive_tweet['lemmatized']
    for token in token_list
]
all_positive_words = pd.Series(np.array(positive_tokens))
top_positive_counts = all_positive_words.value_counts()[:50]
common_words = top_positive_counts.rename_axis('Common Positive Words').reset_index(name='count')
fig = ex.treemap(
    common_words,
    path=['Common Positive Words'],
    values='count',
    title='50 Most Common Words In Positive Tweets',
)
fig.show()


In [None]:
# 50 most common negative words

# Flatten the per-tweet token lists into one long list of words.
negative_tokens = [
    token
    for token_list in Negative_tweet['lemmatized']
    for token in token_list
]
all_negative_words = pd.Series(np.array(negative_tokens))
top_negative_counts = all_negative_words.value_counts()[:50]
common_words = top_negative_counts.rename_axis('Common Negative Words').reset_index(name='count')
fig = ex.treemap(
    common_words,
    path=['Common Negative Words'],
    values='count',
    title='50 Most Common Words In Negative Tweets',
)
fig.show()

In [None]:
# 50 most common neutral words

# Flatten the per-tweet token lists into one long list of words.
neutral_tokens = [
    token
    for token_list in Neutral_tweet['lemmatized']
    for token in token_list
]
all_neutral_words = pd.Series(np.array(neutral_tokens))
top_neutral_counts = all_neutral_words.value_counts()[:50]
common_words = top_neutral_counts.rename_axis('Common Neutral Words').reset_index(name='count')
fig = ex.treemap(
    common_words,
    path=['Common Neutral Words'],
    values='count',
    title='50 Most Common Words In Neutral Tweets',
)
fig.show()
vaccine_data.head()

# Country Sentiment Analysis

In [None]:
# Load the vaccination tweets CSV into its own DataFrame.
# NOTE(review): absolute, machine-specific Windows path — adjust it before
# running this notebook on another machine.
vax_data = r'C:\Users\dundu\OneDrive\Desktop\Web_Social_Project\Main_Data\vaccination_all_tweets.csv'
df_vax = pd.read_csv(vax_data)
print("Read Complete.")

In [None]:
# Working copy of df_vax without the user-profile and tweet-metadata columns.
dropped_columns = [
    'user_name', 'user_description', 'user_created', 'user_followers',
    'user_friends', 'user_favourites', 'source', 'is_retweet',
]
vax = df_vax.drop(columns=dropped_columns)

In [None]:
# Preview the first rows of the reduced DataFrame.
vax.head()

In [None]:
# Vaccine names, and a one-column table listing them.
# NOTE(review): in the visible part of this notebook none of the names set in
# this cell are read again — confirm whether the cell is still needed.
all_vax = [
    'covaxin', 'sinopharm', 'sinovac', 'moderna', 'pfizer',
    'biontech', 'oxford', 'astrazeneca', 'sputnik',
]
vaccine_filepath = r'C:\Users\dundu\OneDrive\Desktop\Web_Social_Project\Main_Data\vaccination_all_tweets.csv'
vax_sentiment = pd.DataFrame({'Vaccine': all_vax})
sentiments = []

In [None]:
# Rebuild vax from vaccine_data (replacing any earlier value of vax), without
# the user-profile and tweet-metadata columns. drop() returns a new DataFrame,
# so vaccine_data keeps those columns.
dropped_columns = [
    'user_name', 'user_description', 'user_created', 'user_followers',
    'user_friends', 'user_favourites', 'source', 'is_retweet',
]
vax = vaccine_data.drop(columns=dropped_columns)

In [None]:
# Mean sentiment score of the tweets whose user location mentions each country.
country_sentiment=pd.DataFrame()
countries=['india','usa','canada','germany','spain','pakistan','uk','brazil','russia','italy','australia','france','argentina','uae','israel','mexico','japan']
country_sentiment['countries']=countries

# NOTE(review): no 'Sentiment' column is created anywhere in the visible
# notebook; it is used when the data provides one, otherwise the numeric
# 'Compound' score is averaged instead. Confirm which score is intended.
score_column = 'Sentiment' if 'Sentiment' in vax.columns else 'Compound'
locations = vax['user_location'].str.lower()

senti=list()
for country in countries :
    # Whole-word match so that e.g. 'uk' does not hit 'ukraine' and 'india'
    # does not hit 'indiana'.
    country_pattern = rf'\b{re.escape(country)}\b'
    # na=False: tweets without a location count as "no match"; a mask that
    # contains NaN cannot be used for boolean indexing.
    in_country = locations.str.contains(country_pattern, regex=True, na=False)
    senti.append(vax.loc[in_country, score_column].mean())

country_sentiment['Sentiment']=senti

#  News

In [None]:
# Store the per-tweet labels from the `sentiment` list in 'word_sentiment'.
# A NumPy array is assigned by position. A pd.Series would carry its own
# 0..n-1 index and be aligned on index labels, which do not match
# vaccine_data's index if rows were dropped earlier.
vaccine_data['word_sentiment'] = np.array(sentiment)
vaccine_data.head()

In [None]:
# One horizontal bar chart per sentiment label: the ten accounts with the most
# followers among the users who posted a tweet carrying that label.
fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize=(10, 16))

# (axis, sentiment label, bar colour) for each of the three panels.
panels = ((ax1, "Positive", "b"), (ax2, "Neutral", "g"), (ax3, "Negative", "r"))
for axis, label, bar_color in panels:
    top_accounts = (
        vaccine_data[vaccine_data.sentiment == label]
        .drop_duplicates(subset=["user_name"])  # one row per account
        .sort_values(by=["user_followers"], ascending=False)[["user_name", "user_followers"]][:10]
    )
    # The DataFrame must be passed through barplot's `data` keyword.
    sns.barplot(x="user_followers", y="user_name", orient="h", ax=axis, palette=[bar_color],
                data=top_accounts)
    axis.set_title(f'Top 10 Accounts with Highest Followers who tweet {label}')

plt.show()