In [1]:
#Import modules
import pandas as pd
import datetime as dt
from twitterscraper import query_tweets
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from langdetect import detect
import matplotlib.pyplot as plt
import seaborn as sns

INFO: {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}


In [2]:
#Safe wrapper around langdetect's detect(): returns a language code, or None when detection is not possible
def detector(x):
    """Return the language code detected for ``x``, or ``None`` on failure.

    Parameters
    ----------
    x : object
        Text to classify. Anything that is not a non-blank ``str`` is
        treated as undetectable.

    Returns
    -------
    str or None
        Whatever ``detect(x)`` returns, or ``None`` when ``x`` is not a
        non-blank string or ``detect`` raises.
    """
    # Non-string (e.g. NaN) or blank input cannot be classified; skip the call.
    if not isinstance(x, str) or not x.strip():
        return None
    try:
        return detect(x)
    except Exception:
        # Best-effort by design: a tweet that cannot be classified is simply
        # dropped by the later `lang == 'en'` filter. Catch Exception rather
        # than using a bare except so Ctrl-C / SystemExit still propagate.
        return None

In [3]:
#Analyzer object: a single SentimentIntensityAnalyzer instance, reused by the sentiment-scoring cell
analyzer = SentimentIntensityAnalyzer()

In [4]:
#Query window before the premiere (passed as begindate / enddate to the first scrape)
begin_date = dt.date(year=2019, month=3, day=3)
end_date = dt.date(year=2019, month=3, day=7)

In [5]:
#Query window for the premiere period; it starts on the day the earlier window ends
begin_date_premier = dt.date(year=2019, month=3, day=7)
end_date_premier = dt.date(year=2019, month=3, day=9)

In [None]:
#Query tweets: one scrape per date window, same hashtag and same limit for both
tweets_before = query_tweets(
    "#CaptainMarvel",
    limit=100000,
    begindate=begin_date,
    enddate=end_date,
)
tweets_after = query_tweets(
    "#CaptainMarvel",
    limit=100000,
    begindate=begin_date_premier,
    enddate=end_date_premier,
)

In [None]:
#Build one DataFrame per scrape; each tweet object's attribute dict becomes one row
df_before = pd.DataFrame([vars(tweet) for tweet in tweets_before])
df_after = pd.DataFrame([vars(tweet) for tweet in tweets_after])

In [None]:
#Tag each pre-premiere tweet with its detected language, then keep only the English rows
df_before['lang'] = df_before['text'].apply(detector)
df_before = df_before.loc[df_before['lang'].eq('en')]

In [None]:
#(rows, columns) of the pre-premiere frame at this point in the notebook
df_before.shape

In [None]:
#Tag each post-premiere tweet with its detected language, then keep only the English rows
df_after['lang'] = df_after['text'].apply(detector)
df_after = df_after.loc[df_after['lang'].eq('en')]

In [None]:
#(rows, columns) of the post-premiere frame at this point in the notebook
df_after.shape

In [None]:
#Sentiment analysis: run the analyzer's polarity_scores over every tweet text
sentiment_before = df_before['text'].apply(analyzer.polarity_scores)
sentiment_after = df_after['text'].apply(analyzer.polarity_scores)

In [None]:
#Put sentiment in dataframe
# Each polarity_scores result is expanded into its own columns and appended to
# the tweet frame; later cells rely on the resulting `compound` column.
# `axis` is passed by keyword: passing it positionally is deprecated in
# pandas 1.x and raises TypeError in pandas 2.x, where every concat() argument
# except `objs` is keyword-only.
# The score frame is built once from the list of results (index preserved so
# rows stay aligned) instead of creating one pd.Series per row via
# .apply(pd.Series), which is very slow on large scrapes.
df_before = pd.concat(
    [df_before, pd.DataFrame(sentiment_before.tolist(), index=sentiment_before.index)],
    axis=1,
)
df_after = pd.concat(
    [df_after, pd.DataFrame(sentiment_after.tolist(), index=sentiment_after.index)],
    axis=1,
)

In [None]:
#Keep only the first occurrence of each distinct tweet text
df_before = df_before.drop_duplicates(subset='text')
df_after = df_after.drop_duplicates(subset='text')

In [None]:
#Keep only post-premiere tweets time-stamped strictly after 2019-03-08 00:00:00
df_after = df_after.loc[df_after['timestamp'].gt(dt.datetime(year=2019, month=3, day=8))]

In [None]:
#Distribution of compound scores for the pre-premiere tweets
df_before['compound'].hist()

In [None]:
#From the histogram, we get:
#the majority of tweets have a sentiment score of 0, i.e. they are neutral

In [None]:
#Mean compound score of the pre-premiere tweets (displayed as the cell output)
mean_compound_score_before_prem = df_before['compound'].mean()
mean_compound_score_before_prem

In [None]:
#The mean compound sentiment score before the premiere is fairly positive at 0.24

In [None]:
#Median compound score of the pre-premiere tweets
df_before['compound'].median()

In [None]:
#Mean compound score of the post-premiere tweets (displayed as the cell output)
mean_compound_score_after_prem = df_after['compound'].mean()
mean_compound_score_after_prem

In [None]:
#Median compound score of the post-premiere tweets
df_after['compound'].median()

The compound score is a metric computed by summing the lexicon ratings of the words in a text and normalizing the result to lie between -1 (most extreme negative) and +1 (most extreme positive).

We see that the mean compound score increased from 0.24 before the premiere to 0.33 after it, implying an increase in positive sentiment.

In [None]:
#Relative change of the mean compound score (after vs. before), as a percentage rounded to 2 decimals
print(
    "Increase in compound score before and after premier is",
    round(
        (mean_compound_score_after_prem - mean_compound_score_before_prem)
        / mean_compound_score_before_prem
        * 100,
        2,
    ),
    "%",
)

In [None]:
#Ratio of positive-scored (compound > 0) to negative-scored (compound < 0) tweets, before and after the premiere
before_ratio = len(df_before.loc[df_before['compound'].gt(0)]) / len(df_before.loc[df_before['compound'].lt(0)])
after_ratio = len(df_after.loc[df_after['compound'].gt(0)]) / len(df_after.loc[df_after['compound'].lt(0)])

In [None]:
#Display the positive-to-negative tweet ratio computed from df_before
before_ratio

In [None]:
#Before the premiere, the ratio of positive to negative tweets is 3.44

In [None]:
#Display the positive-to-negative tweet ratio computed from df_after
after_ratio

In [None]:
#After the premiere, the ratio of positive to negative tweets is 4.47, i.e. higher than before the premiere (3.44)

In [None]:
#Drop neutral tweets (compound == 0) so only positive or negative ones are analyzed
df_before_nz = df_before.loc[df_before['compound'].ne(0)]
df_after_nz = df_after.loc[df_after['compound'].ne(0)]

In [None]:
#Histogram of a random sample (at most 5000) of the non-neutral pre-premiere scores.
# The sample size is capped at the number of available rows because
# Series.sample raises ValueError when asked for more rows than exist
# (without replacement); random_state is fixed so the plot is reproducible.
df_before_nz['compound'].sample(min(5000, len(df_before_nz)), random_state=42).hist()

In [None]:
#Histogram of a random sample (at most 5000) of the non-neutral post-premiere scores.
# The sample size is capped at the number of available rows because
# Series.sample raises ValueError when asked for more rows than exist
# (without replacement); random_state is fixed so the plot is reproducible.
df_after_nz['compound'].sample(min(5000, len(df_after_nz)), random_state=42).hist()

Plotting the compound sentiment score before and after the premiere for comparison

In [None]:
#KDE curves of the non-neutral compound scores: dashed line = before the premiere, solid line = after.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal.
# With hist=False it only drew the KDE curve, so sns.kdeplot is the direct
# replacement; the `bins=15` argument had no effect without a histogram and is
# dropped. Line styling is passed straight through as matplotlib keywords.
ax1 = sns.kdeplot(df_before_nz['compound'], label='Captain Marvel before premier', color='blue', linestyle='--')
ax2 = sns.kdeplot(df_after_nz['compound'], label='Captain Marvel after premier', color='blue')
# Explicit axis labels so the figure can be read on its own.
plt.xlabel('Compound sentiment score')
plt.ylabel('Density')
plt.legend()
plt.show()