In [2]:
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (17, 7)
plt.rcParams.update({'font.size': 14})
import seaborn as sns

#improve resolution
#comment this line if erroring on your machine/screen
%config InlineBackend.figure_format ='retina'

import warnings
warnings.filterwarnings('ignore')

#import natural language toolkit
import nltk


# download stopwords & punkt & VADER
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon') 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fajardirham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fajardirham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/fajardirham/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
tweets_df = pd.read_csv('tweets.csv')
#movie_id is one above what it is supposed to be in movies_df
tweets_df['movie_id'] = tweets_df['movie_id'] - 1
tweets_df.head()

Unnamed: 0,tweet,movie,movie_id
0,@SpiderManMovie @HarryHolland99 @IMAX @DolbyCi...,Spider-Man: No Way Home,0
1,@A_C_Mitchell @molly_kraus @MarvelStudios @Spi...,Spider-Man: No Way Home,0
2,“Spider Man trailer and stock prices”… Story |...,Spider-Man: No Way Home,0
3,@Gamer21690 @SpiderManMovie too obsessed,Spider-Man: No Way Home,0
4,Looking forward to Spider-Man tonight. If anyo...,Spider-Man: No Way Home,0


In [4]:
movies_df = pd.read_csv('processed_movies.csv')
movies_df['Released'] = movies_df['Released'].apply(pd.to_datetime) #transforms our string values in column Released into a datetime object
movies_df.columns = ['released', 'movie', 'domestic', 'international', 'worldwide']
movies_df.head()

Unnamed: 0,released,movie,domestic,international,worldwide
0,2021-12-17,Spider-Man: No Way Home,"$804,617,772","$1,083,808,579","$1,888,426,351"
1,2021-09-03,Shang-Chi and the Legend of the Ten R…,"$224,543,292","$207,700,000","$432,243,292"
2,2021-10-01,Venom: Let There be Carnage,"$213,550,366","$288,050,013","$501,600,379"
3,2021-07-09,Black Widow,"$183,651,655","$196,100,000","$379,751,655"
4,2021-06-25,F9: The Fast Saga,"$173,005,945","$548,072,000","$721,077,945"


In [8]:
# Process the tweets first
def clean_text(text):
# Removes all special characters and numericals leaving the alphabets
    to_return = ''
    for word in text.split():
        if not '@' in word and not 'https:' in word:
            to_return += word + " "

    to_return = to_return.replace("#","")
    return to_return[0:len(to_return)-1]

test_clean = clean_text(tweets_df.iloc[0]['tweet'])
tweets_df["tweet_clean"] = tweets_df['tweet'].apply(clean_text)
tweets_df.head()

Unnamed: 0,tweet,movie,movie_id,tweet_clean
0,@SpiderManMovie @HarryHolland99 @IMAX @DolbyCi...,Spider-Man: No Way Home,0,SpiderMan has saved the movie going experience!!!
1,@A_C_Mitchell @molly_kraus @MarvelStudios @Spi...,Spider-Man: No Way Home,0,Yes- so good! Grab extra napkins with your pop...
2,“Spider Man trailer and stock prices”… Story |...,Spider-Man: No Way Home,0,“Spider Man trailer and stock prices”… Story |...
3,@Gamer21690 @SpiderManMovie too obsessed,Spider-Man: No Way Home,0,too obsessed
4,Looking forward to Spider-Man tonight. If anyo...,Spider-Man: No Way Home,0,Looking forward to Spider-Man tonight. If anyo...


In [9]:
# Tokenize
from nltk.tokenize import word_tokenize
tweets_df['tweet_token'] = tweets_df['tweet_clean'].apply(word_tokenize)
tweets_df.head()

Unnamed: 0,tweet,movie,movie_id,tweet_token
0,@SpiderManMovie @HarryHolland99 @IMAX @DolbyCi...,Spider-Man: No Way Home,0,"[@, SpiderManMovie, @, HarryHolland99, @, IMAX..."
1,@A_C_Mitchell @molly_kraus @MarvelStudios @Spi...,Spider-Man: No Way Home,0,"[@, A_C_Mitchell, @, molly_kraus, @, MarvelStu..."
2,“Spider Man trailer and stock prices”… Story |...,Spider-Man: No Way Home,0,"[“, Spider, Man, trailer, and, stock, prices, ..."
3,@Gamer21690 @SpiderManMovie too obsessed,Spider-Man: No Way Home,0,"[@, Gamer21690, @, SpiderManMovie, too, obsessed]"
4,Looking forward to Spider-Man tonight. If anyo...,Spider-Man: No Way Home,0,"[Looking, forward, to, Spider-Man, tonight, .,..."


In [10]:
# Remove stop words

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tweets_df['tweet_stop'] = tweets_df['tweet_token'].apply(lambda x: [item for item in x if item not in stop_words])
tweets_df.head()

Unnamed: 0,tweet,movie,movie_id,tweet_token,tweet_stop
0,@SpiderManMovie @HarryHolland99 @IMAX @DolbyCi...,Spider-Man: No Way Home,0,"[@, SpiderManMovie, @, HarryHolland99, @, IMAX...","[@, SpiderManMovie, @, HarryHolland99, @, IMAX..."
1,@A_C_Mitchell @molly_kraus @MarvelStudios @Spi...,Spider-Man: No Way Home,0,"[@, A_C_Mitchell, @, molly_kraus, @, MarvelStu...","[@, A_C_Mitchell, @, molly_kraus, @, MarvelStu..."
2,“Spider Man trailer and stock prices”… Story |...,Spider-Man: No Way Home,0,"[“, Spider, Man, trailer, and, stock, prices, ...","[“, Spider, Man, trailer, stock, prices, ”, …,..."
3,@Gamer21690 @SpiderManMovie too obsessed,Spider-Man: No Way Home,0,"[@, Gamer21690, @, SpiderManMovie, too, obsessed]","[@, Gamer21690, @, SpiderManMovie, obsessed]"
4,Looking forward to Spider-Man tonight. If anyo...,Spider-Man: No Way Home,0,"[Looking, forward, to, Spider-Man, tonight, .,...","[Looking, forward, Spider-Man, tonight, ., If,..."


In [None]:
# Stemming
from nltk.stem import PorterStemmer

ps = PorterStemmer()

tweets_df['tweet_stem'] = tweets_df['tweet_stop'].apply(lambda x: [ps.stem(y) for y in x])
tweets_df.head()

In [None]:
#DO OTHER STUFF TO SEE FREQ DISTRIBUTION

In [12]:
# VADER sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
analyser = SentimentIntensityAnalyzer()
test_tweet = tweets_df.iloc[0]['tweet_clean']
print(test_tweet)
print(analyser.polarity_scores(test_tweet))

SpiderMan has saved the movie going experience!!!
{'neg': 0.0, 'neu': 0.62, 'pos': 0.38, 'compound': 0.5684}


In [13]:
sentiment_df = pd.DataFrame()
sentiment_df['tweet_clean'] = tweets_df['tweet_clean']
sentiment_df['sentiment'] = tweets_df['tweet_clean'].apply(analyser.polarity_scores)
sentiment_df.head()

Unnamed: 0,tweet_clean,sentiment
0,SpiderMan has saved the movie going experience!!!,"{'neg': 0.0, 'neu': 0.62, 'pos': 0.38, 'compou..."
1,Yes- so good! Grab extra napkins with your pop...,"{'neg': 0.0, 'neu': 0.66, 'pos': 0.34, 'compou..."
2,“Spider Man trailer and stock prices”… Story |...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,too obsessed,"{'neg': 0.63, 'neu': 0.37, 'pos': 0.0, 'compou..."
4,Looking forward to Spider-Man tonight. If anyo...,"{'neg': 0.25, 'neu': 0.75, 'pos': 0.0, 'compou..."


In [17]:
def spread_sentiment(sentiment_obj, category):
    return sentiment_obj[category]
sentiment_df['compound'] = sentiment_df['sentiment'].apply(lambda x: spread_sentiment(x, 'compound'))
sentiment_df['neg'] = sentiment_df['sentiment'].apply(lambda x: spread_sentiment(x, 'neg'))
sentiment_df['neu'] = sentiment_df['sentiment'].apply(lambda x: spread_sentiment(x, 'neu'))
sentiment_df['pos'] = sentiment_df['sentiment'].apply(lambda x: spread_sentiment(x, 'pos'))
sentiment_df.drop(columns=['sentiment'], inplace=True)
sentiment_df.head()

Unnamed: 0,tweet_clean,compound,neg,neu,pos
0,SpiderMan has saved the movie going experience!!!,0.5684,0.0,0.62,0.38
1,Yes- so good! Grab extra napkins with your pop...,0.7339,0.0,0.66,0.34
2,“Spider Man trailer and stock prices”… Story |...,0.0,0.0,1.0,0.0
3,too obsessed,-0.1779,0.63,0.37,0.0
4,Looking forward to Spider-Man tonight. If anyo...,-0.6875,0.25,0.75,0.0


In [19]:
# assign labels
def compound_to_label(compound_score):
    if(compound_score >= 0.05): return 'pos'
    if(compound_score <= -0.05): return 'neg'
    return 'neu'

sentiment_df['label'] = sentiment_df['compound'].apply(compound_to_label)
sentiment_df.head()

Unnamed: 0,tweet_clean,compound,neg,neu,pos,label
0,SpiderMan has saved the movie going experience!!!,0.5684,0.0,0.62,0.38,pos
1,Yes- so good! Grab extra napkins with your pop...,0.7339,0.0,0.66,0.34,pos
2,“Spider Man trailer and stock prices”… Story |...,0.0,0.0,1.0,0.0,neu
3,too obsessed,-0.1779,0.63,0.37,0.0,neg
4,Looking forward to Spider-Man tonight. If anyo...,-0.6875,0.25,0.75,0.0,neg


In [20]:
# Drop neutral ones
print('before', sentiment_df.shape)
sentiment_df.drop(sentiment_df[sentiment_df.label == 'neu'].index, inplace=True)
print('after', sentiment_df.shape)

before (801929, 6)
after (650046, 6)
