## Movie Mania
#### Team - Nelson Dsouza, Justin Petelka, Shrija Priyanil

### Part 1 - Get Movie Tweets

In [1]:
#Import libraries
import sys
import time
import datetime
import json
import pandas as pd
import tweepy
import re
import operator
from collections import Counter
from scipy.stats.stats import pearsonr

from data.sentiments_nrc import SENTIMENTS
from data.sentiments_nrc import EMOTIONS

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
#Set up keys and api authentication
consumer_key = ''
consumer_secret = ''
access_token =''
access_token_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [3]:
def findTweets(maxTweets, maxPerPage, searchTweet, fil):
    '''
    Function to find tweets via REST API given search string
    '''
    tweetCount = 0
    try:
        while (tweetCount < maxTweets):
            with open(fil, 'a', encoding='UTF-8') as f:
                for tweet in tweepy.Cursor(api.search, rpp=maxPerPage, q=searchTweet).items(maxTweets):
                    json_tweet = tweet._json
                    f.write(json.dumps(json_tweet) + '\n')
                    tweetCount += 1
    except KeyboardInterrupt:
        print('Downloaded {0} tweets'.format(tweetCount))
        pass
    except tweepy.TweepError as e:
        print('Tweep error raised',str(e))
        pass

    f.close()
    print('Downloaded {0} tweets'.format(tweetCount))

In [4]:
maxTweets = 2500
maxPerPage = 50
# findTweets(maxTweets, maxPerPage, '#BlackPanther OR #blackpanther', 'BP.json')

In [5]:
maxTweets = 2500
maxPerPage = 50
# findTweets(maxTweets, maxPerPage, '#ReadyPlayerOne OR #readyplayerone', 'RP.json')

In [6]:
def load_tweets(tw_file):
    '''
    Function to load only - "user","text","favorite_count", "retweet_count","source" for tweets from file
    '''
    tweets = []
    tweet_file = open(tw_file, "r")
    
    for line in tweet_file:
        try:
            tweet = json.loads(line)
            #print(tweet)
            tweets.append(tweet)
        except:
            continue
    tw_pd = pd.DataFrame(tweets)
    return (tw_pd[["user","text","favorite_count", "retweet_count","source", "retweeted_status"]], tw_pd["text"])

In [7]:
# Extract only relevant tweet attributes

tw_bp_all, tw_bp_txt = load_tweets('BP.json')
tw_rp_all, tw_rp_txt = load_tweets('RP.json')

### Part 2 - Analyze Tweet metadata

In [8]:
def populate_aggr_dict(users, key, dctnry):
    '''
    Function to populate dictionary for a tweet metadata
    '''
    for user in users:
        if user[key] not in dctnry.keys(): dctnry[user[key]] = 1
        else: dctnry[user[key]] +=1
    return dctnry


def twet_aggr_metadata(tw_list):
    '''
    Function to aggregate metadata for a tweet set
    '''
    users = list(tw_list['user'])
    t_id = populate_aggr_dict(users, 'id', {})
    t_lc = populate_aggr_dict(users, 'location', {})
    t_vr = populate_aggr_dict(users, 'verified', {})
    return (t_id, t_lc, t_vr)


def user_aggr_metadata(tw_list):
    '''
    Function to gather the user related attributes exactly once for each user
    '''
    users = list(tw_list['user'])
    user_pass, u_fv, u_fl, u_fr = {}, {}, {}, {}
    for user in users:
        if user['id'] not in user_pass.keys():
            u_fv[user['id']] = user['favourites_count']
            u_fl[user['id']] = user['followers_count']
            u_fr[user['id']] = user['friends_count']
        else: user_pass[user['id']] +=1
    return (u_fv, u_fl, u_fr)
    

In [9]:
# Calculate aggreate tweet statistics for movies
from operator import add

tw_bp_usr_unid, tw_bp_usr_locn, tw_bp_usr_vrfd = twet_aggr_metadata(tw_bp_all)
tw_bp_twt_favs, tw_bp_twt_foll, tw_bp_twt_frnd = user_aggr_metadata(tw_bp_all)
tw_bp_tot_rtwt = list(tw_bp_all['retweet_count'])
tw_bp_tot_favs = list(map(add, list(tw_bp_all['favorite_count']),
                     [0 if pd.isnull(item) else item['favorite_count'] for item in list(tw_bp_all['retweeted_status'])]))

tw_rp_usr_unid, tw_rp_usr_locn, tw_rp_usr_vrfd = twet_aggr_metadata(tw_rp_all)
tw_rp_twt_favs, tw_rp_twt_foll, tw_rp_twt_frnd = user_aggr_metadata(tw_rp_all)
tw_rp_tot_rtwt = list(tw_rp_all['retweet_count'])
tw_rp_tot_favs = list(map(add, list(tw_rp_all['favorite_count']),
                     [0 if pd.isnull(item) else item['favorite_count'] for item in list(tw_rp_all['retweeted_status'])]))

In [10]:
bp_len, rp_len = len(tw_bp_all), len(tw_rp_all)

print('Statistics Black Panther / Ready Player One')

print('\n\nUnique Users who tweeted', len(tw_bp_usr_unid), ' / ', len(tw_rp_usr_unid))
print('\nUser id\'s with maximum tweets', sorted(tw_bp_usr_unid.items(), key=operator.itemgetter(1))[-3:], '\n',
                                     sorted(tw_rp_usr_unid.items(), key=operator.itemgetter(1))[-3:])
print('\nTweets from most locations', sorted(tw_bp_usr_locn.items(), key=operator.itemgetter(1))[-5:], '\n',
                                      sorted(tw_rp_usr_locn.items(), key=operator.itemgetter(1))[-5:])
print('\nUsers with verified accounts', tw_bp_usr_vrfd[True], ' / ', tw_rp_usr_vrfd[True])


print('\nAverage User Followers', round(sum(tw_bp_twt_foll.values()) / bp_len), ' / ',
                                  round(sum(tw_rp_twt_foll.values()) / rp_len))

print('\nAverage User Friends', round(sum(tw_bp_twt_frnd.values()) / bp_len), ' / ',
                                  round(sum(tw_rp_twt_frnd.values()) / rp_len))

print('\nAverage User Favorites', round(sum(tw_bp_twt_favs.values()) / bp_len), ' / ',
                                  round(sum(tw_rp_twt_favs.values()) / rp_len))

print('\nTotal Tweet Retweets', sum(tw_bp_tot_rtwt), ' / ', sum(tw_rp_tot_rtwt))

print('\nTotal Tweet Favorites', sum(tw_bp_tot_favs), ' / ', sum(tw_rp_tot_favs))

Statistics Black Panther / Ready Player One


Unique Users who tweeted 2382  /  2179

User id's with maximum tweets [(750402925063798784, 4), (2279614957, 5), (2507243704, 12)] 
 [(845855575741124611, 9), (2550251936, 10), (186668321, 16)]

Tweets from most locations [('Los Angeles, CA', 18), ('Atlanta, GA', 18), ('Johannesburg, South Africa', 21), ('United States', 28), ('', 691)] 
 [('Philippines', 11), ('United States', 13), ('España', 16), ('Virginia', 17), ('', 736)]

Users with verified accounts 25  /  51

Average User Followers 2957  /  10390

Average User Friends 867  /  1293

Average User Favorites 22674  /  10808

Total Tweet Retweets 3368437  /  421102

Total Tweet Favorites 7276180  /  443787


### Part 3 - Categorizing tweet sentiments

In [11]:
def get_tweet_sentiment(tweets):
    '''
    Function to get sentiment of a tweet
    '''
    senti, senti_words = {}, {}
    for emotion in EMOTIONS:
        senti[emotion], senti_words[emotion] = 0, []      
   
    for tweet in tweets:
        words_list = re.split('\W+', tweet)
    
        for wrd in words_list:
            if wrd in SENTIMENTS.keys():
                for sentiment in SENTIMENTS[wrd]:
                    senti[sentiment] += 1
                    senti_words[sentiment] = senti_words[sentiment] + [wrd]
                    
    for emotion in EMOTIONS:
        senti_words[emotion] = Counter(senti_words[emotion])

    return(senti, senti_words)


def get_tweet_sentiment_per(tw_senti):
    '''
    Function to get percentage of sentiment
    tw_senti[0]  Gives the count each sentiment
    tw_senti[1]['anger'] Gives the count of each word for the sentiment
    '''
    mf =  1/sum(tw_senti[0].values())*100
    for key in tw_senti[0]:
        tw_senti[0][key] = mf*tw_senti[0][key]
        
    return tw_senti

In [12]:
# Segregating tweet emotions and words having those emotions
tw_senti_bp = get_tweet_sentiment(tw_bp_txt)
tw_senti_rp = get_tweet_sentiment(tw_rp_txt)

# Get percentage of sentiment
tw_senti_bp_per = get_tweet_sentiment_per(tw_senti_bp)
tw_senti_rp_per = get_tweet_sentiment_per(tw_senti_rp)
all_sentiments = sorted(tw_senti_bp_per[0].keys())

print('All Sentiments\n', all_sentiments)
print('\n\nBlack Panther: ', tw_senti_bp_per[0])
print('\nReady Player One Sentiments: ', tw_senti_rp_per[0])

All Sentiments
 ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']


Black Panther:  {'disgust': 0.27888929307627014, 'fear': 12.064993330908209, 'sadness': 0.4243967503334546, 'anticipation': 23.717715532921062, 'trust': 6.1719413119922395, 'anger': 1.2004365223717717, 'joy': 11.992239602279618, 'surprise': 11.567842851946162, 'negative': 13.495816660603856, 'positive': 19.08572814356736}

Ready Player One Sentiments:  {'disgust': 2.674539770753734, 'fear': 4.5154567558179926, 'sadness': 3.5081625564432093, 'anticipation': 18.513372698853768, 'trust': 12.330670371656826, 'anger': 3.542896839180271, 'joy': 10.350816255644322, 'surprise': 8.787773532476555, 'negative': 9.517193469954845, 'positive': 26.25911774921848}


### Part 4 - Retrieving OMDB Data

In [13]:
import json, requests
def retrieve_omdb_rating(api, mov):
    '''
    Function to retrieve omdb rating for the movie
    '''
    url = api + mov
    response = requests.get(url)
    omdb_response = json.loads(response.text)
    return omdb_response

In [14]:
api = ""
bp = "Black+Panther&y=2018"
rp = "Ready+Player+One&y=2018"

omdb_bp = retrieve_omdb_rating(api, bp)
omdb_rp = retrieve_omdb_rating(api, rp)

meta_rating_bp = float(omdb_bp['Ratings'][2]['Value'].split('/')[0])
meta_rating_rp = float(omdb_rp['Ratings'][2]['Value'].split('/')[0])

imdb_rating_bp = float(omdb_bp['Ratings'][0]['Value'].split('/')[0])
imdb_rating_rp = float(omdb_rp['Ratings'][0]['Value'].split('/')[0])

rotn_rating_bp = float(omdb_bp['Ratings'][1]['Value'].split('/')[0][:-1])
rotn_rating_rp = float(omdb_rp['Ratings'][1]['Value'].split('/')[0][:-1])

print(omdb_bp, '\n\n\n', omdb_rp,  
      '\n\nMeta Ratings\nBlack Panther: ', meta_rating_bp, '\nReady Player One: ', meta_rating_rp,
      '\n\nIMBD Ratings\nBlack Panther: ', imdb_rating_bp, '\nReady Player One: ', imdb_rating_rp,
      '\n\nRotten Tomatoes Ratings\nBlack Panther: ', rotn_rating_bp, '\nReady Player One: ', rotn_rating_rp)

{'Runtime': '134 min', 'Poster': 'https://ia.media-imdb.com/images/M/MV5BMTg1MTY2MjYzNV5BMl5BanBnXkFtZTgwMTc4NTMwNDI@._V1_SX300.jpg', 'DVD': 'N/A', 'Language': 'Swahili, Nama, English, Xhosa, Korean', 'Year': '2018', 'Metascore': '88', 'Response': 'True', 'BoxOffice': '$501,105,037', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.8/10'}, {'Source': 'Rotten Tomatoes', 'Value': '97%'}, {'Source': 'Metacritic', 'Value': '88/100'}], 'Plot': "T'Challa, the King of Wakanda, rises to the throne in the isolated, technologically advanced African nation, but his claim is challenged by a vengeful outsider who was a childhood victim of T'Challa's father's mistake.", 'imdbVotes': '201,861', 'Actors': "Chadwick Boseman, Michael B. Jordan, Lupita Nyong'o, Danai Gurira", 'imdbID': 'tt1825683', 'imdbRating': '7.8', 'Genre': 'Action, Adventure, Sci-Fi', 'Title': 'Black Panther', 'Country': 'USA', 'Writer': 'Ryan Coogler, Joe Robert Cole, Stan Lee (based on the Marvel Comics by), Jack Kirb

### Part 5 - Analyzing Correlation between Tweet sentiments and OMDB rating

In [15]:
def aggregate_sentiments(sentiments, movies_sentiments):
    '''
    Function to group all movie sentiments
    '''
    sentiments_movies = {}
    for sentiment in all_sentiments:
        movie_sentiment=[]
        for movie in movies_sentiments:
            #print(movie)
            movie_sentiment.append(movie[sentiment])
        sentiments_movies[sentiment] = movie_sentiment
    return sentiments_movies   


def calculate_sentiment_rating_corr(movie_sentiments, movie_ratings):
    '''
    Function to calculate Pearson's Correlation for movies
    Returns (Pearson’s correlation coefficient, 2-tailed p-value)
    '''
    sentiment_rating_correlation = {}
    for sentiment in movie_sentiments:
        sentiment_rating_correlation[sentiment] = pearsonr([round(item, 0) for item in movie_sentiments[sentiment]], movie_ratings)
    return sentiment_rating_correlation

In [16]:
all_movies_sentiments = aggregate_sentiments(all_sentiments, [tw_senti_bp_per[0], tw_senti_rp_per[0]])
all_movies_ratings_meta = [meta_rating_bp, meta_rating_rp]
all_movies_ratings_imdb = [imdb_rating_bp, imdb_rating_rp]
all_movies_ratings_rotn = [rotn_rating_bp, rotn_rating_rp]

corr_meta = calculate_sentiment_rating_corr(all_movies_sentiments, all_movies_ratings_meta)
corr_imdb = calculate_sentiment_rating_corr(all_movies_sentiments, all_movies_ratings_imdb)
corr_rotn = calculate_sentiment_rating_corr(all_movies_sentiments, all_movies_ratings_rotn)

print("Metacritic:\n", corr_meta, "\n\nIMDB:\n", corr_imdb, "\n\nRotten Tomatoes:\n", corr_rotn)

Metacritic:
 {'disgust': (-1.0, 0.0), 'fear': (1.0, 0.0), 'sadness': (-1.0, 0.0), 'joy': (1.0, 0.0), 'negative': (1.0, 0.0), 'anger': (-1.0, 0.0), 'anticipation': (1.0, 0.0), 'surprise': (1.0, 0.0), 'trust': (-1.0, 0.0), 'positive': (-1.0, 0.0)} 

IMDB:
 {'disgust': (1.0, 0.0), 'fear': (-1.0, 0.0), 'sadness': (1.0, 0.0), 'joy': (-1.0, 0.0), 'negative': (-1.0, 0.0), 'anger': (1.0, 0.0), 'anticipation': (-1.0, 0.0), 'surprise': (-1.0, 0.0), 'trust': (1.0, 0.0), 'positive': (1.0, 0.0)} 

Rotten Tomatoes:
 {'disgust': (-1.0, 0.0), 'fear': (1.0, 0.0), 'sadness': (-1.0, 0.0), 'joy': (1.0, 0.0), 'negative': (1.0, 0.0), 'anger': (-1.0, 0.0), 'anticipation': (1.0, 0.0), 'surprise': (1.0, 0.0), 'trust': (-1.0, 0.0), 'positive': (-1.0, 0.0)}


## Conclusions
For Metacritic and Rotten Tomatoes Ratings:
    Positively correlated with rating: Anticipation, Fear, Joy, Negative, Surprise
    Negatively correlated with rating: Anger, Disgust, Positive, Sadness, Trust
    
For IMDB Ratings:
    Positively correlated with rating: Anger, Disgust, Positive, Sadness, Trust
    Negatively correlated with rating: Anticipation, Fear, Joy, Negative, Surprise
    
## Caveats
* Black Panther and Ready Player One were released during different months
* Ready Player one noticably has high positive sentiment but lower rating which might have skewed the correlation metrics for Metacritic and Rotten Tomatoes
* Rating system makes a huge difference as exemplified by difference between Metacritic / Rotten Tomatoes and IMDB
* Adding more movies might make this more stable