#### This script predicts the opening box office per theater for the films released on August 10th using random forest regressor.

#### The regressor is trained on the opening gross of 9 films, as well as the average of their daily tweet mentions on the day before and day of release.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('analysis_results/all_data.csv')
df["box office"] = df["box office"]/1000

dayof = df.loc[df["days since release"] == 0].reset_index(drop = True)
dayof = dayof.rename(columns={"daily tweet count":"day 0 tweets"})
dayof = dayof[["title","day 0 tweets", "theaters", "box office"]]

daybefore = df.loc[df["days since release"] == -1].reset_index(drop = True)
daybefore = daybefore.rename(columns={"daily tweet count":"day -1 tweets"})
daybefore = daybefore[["title","day -1 tweets"]]

projection = pd.merge(daybefore, dayof, on = "title")
projection["avg tweets"] = (projection["day -1 tweets"] + projection["day 0 tweets"])/2
projection["box office per theater"] = projection["box office"]/projection["theaters"]
projection

Unnamed: 0,title,day -1 tweets,day 0 tweets,theaters,box office,avg tweets,box office per theater
0,Blindspotting,186.0,362.0,14.0,143.178,274.0,10.227
1,MammaMia2,1451.0,2867.0,3317.0,14286.815,2159.0,4.30715
2,TheEqualizer2,623.0,648.0,3388.0,13375.85,635.5,3.948008
3,UnfriendedMovie,72.0,140.0,1546.0,1401.07,106.0,0.906255
4,MissionImpossible,1733.0,5409.0,4386.0,22803.458,3571.0,5.199147
5,TeenTitansGOMovie,643.0,1470.0,3188.0,4239.719,1056.5,1.329899
6,ChristopherRobin,3225.0,4494.0,3602.0,9514.392,3859.5,2.641419
7,nevergoinback,19.0,34.0,2.0,4.28,26.5,2.14
8,TheSpyWhoDumpedMe,190.0,517.0,3111.0,5011.316,353.5,1.610838


In [3]:
x = projection[["avg tweets"]]
y = projection["box office per theater"].values
clf = RandomForestRegressor(n_estimators=2000,max_depth=10).fit( x, y )

In [4]:
predictors = pd.DataFrame({"day":[], 
                            "daily tweet count":[],  
                            "title":[]})

In [5]:
# Dependencies
from datetime import datetime
from blacklist import blacklist

movie_list = ["BlacKkKlansman","TheMeg","SlenderManMovie"]

for movie in movie_list:
    print(f'processing {movie}')
    df = pd.read_csv(f"rawtweets/{movie}.csv")
    blacklist_flag = []
    converted_time = []
    
    for message in df.text:
        if any(word in message for word in blacklist): #check if the tweet message contains any of the blacklisted words
            blacklist_flag.append("yes")
        else:
            blacklist_flag.append("no")
    
    for raw_time in df.rawtime:
        converted_time.append(datetime.strptime(str(raw_time), "%a %b %d %H:%M:%S %z %Y"))
    
    df['blacklist'] = blacklist_flag
    df['time'] = converted_time
    
    date = pd.to_datetime(df["time"])
    df["day"] = date.dt.day
    df["month"] = date.dt.month
    for row_index in range(len(df)):
        if df.loc[row_index,"month"] == 8:
            df.loc[row_index,"day"] = df.loc[row_index,"day"]+31
    del df["rawtime"]  
    del df["month"]
    
    df = df.loc[(df["blacklist"] == "no") , :].reset_index(drop= True)
    del df["blacklist"]
    
    df.to_csv(f"processedtweets/{movie}.csv", index=False, header=True)
    
    grouped = df.groupby(['day'])
    daily_tweet_count = grouped["text"].count()

    curr_movie = pd.DataFrame({"daily tweet count": daily_tweet_count}).reset_index()
    curr_movie['title'] = movie
    
    #trim out the earliest and most recent day in the twitter data because those days do not have a complete day of tweet record
    curr_movie = curr_movie.loc[curr_movie["day"].isin([33,34,35,36,37])].reset_index(drop = True)

    predictors = pd.concat([predictors, curr_movie]).reset_index(drop = True)
    
print('done')

processing BlacKkKlansman
processing TheMeg
processing SlenderManMovie
done


In [6]:
grouped = predictors.groupby("title")
avgtweet = grouped["daily tweet count"].mean()



In [7]:
print(f"{avgtweet.index[0]}'s estimated opening gross: {clf.predict(avgtweet[0]) [0]} Thousand Dollars per Theater")
print(f"{avgtweet.index[1]}'s estimated opening gross: {clf.predict(avgtweet[1]) [0]} Thousand Dollars per Theater")
print(f"{avgtweet.index[2]}'s estimated opening gross: {clf.predict(avgtweet[2]) [0]} Thousand Dollars per Theater")

BlacKkKlansman's estimated opening gross: 2.856648877890078 Thousand Dollars per Theater
SlenderManMovie's estimated opening gross: 7.267020688977805 Thousand Dollars per Theater
TheMeg's estimated opening gross: 3.917798580959757 Thousand Dollars per Theater


#### The predictions are similar to the results of script 5b which uses a different set of parameters.