#### This script predicts the opening box office per theater for the films released on August 10th using random forest regressor.

#### The regressor is trained on data for all released films, parameters included are "daily tweet count", "weekday or weekend", "days since release"

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('analysis_results/released_only.csv')

df["box office per theater"] = df["box office"]/df["theaters"]

df.head()

Unnamed: 0,day,daily tweet count,day of week,title,box office,days since release,weekend,star,theaters,box office per theater
0,18.0,55.0,2.0,RampageMovie,5.098,97.0,0.0,1721.0,61.0,0.083574
1,19.0,46.0,3.0,RampageMovie,4.059,98.0,0.0,1721.0,61.0,0.066541
2,20.0,35.0,4.0,RampageMovie,35.295,99.0,1.0,1721.0,114.0,0.309605
3,21.0,41.0,5.0,RampageMovie,49.155,100.0,1.0,1721.0,114.0,0.431184
4,22.0,27.0,6.0,RampageMovie,40.598,101.0,1.0,1721.0,114.0,0.356123


In [3]:
x = df[["daily tweet count","weekend","days since release"]]
y = df["box office per theater"].values
clf = RandomForestRegressor(n_estimators=2000,max_depth=10).fit( x, y )

In [4]:
futurerelease = pd.DataFrame({"day":[], 
                            "daily tweet count":[],  
                            "title":[]})

In [5]:
# Dependencies
from datetime import datetime
from blacklist import blacklist

movie_list = ["BlacKkKlansman","TheMeg","SlenderManMovie"]

for movie in movie_list:
    print(f'processing {movie}')
    df = pd.read_csv(f"rawtweets/{movie}.csv")
    blacklist_flag = []
    converted_time = []
    
    for message in df.text:
        if any(word in message for word in blacklist): #check if the tweet message contains any of the blacklisted words
            blacklist_flag.append("yes")
        else:
            blacklist_flag.append("no")
    
    for raw_time in df.rawtime:
        converted_time.append(datetime.strptime(str(raw_time), "%a %b %d %H:%M:%S %z %Y"))
    
    df['blacklist'] = blacklist_flag
    df['time'] = converted_time
    
    date = pd.to_datetime(df["time"])
    df["day"] = date.dt.day
    df["month"] = date.dt.month
    for row_index in range(len(df)):
        if df.loc[row_index,"month"] == 8:
            df.loc[row_index,"day"] = df.loc[row_index,"day"]+31
    del df["rawtime"]  
    del df["month"]
    
    df = df.loc[(df["blacklist"] == "no") , :].reset_index(drop= True)
    del df["blacklist"]
    
    df.to_csv(f"processedtweets/{movie}.csv", index=False, header=True)
    
    grouped = df.groupby(['day'])
    daily_tweet_count = grouped["text"].count()

    curr_movie = pd.DataFrame({"daily tweet count": daily_tweet_count}).reset_index()
    curr_movie['title'] = movie
    
    #trim out the earliest and most recent day in the twitter data because those days do not have a complete day of tweet record
    curr_movie = curr_movie.loc[curr_movie["day"].isin([33,34,35,36,37])].reset_index(drop = True)

    futurerelease = pd.concat([futurerelease, curr_movie]).reset_index(drop = True)
    
print('done')

processing BlacKkKlansman
processing TheMeg
processing SlenderManMovie
done


In [6]:
grouped = futurerelease.groupby("title")
avgtweet = grouped["daily tweet count"].mean()

predictors = pd.DataFrame({"avg daily tweet":avgtweet})
predictors["weekend"] = 1
predictors["days since release"] = 0

In [7]:
for i in range(3):
    x = pd.DataFrame( predictors.iloc[i,:] ).T
    print(f"{predictors.index[i]}'s estimated opening gross: {clf.predict(x)[0]} Thousand Dollars per Theater")



BlacKkKlansman's estimated opening gross: 2.8195793189430995 Thousand Dollars per Theater
SlenderManMovie's estimated opening gross: 7.475978808496873 Thousand Dollars per Theater
TheMeg's estimated opening gross: 3.981906337888704 Thousand Dollars per Theater


#### The predictions are similar to the results of script 5a which uses a different set of parameters.