# released films

In [None]:
#https://www.kaggle.com/avnovikov/predicting-box-office
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

df = pd.read_csv('analysis_results/allfilms.csv')

# Keep only rows from release day (0) through day 34 inclusive, i.e. five weeks.
days_out = df["days since release"]
fiveweeks = df.loc[(days_out >= 0) & (days_out <= 34), :].reset_index(drop=True)

# Rescale box office to thousands; the plots below label the axis accordingly.
fiveweeks["box office"] = fiveweeks["box office"] / 1000
fiveweeks.head()

In [None]:
# Predictors and target for the five-week box office model.
feature_columns = ["daily tweet count", "days since release", "weekend", "star"]
x = fiveweeks.loc[:, feature_columns]
y = fiveweeks["box office"].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.25,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest as well as the split (both use 0) so that a fresh
# Restart & Run All reproduces the same model, score and figures.
clf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=0).fit(x_train, y_train)

# For a regressor, .score() is the coefficient of determination (R^2) on the
# data passed in, not a classification accuracy.
clf_score = clf.score(x_test, y_test)




In [None]:
# Model predictions for the test rows, to be plotted against their tweet counts.
y_predicted = clf.predict(x_test)
x_vals = x_test['daily tweet count']

plt.style.use('ggplot')

# Colour palette used by the scatter plots:
#   predicted points  #f46d43
#   actual points     #a6d96a
#   marker border     #ffffbf

# One tall figure holding two stacked panels (ax1 on top, ax2 below).
fig = plt.figure(figsize=(20, 16))
ax1, ax2 = [fig.add_subplot(2, 1, row) for row in (1, 2)]

# A function that takes in the subplot handle and formats the subplot
def figformat(ax, title, clf_score, xlabel='daily tweets', ylabel='boxoffice (thousands)'):
    """Apply the notebook's common styling to one scatter subplot.

    Sets axis labels, title, tick sizes and a white-backed legend, and writes
    the model score onto the axes. Call it after the labelled artists have
    been drawn so the legend has entries to show.

    Parameters
    ----------
    ax : matplotlib Axes
        The subplot to format.
    title : str
        Title shown above the subplot.
    clf_score : float
        Value returned by the fitted regressor's ``.score()``; for a
        regressor this is R^2, so it is labelled as such rather than as a
        classifier accuracy.
    xlabel, ylabel : str, optional
        Axis labels. The defaults match the five-week plot; pass other values
        when the plotted quantities differ.
    """
    ax.set_xlabel(xlabel, fontsize = 20)
    ax.set_ylabel(ylabel, fontsize = 20)
    ax.set_title(title, fontsize = 24)
    ax.tick_params(axis='both', length = 10, width = 5, labelsize = 18)
    ax.tick_params(axis='both', which = "minor", length = 8, width = 2)
    lgn_h = ax.legend(loc = "upper left", prop={'size': 20})
    frame = lgn_h.get_frame()
    frame.set_facecolor('white')
    # Position is in axes coordinates (transAxes), so the note stays put
    # regardless of data limits or log scaling.
    ax.text(0.02, 0.7, '$R^2$ score = {:.2f}'.format(clf_score), transform=ax.transAxes, fontsize = 18)
    
def figformat2(ax):
    """Put both axes of ``ax`` on a log scale and apply fixed axis limits."""
    x_limits = (0.9, 6500)
    y_limits = (0.9, 30000)
    for set_scale in (ax.set_xscale, ax.set_yscale):
        set_scale('log')
    # Lower bounds sit just under 1 because a log axis cannot start at 0.
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)

# Both panels show the same predicted and actual points; only the formatting
# applied afterwards differs (ax2 is switched to log axes).
marker_style = dict(edgecolor="#ffffbf", s=200, alpha=0.8)
for panel in (ax1, ax2):
    panel.scatter(x_vals, y_predicted, c="#f46d43", label="predicted data", **marker_style)
    panel.scatter(x_vals, y_test, c="#a6d96a", label="actual data", **marker_style)

figformat(ax1, "Box Office vs Daily Tweet Mentions", clf_score)
figformat(ax2, "Box Office vs Daily Tweet Mentions (in log scale)", clf_score)
figformat2(ax2)

# Extra vertical gap so the top panel's x label clears the bottom panel's title.
plt.subplots_adjust(hspace=0.3)
plt.savefig('analysis_results/overallplot.png')

# opening night

In [None]:
# Rows for the day before release (-1) and release day (0), leaving out "DarkestMinds".
around_opening = df["days since release"].isin([0, -1])
not_darkest_minds = df["title"] != "DarkestMinds"
prerelease = df.loc[around_opening & not_darkest_minds, :].reset_index(drop=True)
prerelease

In [None]:
# Day -1 and day 0 rows, each indexed by film title for per-film lookup.
daybefore = df.loc[df["days since release"]== -1, :].set_index("title")
dayof = df.loc[df["days since release"]== 0, :].set_index("title")

filmtitles = dayof.index.values

# One record per film: tweet counts on day -1 and day 0, star flag, opening gross.
# (Despite its name, projection_dict is a list of dicts.)
projection_columns = ["title", "tweet day -1", "tweet day 0", "star", "opening"]
projection_dict = []
for film in filmtitles:
    # A film with a release-day row but no day-before row would make
    # daybefore.loc[film, ...] raise KeyError; it has no day -1 predictor, so skip it.
    if film not in daybefore.index:
        continue
    tweet_daybefore = daybefore.loc[film, "daily tweet count"]
    tweet_dayof = dayof.loc[film, "daily tweet count"]
    opening_gross = dayof.loc[film,"box office"]
    star = dayof.loc[film, "star"]
    projection_dict.append({"title":film, "tweet day -1": tweet_daybefore, "tweet day 0": tweet_dayof, "star": star, "opening": opening_gross})

# Passing the columns explicitly keeps the schema even if no film qualified.
projection = pd.DataFrame(projection_dict, columns=projection_columns)
projection

In [None]:
# Predictors (day-before tweet count, star flag) and target (opening gross).
x = projection.loc[:, ["tweet day -1", "star"]]
y = projection["opening"].values

# No train_test_split here: the model is fit and scored on the same rows,
# so the score computed from these variables measures in-sample fit only.
x_train = x_test = x
y_train = y_test = y


In [None]:
# Seed the forest so the fitted model and its score are reproducible on re-run.
clf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=0).fit(x_train, y_train)

# For a regressor, .score() is the coefficient of determination (R^2) on the
# data passed in, not a classification accuracy.
clf_score = clf.score(x_test, y_test)
print ('The R^2 score is {:.2f}'.format(clf_score))

x_vals = x_test['tweet day -1']
y_predicted = clf.predict(x_test)

In [None]:
fig = plt.figure(figsize = (15,6))
ax1 = fig.add_subplot(111)

ax1.scatter(x_vals, y_predicted, c = "#f46d43", edgecolor = "#ffffbf", s = 200, alpha = 0.8, label = "predicted data")
ax1.scatter(x_vals, y_test, c = "#a6d96a", edgecolor = "#ffffbf", s = 200, alpha = 0.8, label = "actual data")

figformat(ax1, "Opening Box Office vs Day-Before Tweet Mentions", clf_score)
# figformat's labels describe the five-week plot. Here x is the day -1 tweet
# count and y is the opening gross taken straight from df (not divided by
# 1000), so replace both labels.
ax1.set_xlabel('tweets on the day before release', fontsize = 20)
ax1.set_ylabel('opening box office', fontsize = 20)

# Use a dedicated file name: saving to overallplot.png would overwrite the
# five-week figure written earlier in the notebook.
plt.savefig('analysis_results/openingnightplot.png')