In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [4]:
models = ['Random Forest', 'XGBoost', 'Gradient Boosting', 'SVM', 'Logistic Regression', 'Naive Bayes']
path = "/Users/mattmacrides/Personal-Projects/Predicting Box Office Sales/"
df = pd.read_csv(path + 'data.csv', sep=',', decimal='.')
df = df[0:33]
df.head(5)

Unnamed: 0,Rank,Release,Max Theaters,Opening,Budget (millions),Year,Tickets Sold (that year),Distributor,Critic Ratings,User Ratings,...,IsMarvel,IsAvengers,IsSpiderman,IsBlackPanther,IsThor,IsIronMan,IsGalaxy,IsAmerica,IsAnt,IsDoctor
0,0,The Marvels,4406,,275,2023,901469391,Walt Disney Studios Motion Pictures,,,...,1,0,0,0,0,0,0,0,0,0
1,1,Avengers: Endgame,4662,357115007.0,356,2019,1228896463,Walt Disney Studios Motion Pictures,0.94,0.9,...,0,1,0,0,0,0,0,0,0,0
2,2,Spider-Man: No Way Home,4336,260138569.0,200,2021,434113718,Sony Pictures Entertainment (SPE),0.93,0.98,...,0,0,1,0,0,0,0,0,0,0
3,3,Avengers: Infinity War,4474,257698183.0,316,2018,1311292418,Walt Disney Studios Motion Pictures,0.85,0.92,...,0,1,0,0,0,0,0,0,0,0
4,4,The Avengers,4349,207438708.0,220,2012,1382831536,Walt Disney Studios Motion Pictures,0.91,0.91,...,0,1,0,0,0,0,0,0,0,0


I decided to add budget, user scores, and critic scores from rotten tomatoes in the dataset. I also added indicator variables denoted by 0 or 1 if a movie IsSequel, IsMarvel, IsAvengers, etc.

Still, there are 4 parameters that are unknown for The Marvels: Critic ratings, user ratings, theaters, and opening box office $. For max theaters, I averaged the last 6 releases. Let's use a random forest model to predict the remaining variables.

Let's create our modeling function

In [5]:
def modeling(predict_x, x, y):
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

    model = RandomForestRegressor()
    model.fit(x_train, y_train.values.ravel())
    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    #print("Mean Squared Error:", mse)

    predicted_score = model.predict(predict_x)

    metric = pd.DataFrame(predicted_score)

    return metric

Now let's predict the critic score of the upcoming movie

In [6]:
df_critic_ratings = df
df_critic_ratings = df_critic_ratings.drop(columns=['Opening', 'User Ratings', 'Rank', 'Release', 'Distributor'])
predict_critic = df_critic_ratings[0:1].drop('Critic Ratings', axis=1)
df_critic_ratings = df_critic_ratings[1:32]
# separate the target variable from the predictors
x = df_critic_ratings.drop('Critic Ratings', axis=1)
y = df_critic_ratings['Critic Ratings']

In [7]:
metric = pd.DataFrame()
for i in range(50):
    metric_ = modeling(predict_critic, x, y)
    metric = pd.concat([metric, metric_], ignore_index=True)
avg_cscore = metric.mean()
avg_cscore = round(avg_cscore, 2)

In [8]:
df.at[df.index[df['Release'] == 'The Marvels'][0], 'Critic Ratings'] = avg_cscore
df[0:1]

  df.at[df.index[df['Release'] == 'The Marvels'][0], 'Critic Ratings'] = avg_cscore


Unnamed: 0,Rank,Release,Max Theaters,Opening,Budget (millions),Year,Tickets Sold (that year),Distributor,Critic Ratings,User Ratings,...,IsMarvel,IsAvengers,IsSpiderman,IsBlackPanther,IsThor,IsIronMan,IsGalaxy,IsAmerica,IsAnt,IsDoctor
0,0,The Marvels,4406,,275,2023,901469391,Walt Disney Studios Motion Pictures,0.74,,...,1,0,0,0,0,0,0,0,0,0


Now let's predict the user score of the upcoming movie

In [9]:
df_user_ratings = df
df_user_ratings = df_user_ratings.drop(columns=['Opening', 'Rank', 'Release', 'Distributor'])
predict_user = df_user_ratings[0:1].drop('User Ratings', axis=1)
df_user_ratings = df_user_ratings[1:32]
# separate the target variable from the predictors
x = df_user_ratings.drop('User Ratings', axis=1)
y = df_user_ratings['User Ratings']

In [10]:
metric = pd.DataFrame()
for i in range(50):
    metric_ = modeling(predict_user, x, y)
    metric = pd.concat([metric, metric_], ignore_index=True)
avg_uscore = metric.mean()
avg_uscore = round(avg_uscore, 2)

In [11]:
df.at[df.index[df['Release'] == 'The Marvels'][0], 'User Ratings'] = avg_uscore
df[0:1]

  df.at[df.index[df['Release'] == 'The Marvels'][0], 'User Ratings'] = avg_uscore


Unnamed: 0,Rank,Release,Max Theaters,Opening,Budget (millions),Year,Tickets Sold (that year),Distributor,Critic Ratings,User Ratings,...,IsMarvel,IsAvengers,IsSpiderman,IsBlackPanther,IsThor,IsIronMan,IsGalaxy,IsAmerica,IsAnt,IsDoctor
0,0,The Marvels,4406,,275,2023,901469391,Walt Disney Studios Motion Pictures,0.74,0.73,...,1,0,0,0,0,0,0,0,0,0


finally, let's predict the opening box office score for the upcoming movie

In [12]:
df_opening = df
df_opening = df_opening.drop(columns=['Rank', 'Release', 'Distributor'])
predict_opening = df_opening[0:1].drop('Opening', axis=1)
df_opening = df_opening[1:32]
# separate the target variable from the predictors
x = df_opening.drop('Opening', axis=1)
y = df_opening['Opening']

In [13]:
metric = pd.DataFrame()
for i in range(50):
    metric_ = modeling(predict_opening, x, y)
    metric = pd.concat([metric, metric_], ignore_index=True)
avg_opening = metric.mean()
avg_opening = int(avg_opening)

  avg_opening = int(avg_opening)


In [14]:
df.at[df.index[df['Release'] == 'The Marvels'][0], 'Opening'] = avg_opening
df[0:1]

Unnamed: 0,Rank,Release,Max Theaters,Opening,Budget (millions),Year,Tickets Sold (that year),Distributor,Critic Ratings,User Ratings,...,IsMarvel,IsAvengers,IsSpiderman,IsBlackPanther,IsThor,IsIronMan,IsGalaxy,IsAmerica,IsAnt,IsDoctor
0,0,The Marvels,4406,146011639.0,275,2023,901469391,Walt Disney Studios Motion Pictures,0.74,0.73,...,1,0,0,0,0,0,0,0,0,0


Here is my prediction for the upcoming movie:

Opening: $149,869,505

In [19]:
prior = df.loc[df['Release'] == 'Captain Marvel', 'Opening'].iloc[0]
a = .2
prior * a + (1-a)  * avg_opening


147495995.8