# movie_budget_and_revenue.ipynb

In this notebook we will explore movie budget and revenue, as well as profit and ROI (Return on Investment).

In [1]:
import sys
sys.path.append("..")

import pandas as pd
import ast  # Abstract Syntax Trees - safely parse string list to Python list
from reader import generic_reader

First we read in the movie data from `movies_clean.csv`

In [2]:
df_movies = generic_reader.read_csv_file_to_data_frame("movie_data/movies_clean.csv")

Then we choose the columns we will work in this notebook

In [3]:
columns_to_keep = ['budget_2023_usd', 'revenue_2023_usd', 'vote_average']
df_movies = df_movies[columns_to_keep].copy()
df_movies

Unnamed: 0,budget_2023_usd,revenue_2023_usd,vote_average
0,0,0,2.500
1,0,0,6.200
2,0,0,5.000
3,0,0,4.000
4,0,0,7.100
...,...,...,...
20484,5000000,14864960,7.443
20485,0,313000,7.093
20486,0,0,5.800
20487,40000000,54401583,7.233


In [4]:
# Drop movies where budget is zero
df_movies = df_movies[df_movies['budget'] != 0].copy()
# df_movies = df_movies[(df_movies['budget'] != 0) & (df_movies['revenue'] != 0)].copy()
len(df_movies)

KeyError: 'budget'

In [None]:
# Convert release_date into proper datetime objects
df_movies['release_date'] = pd.to_datetime(df_movies['release_date'])

# Create year column
df_movies['release_year'] = df_movies['release_date'].dt.year
df_movies.sort_values(by="release_year", ascending=False)

In [None]:
df_movies
df_movies.sort_values(by="budget_2023_usd", ascending=False)

Calculate profit and ROI

In [None]:
df_movies['profit_2023_usd'] = df_movies['revenue_2023_usd'] - df_movies['budget_2023_usd']
df_movies['roi'] = ( df_movies['profit_2023_usd'] / df_movies['budget_2023_usd'] )*100
df_movies.sort_values(by="profit_2023_usd", ascending=False)

In this plot we plot budget along the x-axis and revenue along the y-axis. The blue line is the line for break even.

We see that most movies actually do make a profit.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.scatterplot(x=df_movies['budget_2023_usd'], y=df_movies['revenue_2023_usd'], alpha=0.1)
plt.xlim(0, 3e9)
plt.ylim(0, 3e9)
plt.gca().set_aspect('equal', adjustable='box')
plt.plot([0, 3e9], [0, 3e9])
plt.xlabel("Budget [2023 USD]")
plt.ylabel("Revenue [2023 USD]");

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.scatterplot(x=df_movies['budget_2023_usd'], y=df_movies['revenue_2023_usd'], alpha=0.1)
plt.xlim(0, 0.6e9)
plt.ylim(0, 3e9)
# plt.gca().set_aspect('equal', adjustable='box')
plt.plot([0, 3e9], [0, 3e9])
plt.xlabel("Budget [2023 USD]")
plt.ylabel("Revenue [2023 USD]");

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.scatterplot(x=df_movies['budget_2023_usd'], y=df_movies['profit_2023_usd'], alpha=0.1)
# plt.xlim(0, 0.6e9)
# plt.ylim(0, 3e9)
# plt.gca().set_aspect('equal', adjustable='box')
# plt.plot([0, 3e9], [0, 3e9])
plt.xlabel("Budget [2023 USD]")
plt.ylabel("Profit [2023 USD]");

In [None]:
sns.scatterplot(x=df_movies['budget_2023_usd'], y=df_movies['vote_average'], alpha=0.1)

In [None]:
sns.scatterplot(x=df_movies['profit_2023_usd'], y=df_movies['vote_average'], alpha=0.1)

In [None]:
sns.scatterplot(x=df_movies['roi'], y=df_movies['vote_average'], alpha=0.1)
plt.xlim(-100, 2000)