# Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data

In [2]:
# Load the data
df = pd.read_csv('Data/tmdb_results_combined.csv.gz', index_col = 'imdb_id')
df.head()

Unnamed: 0_level_0,Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,,,,,,,,,,...,,,,,,,,,,
tt0062336,1,0.0,/fw5tsNib4QZBEw18xmebpVe3WZ8.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",http://poetastros.com/el-tango-del-viudo/,602986.0,es,El tango del viudo y su espejo deformante,...,0.0,63.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,,The Tango of the Widower and Its Distorting Mi...,0.0,5.3,3.0,
tt0805647,2,0.0,/8rIoyM6zYXJNjzGseT3MRusMPWl.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",https://www.thewitchesmovie.net/,531219.0,en,Roald Dahl's The Witches,...,26900000.0,106.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They're real!,Roald Dahl's The Witches,0.0,6.435,2315.0,PG
tt0920462,3,0.0,/mwXmcrvjOJwzsJSuNEupjisXUt6.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",https://www.lostgirlslovehotels.film,479259.0,en,Lost Girls & Love Hotels,...,106045.0,97.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Lost Girls & Love Hotels,0.0,5.219,121.0,R
tt0926132,4,0.0,/op14w44FvLUqH7TWyT1ijzrSfXV.jpg,,0.0,"[{'id': 27, 'name': 'Horror'}]",http://www.roadtored.com,651448.0,en,Darkslide,...,0.0,121.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Keep Moving or Die,Darkslide,0.0,6.5,2.0,


# Data Cleaning

In [3]:
# We will only be using budget, revenue, and certification for this model
df = df[['budget', 'revenue', 'certification']]
df.head()

Unnamed: 0_level_0,budget,revenue,certification
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,
tt0062336,0.0,0.0,
tt0805647,0.0,26900000.0,PG
tt0920462,0.0,106045.0,R
tt0926132,0.0,0.0,


In [4]:
# Remove any duplicates
df.drop_duplicates(inplace = True)
df.duplicated().sum()

0

In [5]:
# Check for null values
df.isna().sum()

budget              1
revenue             8
certification    3325
dtype: int64

In [6]:
# We will drop the null values in the budget column
print(df['revenue'].isna().sum() / df.shape[0])
df.dropna(subset = 'revenue', inplace = True)

0.0008646779074794639


In [7]:
# Get the percentage of null values in the certification column.
df['certification'].isna().sum() / df.shape[0]

0.3588273474686283

In [8]:
# 36% of the certification values are null so we will impute with 'missing'
df['certification'].fillna('Missing', inplace = True)

In [9]:
# Check the certification column
df['certification'].value_counts()

Missing                            3317
R                                  2616
PG-13                              1896
PG                                  738
NR                                  471
G                                   161
NC-17                                36
Unrated                               2
Not Rated                             1
UR                                    1
PG-13                                 1
10                                    1
R                                     1
ScreamFest Horror Film Festival       1
-                                     1
Name: certification, dtype: int64

In [20]:
# Fix the unrated labels
df['certification'].replace({'Not Rated': 'NR', 
                             'Unrated': 'NR',
                             'UR': 'NR'},
                            inplace = True)

In [11]:
# Find the issue with R and PG-13 movies
print(df['certification'].str.contains('PG-13 ').sum())
print(df['certification'].str.contains('R ').sum())

1
1


In [12]:
# Fix the ratings with an extra space
df.replace({'PG-13 ': 'PG-13','R ': 'R'}, inplace = True)

In [13]:
# Drop the values in certification that don't seem to make sense
df = df[~df['certification'].str.contains('10')]
df = df[~df['certification'].str.contains('ScreamFest')]

In [18]:
# Drop the column with only '-'
df = df[~(df['certification'] == '-')]

In [21]:
# Check the value counts again
df['certification'].value_counts()

Missing    3317
R          2617
PG-13      1897
PG          738
NR          475
G           161
NC-17        36
Name: certification, dtype: int64

In [26]:
# Check the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9241 entries, tt0062336 to tt0445841
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   budget         9241 non-null   object 
 1   revenue        9241 non-null   float64
 2   certification  9241 non-null   object 
dtypes: float64(1), object(2)
memory usage: 288.8+ KB


In [30]:
# The budget column is not a float.
df['budget'] = df['budget'].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9241 entries, tt0062336 to tt0445841
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   budget         9241 non-null   float64
 1   revenue        9241 non-null   float64
 2   certification  9241 non-null   object 
dtypes: float64(2), object(1)
memory usage: 288.8+ KB


In [34]:
# See how many values for budget and revenue are 0
filter = (df['budget'] == 0) & (df['revenue'] == 0)
len(df[filter])

10

In [36]:
# Few enough values meet the above condition so we drop those rows
df = df[~filter]

In [39]:
# See how many values for budget or revenue are 0
filter = (df['budget'] == 0) | (df['revenue'] == 0)
len(df[filter])

4041