# Exploratory Data Analysis
- Michael Vincent
- 9/20

## Imports

In [2]:
# Imports
import numpy as np
import pandas as pd

## Load the data

In [3]:
# Load the per-year TMDB result files, using each movie's imdb_id as the index
df_2000 = pd.read_csv('Data/final_tmdb_data_2000.csv.gz', index_col = 'imdb_id')
df_2001 = pd.read_csv('Data/final_tmdb_data_2001.csv.gz', index_col = 'imdb_id')

# Concatenate the yearly data frames row-wise into one data frame
df = pd.concat([df_2000, df_2001], axis = 0)

# Save the joined data frames.
# imdb_id is the index here (see index_col above), so the index has to be
# written out; index = False would drop every movie's id from the saved file.
df.to_csv('Data/tmdb_results_combined.csv.gz',
          compression = 'gzip',
          index = True)

# Preview the combined frame. This must be the last expression in the cell,
# otherwise the notebook discards the result instead of rendering it.
df.head()

In [3]:
# The row labelled '0' is an artifact from the JSON file, so we drop it.
# Rebinding df with errors = 'ignore' (rather than dropping in place) keeps
# this cell safe to re-run: once the label is gone, a second execution is a
# no-op instead of raising KeyError.
df = df.drop(index = '0', errors = 'ignore')

## Data analysis

In [4]:
# Find the number of movies with valid financial information.
# A movie is counted only when BOTH its budget and its revenue are positive.
# NOTE(review): if "some" is meant as "budget OR revenue", the mask should use
# | instead of & - confirm the intended definition.
# The mask is not called `filter`, so the builtin filter() stays usable.
has_financials = (df['budget'] > 0) & (df['revenue'] > 0)

# Summing a boolean mask counts its True entries without building a
# filtered copy of the data frame.
print('There are', has_financials.sum(),
      'movies with some valid financial information.')

There are 349 movies with some valid financial information.


In [6]:
# Tally how many movies carry each certification label.
certifications = df['certification']
certifications.value_counts()

R          452
PG-13      181
NR          66
PG          64
G           24
NC-17        6
Unrated      1
-            1
Name: certification, dtype: int64

In [25]:
# Mean budget per certification, rounded to zero decimal places.
# Every row is included, so budgets recorded as 0 pull the averages down.
budget_by_certification = df['budget'].groupby(df['certification'])
avg_budget = budget_by_certification.mean().round()

# Display the certifications from the largest average budget to the smallest
avg_budget.sort_values(ascending = False)

certification
PG-13      31231536.0
PG         24597656.0
G          23833333.0
R           9998904.0
NR          1622729.0
-                 0.0
NC-17             0.0
Unrated           0.0
Name: budget, dtype: float64

In [26]:
# Mean revenue per certification, rounded to zero decimal places.
# Every row is included, so revenues recorded as 0 pull the averages down.
revenue_by_certification = df['revenue'].groupby(df['certification'])
avg_revenue = revenue_by_certification.mean().round()

# Display the certifications from the largest average revenue to the smallest
avg_revenue.sort_values(ascending = False)

certification
G          72163319.0
PG-13      71608340.0
PG         61435347.0
R          16821129.0
NR          2289235.0
-                 0.0
NC-17             0.0
Unrated           0.0
Name: revenue, dtype: float64