# Project 3 Part 4
- Michael Vincent
- 10/5

## Imports

In [73]:
# Imports
import numpy as np
import pandas as pd
import glob
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

## Load the data

In [2]:
# Load the data using glob

# Get the filenames. glob returns matches in arbitrary order, so sort them
# to make the row order of the combined data frame reproducible.
q = 'Data/final_tmdb_data_*.csv.gz'
chunked_files = sorted(glob.glob(q))

# Fail early with a clear message if the pattern matched nothing; otherwise
# pd.concat raises a less helpful "No objects to concatenate" error.
if not chunked_files:
    raise FileNotFoundError(f"No files matched the pattern {q!r}")

# Load the files
df_list = [pd.read_csv(file, engine = 'python') for file in chunked_files]

# Combine the files into one data frame. Every chunk is read with its own
# default 0-based index, so build a fresh index to avoid repeated labels.
df = pd.concat(df_list, ignore_index = True)
df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0062336,0.0,/fw5tsNib4QZBEw18xmebpVe3WZ8.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",http://poetastros.com/el-tango-del-viudo/,602986.0,es,El tango del viudo y su espejo deformante,...,0.0,63.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,,The Tango of the Widower and Its Distorting Mi...,0.0,5.3,3.0,
2,tt0805647,0.0,/8rIoyM6zYXJNjzGseT3MRusMPWl.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",https://www.thewitchesmovie.net/,531219.0,en,Roald Dahl's The Witches,...,26900000.0,106.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They're real!,Roald Dahl's The Witches,0.0,6.435,2315.0,PG
3,tt0920462,0.0,/mwXmcrvjOJwzsJSuNEupjisXUt6.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",https://www.lostgirlslovehotels.film,479259.0,en,Lost Girls & Love Hotels,...,106045.0,97.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Lost Girls & Love Hotels,0.0,5.219,121.0,R
4,tt0926132,0.0,/op14w44FvLUqH7TWyT1ijzrSfXV.jpg,,0.0,"[{'id': 27, 'name': 'Horror'}]",http://www.roadtored.com,651448.0,en,Darkslide,...,0.0,121.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Keep Moving or Die,Darkslide,0.0,6.5,2.0,


In [3]:
# Get the columns of interest. Take an explicit copy so the cleaning steps
# that follow modify an independent data frame rather than a slice of the
# full data (this avoids pandas' SettingWithCopyWarning).
df = df[['imdb_id', 'budget', 'revenue', 'certification']].copy()
df.head()

Unnamed: 0,imdb_id,budget,revenue,certification
0,0,,,
1,tt0062336,0.0,0.0,
2,tt0805647,0.0,26900000.0,PG
3,tt0920462,0.0,106045.0,R
4,tt0926132,0.0,0.0,


In [5]:
# Drop every row that contains a null value. Reassign the result instead of
# using inplace=True, which offers no benefit and can warn when the frame
# was derived from another frame.
df = df.dropna()
df.head()

Unnamed: 0,imdb_id,budget,revenue,certification
2,tt0805647,0.0,26900000.0,PG
3,tt0920462,0.0,106045.0,R
5,tt0983946,7000000.0,47078545.0,PG-13
6,tt10003008,0.0,4296804.0,R
9,tt10004108,0.0,0.0,R


In [9]:
# List the distinct values found in the certification column
pd.unique(df['certification'])

array(['PG', 'R', 'PG-13', 'NR', 'NC-17', 'G', 'Not Rated', 'UR',
       'Unrated', 'PG-13 ', '10', 'R ', 'ScreamFest Horror Film Festival',
       '-'], dtype=object)

> We see that some ratings have a trailing space (`'PG-13 '` and `'R '`).

In [13]:
# Get rid of the space in the ratings of R and PG-13. While it's not necessary
# for this project, we will replace 'UR' and 'Not Rated' with Unrated.
# Assign the result back to the column: calling replace(inplace = True) on
# df['certification'] is a chained in-place operation, which is deprecated
# and does not update df when pandas Copy-on-Write is enabled.
df['certification'] = df['certification'].replace({'PG-13 ': 'PG-13',
                                                   'R ': 'R',
                                                   'UR': 'Unrated',
                                                   'Not Rated': 'Unrated'})

# Make sure the changes were made
df['certification'].unique()

array(['PG', 'R', 'PG-13', 'NR', 'NC-17', 'G', 'Unrated', '10',
       'ScreamFest Horror Film Festival', '-'], dtype=object)

## Hypothesis Test: Do ratings have a significant effect on revenue?

**State the Null and Alternative Hypotheses**

$H_0$: The rating of a movie does not affect its revenue.

$H_1$: The rating of a movie has a significant effect on its revenue.

> We are comparing the mean revenue of more than two groups, so we will use an ANOVA test, provided its assumptions are met.

In [65]:
# Build one revenue Series per rating of interest: G, PG, PG-13, and R
ratings = df['certification']
g_movie_rev = df.loc[ratings.eq('G'), 'revenue']
pg_movie_rev = df.loc[ratings.eq('PG'), 'revenue']
pg13_movie_rev = df.loc[ratings.eq('PG-13'), 'revenue']
r_movie_rev = df.loc[ratings.eq('R'), 'revenue']

In [66]:
def remove_outliers(values, threshold = 3):
    """Return the entries of `values` whose absolute z-score is at most `threshold`.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations for a single group.
    threshold : float, default 3
        Largest absolute z-score that is still kept.

    Returns
    -------
    pandas.Series
        The observations of `values` that are not flagged as outliers.
    """
    return values[np.abs(stats.zscore(values)) <= threshold]


# Remove any outliers from the groups. Each group is rebuilt from df here so
# that re-running this cell always gives the same result; filtering the
# already-filtered groups again would strip additional points on every run.
g_movie_rev = remove_outliers(df.loc[df['certification'] == 'G', 'revenue'])
pg_movie_rev = remove_outliers(df.loc[df['certification'] == 'PG', 'revenue'])
pg13_movie_rev = remove_outliers(df.loc[df['certification'] == 'PG-13', 'revenue'])
r_movie_rev = remove_outliers(df.loc[df['certification'] == 'R', 'revenue'])

# Report how many movies remain in each group
print(len(g_movie_rev),
      len(pg_movie_rev),
      len(pg13_movie_rev),
      len(r_movie_rev))

447 1451 3253 6244


> The sample size of each group is large enough that we may forgo the normality test.

In [70]:
# Run Levene's test to check whether the four rating groups have equal variance
revenue_groups = [g_movie_rev, pg_movie_rev, pg13_movie_rev, r_movie_rev]
stats.levene(*revenue_groups)

LeveneResult(statistic=258.1783756187162, pvalue=4.129948362335215e-162)

> The $p$-value is less than our $\alpha$ of 0.05, so we reject the null hypothesis that the groups have the same variance. Because we must assume the groups do *not* have the same variance, we use the Kruskal-Wallis test instead of ANOVA.

In [72]:
# Run the Kruskal-Wallis test across the four rating groups
rating_samples = (g_movie_rev, pg_movie_rev, pg13_movie_rev, r_movie_rev)
stats.kruskal(*rating_samples)

KruskalResult(statistic=458.444710214375, pvalue=4.825294601401865e-99)

> Our $p$-value of $4.8 \times 10^{-99}$ is less than the given $\alpha$ of 0.05 so we reject the null hypothesis. That is, our data supports the claim that the rating of a film has a significant effect on its revenue.

In [None]:
#