# Project 3 - Part 4 Tests
Author: Paul Foy

# Imports and Load Data

In [1]:
# Required Imports
import json
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats

import os, time,json
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb 
import glob


import pymysql
pymysql.install_as_MySQLdb()

from urllib.parse import quote_plus as urlquote

# from sqlalchemy import create_engine
# from sqlalchemy_utils import create_database, database_exists

# Load Data

In [2]:
# Inspect what is currently stored in the data folder
FOLDER = 'Data/2010s/'
file_list = os.listdir(FOLDER)
file_list.sort()
file_list

['2010sfinal_tmdb_data_2010.csv.gz',
 '2010sfinal_tmdb_data_2011.csv.gz',
 '2010sfinal_tmdb_data_2012.csv.gz']

In [None]:
# Use glob to find the yearly final_tmdb_data csv files.
# The files on disk are named like '2010sfinal_tmdb_data_2010.csv.gz', so the
# pattern needs a leading wildcard; "final_*" on its own matches none of them.
q = FOLDER+"*final_*csv.gz"
print(q)
file_list = sorted(glob.glob(q))
# Fail early with a clear message instead of letting pd.concat raise on an empty list
if not file_list:
    raise FileNotFoundError(f"No files match {q!r}")
file_list

In [None]:
# Load every yearly file and stack them into one data frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each file keeps its own row labels, so the same label repeats across files.
df = pd.concat([pd.read_csv(f, lineterminator = '\n') for f in file_list],
               ignore_index = True)
df.head()

In [None]:
# Keep only the rows whose imdb_id is not the string '0'
has_real_id = df['imdb_id'].ne('0')
df = df[has_real_id]
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier to_csv).
# DataFrame.drop returns a new frame, so the result must be assigned back to df;
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')
df.head()

In [None]:
# Write the combined frame to a gzip-compressed csv, leaving out the index
combined_path = FOLDER+'combined_tmdb_data.csv.gz'
df.to_csv(combined_path, index = False, compression = 'gzip')

# Load and Inspect Data

In [None]:
# Read the combined file back in and preview it as a sanity check
combined_file = FOLDER+'combined_tmdb_data.csv.gz'
df1 = pd.read_csv(combined_file, lineterminator = '\n')
df1.head()

In [None]:
# Summarize the columns, dtypes and non-null counts of the reloaded frame
df1.info()

In [None]:
# Columns that are not needed for the hypothesis tests
drop_cols = [
    'adult', 'backdrop_path', 'belongs_to_collection', 'homepage',
    'original_title', 'overview', 'poster_path', 'status', 'tagline',
    'video', 'budget', 'genres', 'id', 'original_language',
    'popularity', 'production_companies', 'production_countries',
    'spoken_languages', 'title',
]
# Drop them column-wise and confirm what is left
df1 = df1.drop(drop_cols, axis = 1)
df1.info()

In [None]:
#Preview certification column
# NOTE(review): this cell reads `df`, not the trimmed `df1` created above - confirm which frame is intended
df['certification'].value_counts(dropna = False)

In [None]:
# Fold the assorted "not rated" labels into the single code 'NR'
cert_dict = dict.fromkeys(
    ['Unrated', 'UR', 'Not Rated', 'ScreamFest Horror Film Festival'],
    'NR',
)

# Apply the mapping, then re-check the counts (nulls included)
df['certification'] = df['certification'].replace(to_replace = cert_dict)
df['certification'].value_counts(dropna = False)

In [None]:
# Discard rows that have no certification, then show the remaining counts
df = df.dropna(subset = ['certification'])
df['certification'].value_counts()

# Hypothesis Testing

## Hypothesis 1

**Business Question**: 
- Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

**State Hypotheses**: 
- H<sub>0</sub>: There is no difference between revenue generated for different ratings.
- H<sub>a</sub>: There is a significant difference between revenue generated for different ratings.

**Alpha** 
- 0.05

**Test type**:
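- Because we are comparing a numeric value (revenue) across more than two groups (ratings), I will use a one-way ANOVA test, falling back to the non-parametric Kruskal-Wallis test if the ANOVA assumptions are not met.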

**Test Assumptions**: 
- No significant outliers.
- Normality.
- Equal Variance.

In [None]:
# Map each certification to a copy of the revenue values of its movies
groups = {
    rating: df.loc[df['certification'] == rating, 'revenue'].copy()
    for rating in df['certification'].unique()
}
groups.keys()

In [None]:
# Run the normality test on every group and record each group's size
norm_results = {}
for rating, revenues in groups.items():
    outcome = stats.normaltest(revenues)
    # keep the group size, the p-value and the test statistic
    norm_results[rating] = {'n': len(revenues),
                            'p': outcome.pvalue,
                            'test stat': outcome.statistic}

# Transpose so that each rating becomes one row
norm_results_df = pd.DataFrame(norm_results).T

# Flag the groups whose p-value is below .05
norm_results_df['sig'] = norm_results_df['p'].lt(.05)
norm_results_df

Note: There are more than 15 samples in each group, so we can continue even if a group fails the normality test.

In [None]:
#Test for equal variance
# Levene's test across the revenue Series of every rating group
stats.levene(*groups.values())

### Run Test

In [None]:
#Execute Kruskal-Wallis test
# Unpack the revenue Series (the dict values). Unpacking the dict itself would
# pass the rating labels (the keys) to the test instead of the revenue data.
result = stats.kruskal(*groups.values())
print(result)
# True when the p-value is below the alpha of .05
print(result.pvalue < .05)

### Results
- Our p-value is ***greater than*** our alpha of 0.05, therefore we fail to reject the null hypothesis. In our data, there is not a significant difference in average revenue generated per rating category.


### Supporting Visualization

In [None]:
# Bar chart of mean revenue for each rating
ax = sns.barplot(x = 'certification', y = 'revenue', data = df)
ax.set(title = 'Average Revenue per Rating');

## Hypothesis Test 2: 
- Do movies that are over 2.5 hours long earn more revenue than movies that are 1.5 hours long (or less)?

**State Hypotheses**: 
- H<sub>0</sub>: There is no difference between groups of movies.
- H<sub>a</sub>: There is a significant difference between groups of movies.

**Alpha** 
- 0.05

**Test type**:
- We are comparing a numeric outcome (revenue) between two independent groups (long vs. short movies, defined by runtime), so I will use a 2-sample t-test.



### Data Preprocessing

In [None]:
# Flag long (> 150 min) and short (<= 90 min) movies in new columns
df['is_long'] = df['runtime'].gt(150)
df['is_short'] = df['runtime'].le(90)
df.head()

In [None]:
# Split out the long (> 150 min) and short (<= 90 min) movies
long = df.loc[df['is_long'], ['revenue', 'is_long']]
short = df.loc[df['is_short'], ['revenue', 'is_short']]
for message in (f'There are {len(long)} movies over 2.5 hours.',
                f'There are {len(short)} movies under 1.5 hours.'):
    print(message)

In [None]:
# Pull the revenue column out of each frame for testing
long_group, short_group = long['revenue'], short['revenue']
display(long_group.head(), short_group.head())

## Test Assumptions

- No significant outliers
- Normality
- Equal variance

In [None]:
# Count long-group revenues more than 3 standard deviations from the mean
zscores_long = stats.zscore(long_group)
outliers_long = np.abs(zscores_long) > 3
outliers_long.sum()

In [None]:
# Keep only the long-group revenues whose |z-score| is below 3
within_3sd_long = np.abs(stats.zscore(long_group)) < 3
long_group = long_group[within_3sd_long]

In [None]:
# Count short-group revenues more than 3 standard deviations from the mean
zscores_short = stats.zscore(short_group)
outliers_short = np.abs(zscores_short) > 3
outliers_short.sum()

In [None]:
# Keep only the short-group revenues whose |z-score| is below 3
within_3sd_short = np.abs(stats.zscore(short_group)) < 3
short_group = short_group[within_3sd_short]

In [None]:
#Normal Test for long group
# Runs on long_group after the |z| >= 3 values were filtered out above
result_long = stats.normaltest(long_group)
result_long

In [None]:
#Normal Test of short group
# Runs on short_group after the |z| >= 3 values were filtered out above
result_short = stats.normaltest(short_group)
result_short

In [None]:
#Test for equal variance
# Levene's test between the long and short revenue groups; note this rebinds `result`
result = stats.levene(long_group, short_group)
result

Note: 
- Failed equal variance test. 
- As a result, we'll need to use a Welch's T-test with the equal_var = False parameter

### Run test

In [None]:
# Welch's t-test: equal_var = False drops the equal-variance assumption
result = stats.ttest_ind(long_group, short_group, equal_var = False)
result

The p-value from the t-test is less than 0.05. We can reject the null hypothesis. There is a significant difference between the revenue generated by short movies vs. long movies.

### Visualization

In [None]:
#Create barplot of average revenue for long vs. short movies
# Keep only the two groups under test. Plotting is_long over the whole frame
# would compare long movies against every other movie, not against the short group.
plot_df = df.loc[df['is_long'] | df['is_short']].assign(
    length = lambda d: np.where(d['is_long'],
                                'Long (> 150 min)',
                                'Short (<= 90 min)'))
ax = sns.barplot(data = plot_df, x = 'length', y = 'revenue',
                 order = ['Short (<= 90 min)', 'Long (> 150 min)'])
ax.set_xlabel('Movie length')
ax.set_ylabel('Average revenue')
# The previous title was copied from the rating plot; describe this comparison instead
ax.set_title('Average Revenue: Long vs. Short Movies');