In [1]:
# Imports for project

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import patsy
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

  from pandas.core import datetools


# Oscars Best Picture
We use the Oscars dataset to compile all the movies that were either nominated for or awarded Best Picture.

In [None]:
# Load the Oscars awards dataset
df_BP = pd.read_csv('../cogs108_data/awards.csv')

# Keep only the columns needed for the Best Picture analysis. The explicit
# .copy() makes df_BP an independent frame, so the label-based assignments in
# later cells do not raise SettingWithCopyWarning.
df_BP = df_BP[['Award','Winner','Name','Film']].copy()

# Check all the award categories
df_BP['Award'].unique()

In [None]:
# Replace all the variant names of Best Picture into Best Picture
bp_variants = {'Outstanding Picture', 'Outstanding Production', 'Outstanding Motion Picture', 'Best Motion Picture', 'Best Picture'}

# One masked assignment relabels every matching row at once; this replaces a
# Python-level iterrows() loop that wrote one cell per iteration.
df_BP.loc[df_BP['Award'].isin(bp_variants), 'Award'] = 'Best Picture'

In [None]:
# Only keep best picture awards. The .copy() detaches the filtered rows from
# the full table so the manual .loc corrections in the following cells write
# to df_BP itself without raising SettingWithCopyWarning.
df_BP = df_BP[df_BP['Award']=='Best Picture'].copy()

In [None]:
# Print every row flagged as a winner, untruncated, so the list can be checked by hand
winner_rows = df_BP[df_BP['Winner'] == 1]
print(winner_rows.to_string())

We noticed that there are 89 winning pictures instead of the expected 88. We went through them by hand and found that M*A*S*H was marked as a winner when it should not have been. There are also entries in which the producer was mistakenly entered as the film name.

In [None]:
# M*A*S*H was not a winner: clear the Winner flag on its row (label 4852)
df_BP.loc[4852, 'Winner'] = np.nan

# Rows whose 'Name' held something other than the film title, keyed by row label
corrected_winner_titles = {21: "Wings", 64: "The Broadway Melody"}
for row_label, film_title in corrected_winner_titles.items():
    df_BP.loc[row_label, 'Name'] = film_title

Several of the nominations had wrong names too...

In [None]:
# Correct film titles for nominee rows, keyed by row label in df_BP
corrected_nominee_titles = {
    19: "The Racket",
    20: "7th Heaven",
    62: "Alibi",
    63: "In Old Arizona",
    65: "Hollywood Revue",
    66: "The Patriot",
}
for row_label, film_title in corrected_nominee_titles.items():
    df_BP.loc[row_label, 'Name'] = film_title

In [None]:
# Now we can drop the 'Award' and 'Film' columns and keep only the film title
# ('Name') and the 'Winner' flag. .copy() gives an independent frame so the
# column assignments in later cells do not raise SettingWithCopyWarning.
df_BP = df_BP[['Name','Winner']].copy()

In [None]:
# Number of distinct film titles among the Best Picture rows
len(df_BP['Name'].unique())

In [None]:
# Rows per film title; any count above 1 means the same title appears more than once
df_BP['Name'].value_counts()

Some movie titles were nominated for (or won) Best Picture in different years! But that's OK because it's just 5 entries out of 528!!

In [None]:
# Strip whitespace and lowercase the movie titles for consistency
df_BP['Name'] = df_BP['Name'].str.strip().str.lower()
# Rows without a Winner flag are nominees that did not win. Assign the result
# back instead of calling fillna(inplace=True) on the selected column: that
# chained in-place form acts on a temporary and leaves df_BP unchanged when
# pandas Copy-on-Write is active.
df_BP['Winner'] = df_BP['Winner'].fillna(0)

In [None]:
# Every row that is still in df_BP is a Best Picture nominee, so flag them all
df_BP = df_BP.assign(Nominated=1)

In [None]:
# Display the cleaned Best Picture table (columns at this point: Name, Winner, Nominated)
df_BP

# Month of Release
Using the movies dataset, we try to analyze when each movie was released.

In [None]:
# Load the movies metadata and discard rows that have no release date
movies_metadata_path = '../cogs108_data/movies_metadata.csv'
dfm = pd.read_csv(movies_metadata_path).dropna(subset=['release_date'])

In [None]:
def replace(date):
    """Return characters 5-6 of ``date`` (the month field of a ``YYYY-MM-DD`` string).

    Strings shorter than six characters yield an empty string.
    """
    return date[5:7]

# Use apply() to extract only the month of release (two-character string)
dfm['release_date']=dfm['release_date'].apply(replace)
# Keep only the columns needed downstream; .copy() makes dfm an independent
# frame so later column assignments do not raise SettingWithCopyWarning.
dfm=dfm[["title","imdb_id","release_date"]].copy()

In [None]:
# Normalise titles (trim surrounding whitespace, then lowercase) so they can be matched across datasets
dfm["title"] = dfm["title"].str.strip().str.lower()

In [None]:
# Count movies per extracted month string. Blank ('') entries come from
# release_date values too short to contain a month field; there seem to be
# 3 such movies, and they are removed in the next cell.
dfm['release_date'].value_counts()

In [None]:
# Remove all rows with blank release dates. .copy() detaches the result so the
# month-indicator columns added in the next cell are written without
# SettingWithCopyWarning.
dfm = dfm[dfm["release_date"]!=""].copy()

In [None]:
# One indicator column per two-month release window. Each entry is
# (column name, inclusive upper month bound), listed in calendar order.
month_bins = [('Jan-Feb', 2), ('Mar-Apr', 4), ('May-Jun', 6),
              ('Jul-Aug', 8), ('Sept-Oct', 10), ('Nov-Dec', 12)]

# release_date holds the two-character month string at this point
month_num = dfm['release_date'].astype(int)

# Vectorised equivalent of the row-by-row if/elif chain: a movie falls into the
# first window whose upper bound is >= its month, so every later window must
# also exclude the months already claimed by the previous one. Values above 12
# match no window and keep 0.0 in every column.
previous_bound = None
for column, upper_bound in month_bins:
    in_window = month_num <= upper_bound
    if previous_bound is not None:
        in_window = in_window & (month_num > previous_bound)
    dfm[column] = in_window.astype(float)
    previous_bound = upper_bound

In [None]:
# Display the release-month table: title, imdb_id, release_date (month string) and the six window indicators
dfm

# IMDB Weighted Ratings
Using the IMDB dataset, we want to create a dataframe that notes the IMDB ratings of each movie

In [None]:
# IMDb exports are tab-separated.
# ratings.tsv is title.ratings.tsv.gz -> data.tsv, renamed
ratings = pd.read_csv('../cogs108_data/ratings.tsv', sep='\t')
# basics.tsv is title.basics.tsv.gz -> data.tsv, renamed
basics = pd.read_csv('../cogs108_data/basics.tsv', sep='\t')

In [None]:
# Keep only non-adult titles whose type is "movie"
is_movie = basics['titleType'] == "movie"
is_not_adult = basics['isAdult'] == 0
basics = basics[is_movie & is_not_adult]

# Drop the columns that are not needed for the ratings analysis
unused_basics_columns = ['titleType', 'originalTitle', 'isAdult', 'startYear',
                         'endYear', 'runtimeMinutes', 'genres']
basics = basics.drop(columns=unused_basics_columns)

In [None]:
# Inner join: keep only the movies that have both basic info and ratings.
# The IMDb id column ('tconst') is kept on purpose so it can be used for matching later.
df = basics.merge(ratings, on='tconst', how='inner')

Since a movie could get a 10/10 rating with very few votes, it's only fair for movies to be weighted based on the number of votes.

In [None]:
# Minimum-votes parameter used to weight each movie's rating
minVote = 30000

# Weighted rating = (v / (v + m)) * R + (m / (v + m)) * C, where v is the
# movie's vote count, m is minVote, R is the movie's average rating and C is
# the mean average rating over all movies in df.
# Scroll down to the bottom of the following link to check how the weighted rating was calculated
# https://help.imdb.com/article/imdb/track-movies-tv/faq-for-imdb-ratings/G67Y87TFYYP6TWAV?ref_=helpsect_pro_2_4#
votes = df['numVotes']
own_weight = votes/(votes+minVote)
mean_weight = minVote/(votes+minVote)
df = df.assign(weighted_ratings=(own_weight*df['averageRating'])+mean_weight*df['averageRating'].mean())

# Standardize movie names (trim, then lowercase)
df['primaryTitle'] = df['primaryTitle'].str.strip().str.lower()

# Show the movies from highest to lowest weighted rating
df.sort_values(by='weighted_ratings', ascending=False)

# Production Awards
We want to see which movies were nominated for production awards and which movies won which awards.

In [3]:
awards_df = pd.read_csv("../cogs108_data/awards.csv")

# For some reason the dataset switched the name/film columns starting at the 3rd ceremony.
# We compensate by copying the value of 'Film' into 'Name' for the ceremonies before
# that switch (Ceremony < 3). This is one masked assignment rather than a row loop
# around DataFrame.set_value, which was removed in pandas 1.0.
early_ceremonies = awards_df["Ceremony"] < 3
awards_df.loc[early_ceremonies, "Name"] = awards_df.loc[early_ceremonies, "Film"]

# Drop the Ceremony and Year columns since they aren't important. The original
# 'Film' column is dropped as well because 'Name' is renamed to 'Film' below.
awards_df = awards_df.drop(["Ceremony", "Year", "Film"], axis=1)
awards_df = awards_df.rename(columns={'Name': 'Film'})
# Strip whitespace and tolower for consistency
awards_df["Film"] = awards_df["Film"].str.strip().str.lower()

# Fill every missing value with 0. This turns a missing Winner flag into 0, and
# it also turns a missing film title into 0 (the film shown as 0 in the tables below).
awards_df = awards_df.fillna(0)


# These are all the awards we consider as production awards
awards = ["Film Editing", "Cinematography", "Makeup", "Production Design", "Art Direction",
         "Sound Editing", "Sound Mixing", "Special Effects", "Special Visual Effects", 
          "Special Achievement Award (Visual Effects)", "Visual Effects", "Engineering Effects"]


# Remove all of the rows that do not pertain to any of the production related awards
awards_df = awards_df[awards_df["Award"].isin(awards)]

# Number of nominations for production related awards 
nominations = awards_df["Film"].value_counts()

# Number of wins for production related awards. Only rows with Winner == 1 are
# counted; awards_df itself is left unfiltered because later cells still need
# the nominated-but-not-winning rows.
winners_count = awards_df.loc[awards_df["Winner"] == 1, "Film"].value_counts()



In [4]:
# Award names that are treated as the same category:
#   Production Design = Art Direction
#   Visual Effects = Special Effects = Special Visual Effects = Engineering Effects
#                  = Special Achievement Award (Visual Effects)
# Empty per-film table: the film title, a nominated flag and one column per category
prod_award_columns = ['Film', 'Nominated Production', "Film Editing", "Cinematography",
                      "Makeup", "Production Design", "Sound Editing", "Sound Mixing",
                      "Visual Effects"]
prod_awards = pd.DataFrame(columns=prod_award_columns)

In [5]:
# One row per distinct film title found in the production-award rows
prod_awards['Film'] = awards_df['Film'].unique()
# Initialize all fields to 0 (every award column starts as "not won")
prod_awards=prod_awards.fillna(0)
# Display the initialised table
prod_awards

Unnamed: 0,Film,Nominated Production,Film Editing,Cinematography,Makeup,Production Design,Sound Editing,Sound Mixing,Visual Effects
0,sunrise,0,0,0,0,0,0,0,0
1,the dove; tempest,0,0,0,0,0,0,0,0
2,7th heaven,0,0,0,0,0,0,0,0
3,the devil dancer; the magic flame; sadie thompson,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0
5,wings,0,0,0,0,0,0,0,0
6,the patriot,0,0,0,0,0,0,0,0
7,the bridge of san luis rey,0,0,0,0,0,0,0,0
8,dynamite,0,0,0,0,0,0,0,0
9,alibi; and the awakening,0,0,0,0,0,0,0,0


In index 1, for example, you can see that there are actually two movies listed in one cell. We need to deal with every instance of this. Fortunately, it only occurs a few times, so we can do everything manually.

In [6]:
# Entries that pack more than one movie title into a single name
multi_film_entries = [
    "the devil dancer; the magic flame; sadie thompson",
    "the dove; tempest",
    "alibi; and the awakening",
    "four devils; and street angel",
]
# Remove those combined entries from the per-film table ...
prod_awards = prod_awards[~prod_awards['Film'].isin(multi_film_entries)]
# ... and from the underlying award rows
awards_df = awards_df[~awards_df['Film'].isin(multi_film_entries)]

In [7]:
# Manually re-insert, one row per movie, the titles that were packed together
# in the combined entries removed above. Labels 884-891 are used as new row
# labels -- assumes they are not already present in prod_awards; TODO confirm.
prod_awards.loc[884,"Film"]="the dove"
prod_awards.loc[885,"Film"]="tempest"
prod_awards.loc[886,"Film"]="the devil dancer"
prod_awards.loc[887,"Film"]="the magic flame"
# Spelled as in the source entry ("sadie thompson") so the title can match other datasets
prod_awards.loc[888,"Film"]="sadie thompson"
prod_awards.loc[889,"Film"]="alibi"
prod_awards.loc[890,"Film"]="the awakening"
prod_awards.loc[891,"Film"]="four devils"
# The new rows only have 'Film' set; fill their remaining fields with 0
prod_awards=prod_awards.fillna(0)
# Every film was nominated
prod_awards["Nominated Production"]=1

# Set the awards for these two films that won
prod_awards.loc[884,"Production Design"]=1.0
prod_awards.loc[885,"Production Design"]=1.0

In [8]:
# Index the table by film title for now, so rows can be addressed by name
prod_awards = prod_awards.set_index('Film')
prod_awards

Unnamed: 0_level_0,Nominated Production,Film Editing,Cinematography,Makeup,Production Design,Sound Editing,Sound Mixing,Visual Effects
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sunrise,1,0,0,0,0.0,0,0,0
7th heaven,1,0,0,0,0.0,0,0,0
0,1,0,0,0,0.0,0,0,0
wings,1,0,0,0,0.0,0,0,0
the patriot,1,0,0,0,0.0,0,0,0
the bridge of san luis rey,1,0,0,0,0.0,0,0,0
dynamite,1,0,0,0,0.0,0,0,0
street angel,1,0,0,0,0.0,0,0,0
our dancing daughters,1,0,0,0,0.0,0,0,0
white shadows in the south seas,1,0,0,0,0.0,0,0,0


In [9]:
# Map each award name to the prod_awards column that records it. Art Direction
# and Production Design share the 'Production Design' column (the only one of
# the two that exists in prod_awards), and all the effects awards share
# 'Visual Effects'.
award_to_column = {
    'Film Editing': 'Film Editing',
    'Art Direction': 'Production Design',
    'Production Design': 'Production Design',
    'Cinematography': 'Cinematography',
    'Visual Effects': 'Visual Effects',
    'Special Achievement Award (Visual Effects)': 'Visual Effects',
    'Engineering Effects': 'Visual Effects',
    'Special Effects': 'Visual Effects',
    'Special Visual Effects': 'Visual Effects',
    'Makeup': 'Makeup',
    'Sound Editing': 'Sound Editing',
    'Sound Mixing': 'Sound Mixing',
}

# Tag each award row with its target column; rows for any other award are dropped
award_rows = awards_df.assign(Column=awards_df['Award'].map(award_to_column))
award_rows = award_rows.dropna(subset=['Column'])

# Take the maximum Winner flag per (film, column), so a film that won a category
# keeps its 1 even when another row for the same film and category is a loss.
best_result = award_rows.groupby(['Film', 'Column'], sort=False)['Winner'].max()

# Set the respective column's truth value for any awards
for (film, column), winner in best_result.items():
    prod_awards.loc[film, column] = winner

In [10]:
# Set index back to numbers and not film name; 'Film' becomes a regular column again
prod_awards=prod_awards.reset_index()

# Actor Awards
Extract from the Oscars dataframe all movies that got nominated or won an actor/actress related award.

In [None]:
# Build the frame of acting-related award rows

acting_award_names = ["Actor", "Actress", "Actor in a Supporting Role", "Actress in a Supporting Role"]

actors_df = pd.read_csv('../cogs108_data/awards.csv')
# Keep only the four acting categories
actors_df = actors_df.loc[actors_df['Award'].isin(acting_award_names)]

# Drop the columns that are not needed and treat every missing value as 0
actors_df = actors_df.drop(["Name", "Year", "Ceremony"], axis=1)
actors_df = actors_df.fillna(0)

In [None]:
# Inspect the lead-actor rows of the acting-awards frame built in the previous cell.
# actors_df is deliberately NOT re-read from awards.csv here: doing so would
# replace the filtered frame with the full awards table, so every later cell
# would treat films from all award categories as actor-award nominees.
actors_df[actors_df['Award']=="Actor"]

In [None]:
# NOTE: from here on the name awards_df refers to the per-film actor-award table
actor_award_columns = ["Film", "Nominated Actor", "Actor", "Actress",
                       "Supporting Actor", "Supporting Actress"]
awards_df = pd.DataFrame(columns=actor_award_columns)

# One row per distinct film title present in actors_df
awards_df["Film"] = actors_df["Film"].unique()

# Every listed film is flagged as nominated for an actor award; every other field starts at 0
awards_df["Nominated Actor"] = 1
awards_df = awards_df.fillna(0)

In [None]:
# Index both frames by film title so each award row can address its film directly
actors_df = actors_df.set_index('Film')
awards_df = awards_df.set_index('Film')

# Award name in actors_df -> column of awards_df that stores its Winner flag
acting_award_columns = {
    "Actor": "Actor",
    "Actress": "Actress",
    "Actor in a Supporting Role": "Supporting Actor",
    "Actress in a Supporting Role": "Supporting Actress",
}

# Fill all relevant truth values for each award; rows for other awards are skipped
for film, award_row in actors_df.iterrows():
    target_column = acting_award_columns.get(award_row["Award"])
    if target_column is not None:
        awards_df.loc[film, target_column] = award_row["Winner"]

# Restore the default integer index on both frames
actors_df = actors_df.reset_index()
awards_df = awards_df.reset_index()

# Standardize film name (trim, then lowercase)
awards_df['Film'] = awards_df['Film'].str.strip().str.lower()

awards_df

# Movie Budget

In [None]:
budget_df = pd.read_csv("../cogs108_data/movies_metadata.csv")

# Columns that are not needed for the budget analysis
unused_metadata_columns = [
    "adult", "belongs_to_collection", "genres", "homepage", "id",
    "original_language", "runtime", "spoken_languages", "status", "tagline",
    "poster_path", "production_countries", "original_title", "overview", "video",
]
budget_df = budget_df.drop(columns=unused_metadata_columns)


def convertToInt(string):
    """Return ``string`` converted with the built-in ``int``."""
    return int(string)

# Normalise titles (trim surrounding whitespace, then lowercase) for consistent matching
budget_df["title"] = budget_df["title"].str.strip().str.lower()

In [None]:
# Get the movies for which we have budget and revenue information
print(budget_df.shape)

# 'budget' values that are image paths rather than numbers
bad_budget_values = ["/ff9qCepilowshEtG2GYWwzt2bs4.jpg",
                     "/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg",
                     "/zaSf5OG7V8X8gqFvly88zDdRm46.jpg"]
# Budgets are compared as strings here, so a missing budget is the string "0"
has_budget = (budget_df["budget"] != "0") & ~budget_df["budget"].isin(bad_budget_values)
budget_df = budget_df[has_budget]
budget_df = budget_df[(budget_df["revenue"] != 0)].copy()
print(budget_df.shape)

# Store the converted budgets back in the frame. Previously the result of
# apply() was discarded, so the scatter plot was drawn from the string budgets.
budget_df["budget"] = budget_df["budget"].apply(convertToInt)

plt.scatter(budget_df.budget, budget_df.revenue)
plt.xlabel("budget")
plt.ylabel("revenue");

In [None]:
# Look at the outliers: movies with revenue above 2.5 billion
revenue_threshold = 2500000000
test = budget_df.loc[budget_df["revenue"] > revenue_threshold]
test

# Merging the three Predictor Datasets

LEGEND<br>
Best picture: df_BP<br>
Month of release: dfm<br>
Weighted Ratings: df<br>
Production Awards: prod_awards<br>
Actors Award: awards_df<br>
Budget: budget_df<br>

In [None]:
# Give every dataset the same title column name ('Film'), and give the
# ratings frame the same id column name ('imdb_id') as the other datasets
df_BP = df_BP.rename(columns={'Name': 'Film'})
dfm = dfm.rename(columns={'title': 'Film'})
budget_df = budget_df.rename(columns={'title': 'Film'})
df = df.rename(columns={'primaryTitle': 'Film', 'tconst': 'imdb_id'})


In [None]:
# Keep only the budget columns that the predictors need
extra_budget_columns = ["popularity", "production_companies", "release_date",
                        "vote_average", "vote_count"]
budget_df = budget_df.drop(columns=extra_budget_columns)

In [None]:
# Inner-join release months, budget data and ratings on the shared IMDb id
predictor_df = (
    dfm
    .merge(budget_df, on='imdb_id', how='inner')
    .merge(df, on='imdb_id', how='inner')
)

In [None]:
# Print (rows, columns) for the ratings, release-month and budget frames, in that order
for predictor_frame in (df, dfm, budget_df):
    print(predictor_frame.shape)

In [None]:
# Keep the first row for each IMDb id, then drop the id and the two suffixed
# title columns ('Film_x', 'Film_y') produced by the merges
predictor_df = (
    predictor_df
    .drop_duplicates(subset=['imdb_id'], keep='first')
    .drop(['imdb_id', 'Film_y', 'Film_x'], axis=1)
)

In [None]:
# Right-join each award table onto the predictors by film title: every
# predictor row is kept, and films absent from the award table get 0 in the
# award columns.

# Combining predictors with Best Picture
predict_BP = df_BP.merge(predictor_df, on='Film', how='right').fillna(0)
# Combining predictors with Production Awards
predict_Prod = prod_awards.merge(predictor_df, on='Film', how='right').fillna(0)
# Combining predictors with Actor Awards
predict_Actor = awards_df.merge(predictor_df, on='Film', how='right').fillna(0)

In [None]:
def convert_str(label):
    """Return ``label`` converted with the built-in ``float``."""
    return float(label)

In [None]:
# Work on copies for the ML step, without the columns that are not used as features
unused_ml_columns = ['release_date', 'averageRating']

predict_BP_ML = predict_BP.drop(unused_ml_columns, axis=1)
predict_Prod_ML = predict_Prod.drop(unused_ml_columns, axis=1)
predict_Actor_ML = predict_Actor.drop(unused_ml_columns, axis=1)

# Make sure budget and vote counts are floats in the Best Picture frame
for numeric_column in ('budget', 'numVotes'):
    predict_BP_ML[numeric_column] = predict_BP_ML[numeric_column].apply(convert_str)
predict_BP_ML

In [None]:
#Fix Predictor Actor
# A film is a winner when it won the Actor or the Actress award. The flag is
# computed for all rows at once and assigned by column name, instead of a row
# loop that wrote to a hard-coded column position (.iat[i, 16]).
won_lead_award = (predict_Actor_ML['Actor'] == 1) | (predict_Actor_ML['Actress'] == 1)
predict_Actor_ML["Winner"] = won_lead_award.astype(float)

# The individual award columns are no longer needed once 'Winner' exists
predict_Actor_ML.drop(['Actor','Actress',
                       'Supporting Actor',
                       'Supporting Actress'],axis = 1,inplace = True)
predict_Actor_ML['budget'] = predict_Actor_ML['budget'].apply(convert_str)
predict_Actor_ML['numVotes'] = predict_Actor_ML['numVotes'].apply(convert_str)
predict_Actor_ML

In [None]:
#Fix Predictor Prod
# Every column name that may hold a per-award win flag. Only the names that
# are actually present in predict_Prod_ML are used, so the cell does not raise
# KeyError when some of them are absent from the table.
candidate_award_columns = ['Film Editing', 'Cinematography', 'Makeup',
                           'Production Design', 'Art Direction',
                           'Sound Editing', 'Sound Mixing',
                           'Special Effects', 'Special Visual Effects',
                           'Special EffectsSpecial Visual Effects',
                           'Special Achievement Award (Visual Effects)',
                           'Visual Effects', 'Engineering Effects']
award_columns = [column for column in candidate_award_columns
                 if column in predict_Prod_ML.columns]

# A film is a winner when any of its award columns equals 1. The flag is
# assigned by column name rather than by a hard-coded position (.iat[i, 23]).
predict_Prod_ML["Winner"] = (predict_Prod_ML[award_columns] == 1).any(axis=1).astype(float)

# The individual award columns are no longer needed once 'Winner' exists
predict_Prod_ML.drop(award_columns, axis = 1, inplace = True)
predict_Prod_ML['budget'] = predict_Prod_ML['budget'].apply(convert_str)
predict_Prod_ML['numVotes'] = predict_Prod_ML['numVotes'].apply(convert_str)
predict_Prod_ML

# Machine Learning Analysis

In [None]:
#y labels
# Each label is selected by column position and kept as a one-column DataFrame.
# NOTE(review): assumes 'Winner' is column 1 of predict_BP_ML and column 12 of
# the other two frames -- TODO confirm against the actual column order.
y_BP = predict_BP_ML.iloc[:,1:2]
y_PA = predict_Prod_ML.iloc[:,12:13]
y_AA = predict_Actor_ML.iloc[:,12:13]

In [None]:
#x features
# Eleven feature columns per frame, selected by position.
# NOTE(review): assumes columns 2-12 (Best Picture) and 1-11 (Production, Actor)
# are the features and exclude both 'Film' and 'Winner' -- TODO confirm.
X_BP = predict_BP_ML.iloc[:,2:13]
X_PA = predict_Prod_ML.iloc[:,1:12]
X_AA = predict_Actor_ML.iloc[:,1:12]

In [None]:
#Combine
# Put features and label side by side in one NumPy array (label in the last
# column) so a single shuffle keeps every row's features and label together
XY_BP = pd.concat([X_BP,y_BP],axis = 1).values
XY_PA = pd.concat([X_PA,y_PA],axis = 1).values
XY_AA = pd.concat([X_AA,y_AA],axis = 1).values

In [None]:
#Shuffle
# Seed NumPy's global generator first so the row order, and therefore the
# train/test split and the reported accuracies, are the same on every run.
np.random.seed(0)
np.random.shuffle(XY_BP)
np.random.shuffle(XY_PA)
np.random.shuffle(XY_AA)

#Print Shapes
print(XY_BP.shape)
print(XY_PA.shape)
print(XY_AA.shape)

In [None]:
# Split each shuffled array back into its 11 feature columns (X) and the
# single label column (Y, kept two-dimensional)
BP_X, BP_Y = XY_BP[:, :11], XY_BP[:, 11:12]

PA_X, PA_Y = XY_PA[:, :11], XY_PA[:, 11:12]

AA_X, AA_Y = XY_AA[:, :11], XY_AA[:, 11:12]

In [None]:
# 80/20 split sizes for each dataset (both truncated to whole rows)
num_training_BP, num_testing_BP = (int(share * BP_X.shape[0]) for share in (0.8, 0.2))
num_training_PA, num_testing_PA = (int(share * PA_X.shape[0]) for share in (0.8, 0.2))
num_training_AA, num_testing_AA = (int(share * AA_X.shape[0]) for share in (0.8, 0.2))

# The first num_training rows are used for training; all remaining rows for testing
BP_X_train, BP_X_test = BP_X[:num_training_BP], BP_X[num_training_BP:]
BP_Y_train, BP_Y_test = BP_Y[:num_training_BP], BP_Y[num_training_BP:]

PA_X_train, PA_X_test = PA_X[:num_training_PA], PA_X[num_training_PA:]
PA_Y_train, PA_Y_test = PA_Y[:num_training_PA], PA_Y[num_training_PA:]

AA_X_train, AA_X_test = AA_X[:num_training_AA], AA_X[num_training_AA:]
AA_Y_train, AA_Y_test = AA_Y[:num_training_AA], AA_Y[num_training_AA:]

## Random Forest for Best Picture Awards

In [None]:
# Candidate tree depths tried by the grid search
depth_list = list(range(1, 6))
params = dict(max_depth=depth_list)
classifier = RandomForestClassifier(random_state=0, max_depth=5)
# 10-fold cross-validated search over max_depth, keeping the training scores
grid_rf_BP = GridSearchCV(classifier, params, cv=10, return_train_score=True)
# ravel() flattens the (n, 1) label column into the 1-D array passed to fit()
grid_rf_BP.fit(BP_X_train, BP_Y_train.ravel())

In [None]:
predictions = grid_rf_BP.predict(BP_X_test)
size = predictions.shape[0]
# Collect every prediction that equals the true label at the same position
correct = [predictions[i] for i in range(size) if predictions[i] == BP_Y_test[i]]
# Fraction of test rows that were predicted correctly
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:", test_accuracy, sep="\n")

## Random Forest for Production Awards

In [None]:
# Candidate tree depths tried by the grid search
depth_list = list(range(1, 6))
params = dict(max_depth=depth_list)
classifier = RandomForestClassifier(random_state=0, max_depth=5)
# 10-fold cross-validated search over max_depth, keeping the training scores
grid_rf_PA = GridSearchCV(classifier, params, cv=10, return_train_score=True)
# ravel() flattens the (n, 1) label column into the 1-D array passed to fit()
grid_rf_PA.fit(PA_X_train, PA_Y_train.ravel())

In [None]:
predictions = grid_rf_PA.predict(PA_X_test)
size = predictions.shape[0]
# Collect every prediction that equals the true label at the same position
correct = [predictions[i] for i in range(size) if predictions[i] == PA_Y_test[i]]
# Fraction of test rows that were predicted correctly
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:", test_accuracy, sep="\n")

## Random Forest for Actor Awards

In [None]:
# Candidate tree depths tried by the grid search
depth_list = list(range(1, 6))
params = dict(max_depth=depth_list)
classifier = RandomForestClassifier(random_state=0, max_depth=5)
# 10-fold cross-validated search over max_depth, keeping the training scores
grid_rf_AA = GridSearchCV(classifier, params, cv=10, return_train_score=True)
# ravel() flattens the (n, 1) label column into the 1-D array passed to fit()
grid_rf_AA.fit(AA_X_train, AA_Y_train.ravel())

In [None]:
predictions = grid_rf_AA.predict(AA_X_test)
size = predictions.shape[0]
# Collect every prediction that equals the true label at the same position
correct = [predictions[i] for i in range(size) if predictions[i] == AA_Y_test[i]]
# Fraction of test rows that were predicted correctly
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:", test_accuracy, sep="\n")