In [1]:
# Imports for project

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import patsy
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

  from pandas.core import datetools


# Oscars Best Picture

In [2]:
# Import database
df = pd.read_csv('awards.csv')

# Drop unnecessary columns
df = df[['Award','Winner','Name','Film']]

df['Award'].unique()

array(['Actor', 'Actress', 'Art Direction', 'Cinematography',
       'Directing (Comedy Picture)', 'Directing (Dramatic Picture)',
       'Engineering Effects', 'Outstanding Picture',
       'Unique and Artistic Picture', 'Writing (Adaptation)',
       'Writing (Original Story)', 'Writing (Title Writing)',
       'Special Award', 'Directing', 'Writing', 'Outstanding Production',
       'Sound Recording', 'Short Subject (Cartoon)',
       'Short Subject (Comedy)', 'Short Subject (Novelty)',
       'Assistant Director', 'Film Editing', 'Music (Scoring)',
       'Music (Song)', 'Dance Direction', 'Writing (Screenplay)',
       'Actor in a Supporting Role', 'Actress in a Supporting Role',
       'Short Subject (Color)', 'Short Subject (One Reel)',
       'Short Subject (Two Reel)', 'Irving G. Thalberg Memorial Award',
       'Music (Original Score)', 'Cinematography (Black and White)',
       'Cinematography (Color)', 'Special Effects',
       'Art Direction (Black and White)', 'Art Direct

In [3]:
# Replace all the variant names of Best Picture into Best Picture
bp_variants = {'Outstanding Picture', 'Outstanding Production', 'Outstanding Motion Picture', 'Best Motion Picture', 'Best Picture'}

for index, row in df.iterrows():
    if(row['Award'] in bp_variants):
        df.loc[index,'Award'] = 'Best Picture'

In [4]:
# Only keep best picture awards
df = df[df['Award']=='Best Picture']

In [5]:
print((df[df['Winner'] == 1]).to_string())

             Award  Winner                                              Name                                               Film
21    Best Picture     1.0                            Paramount Famous Lasky                                              Wings
64    Best Picture     1.0                               Metro-Goldwyn-Mayer                                The Broadway Melody
100   Best Picture     1.0                   All Quiet on the Western Front                                           Universal
140   Best Picture     1.0                                         Cimarron                                           RKO Radio
178   Best Picture     1.0                                      Grand Hotel                                 Metro-Goldwyn-Mayer
237   Best Picture     1.0                                        Cavalcade                                                 Fox
301   Best Picture     1.0                            It Happened One Night                             

We noticed that there are 89 pictures instead of the expected 88. We went through them by hand and found that M*A*S*H was put under winning when it should not have. Also there are movies in which the producer is mistakenly put as film name.

In [6]:
# M*A*S*H was not a winner
df.loc[4852,'Winner']=np.nan
# Change to correct film names
df.loc[21,'Name']="Wings"
df.loc[64,'Name']="The Broadway Melody"

Several of the nominations had wrong names too...

In [7]:
df.loc[19,'Name']="The Racket"
df.loc[20,'Name']="7th Heaven"
df.loc[62,'Name']="Alibi"
df.loc[63,'Name']="In Old Arizona"
df.loc[65,'Name']="Hollywood Revue"
df.loc[66,'Name']="The Patriot"

In [8]:
# Now we can drop the name row to only have film name
df = df[['Name','Winner']]

In [9]:
df['Name'].unique().size

523

In [10]:
df['Name'].value_counts()

Romeo and Juliet                        2
Moulin Rouge                            2
Heaven Can Wait                         2
Cleopatra                               2
Mutiny on the Bounty                    2
7th Heaven                              1
The Apartment                           1
Sayonara                                1
The Greatest Show on Earth              1
Fanny                                   1
Foreign Correspondent                   1
Bonnie and Clyde                        1
The Towering Inferno                    1
The Conversation                        1
The Longest Day                         1
Shine                                   1
Hugo                                    1
Fiddler on the Roof                     1
12 Angry Men                            1
Julia                                   1
Ruggles of Red Gap                      1
The Curious Case of Benjamin Button     1
Our Town                                1
Four Daughters                    

This is because there are movies of the same name that got nominated/won for Best Picture in different years! But that's OK because it's just 5 entires out of 528!!

In [11]:
# Strip whitespace and lowercase the movie titles for consistency
df['Name']=df['Name'].str.strip()
df['Name']=df['Name'].str.lower()
df['Winner'].fillna(0,inplace=True)

In [12]:
df['Nominated']=1

In [13]:
df

Unnamed: 0,Name,Winner,Nominated
19,the racket,0.0,1
20,7th heaven,0.0,1
21,wings,1.0,1
62,alibi,0.0,1
63,in old arizona,0.0,1
64,the broadway melody,1.0,1
65,hollywood revue,0.0,1
66,the patriot,0.0,1
100,all quiet on the western front,1.0,1
101,the big house,0.0,1


# Month of Release

In [14]:
dfm = pd.read_csv('movies_metadata.csv')
dfm.dropna(subset=['release_date'], inplace = True)

In [15]:
def replace(date):
    output = date[5:7]
    return output



dfm['release_date']=dfm['release_date'].apply(replace)
dfm=dfm[["title","release_date"]]

In [16]:
# Strip whitespace and tolower for consistency
dfm["title"]=dfm["title"].str.strip()
dfm["title"]=dfm["title"].str.lower()

In [17]:
# There seems to be 3 movies with blank months. Remove them!
dfm['release_date'].value_counts()

09    588
10    474
12    456
08    413
06    389
01    382
03    372
05    363
07    363
04    350
02    326
11    326
Name: release_date, dtype: int64

In [18]:
# Remove all rows with blank release dates
dfm = dfm[dfm["release_date"]!=""]

In [19]:
# Set all month columns as 0 by default
dfm['Jan-Feb']=0
dfm['Mar-Apr']=0
dfm['May-Jun']=0
dfm['Jul-Aug']=0
dfm['Sept-Oct']=0
dfm['Nov-Dec']=0

# Set each month category with repective truth value
for index,row in dfm.iterrows():
    month_num = int(row['release_date'])
    if(month_num <=2):
        dfm.loc[index,'Jan-Feb']=1
    elif(month_num<=4):
        dfm.loc[index,'Mar-Apr']=1
    elif(month_num<=6):
        dfm.loc[index,'May-Jun']=1
    elif(month_num<=8):
        dfm.loc[index,'Jul-Aug']=1
    elif(month_num<=10):
        dfm.loc[index,'Sept-Oct']=1
    elif(month_num<=12):
        dfm.loc[index,'Nov-Dec']=1

In [20]:
dfm

Unnamed: 0,title,release_date,Jan-Feb,Mar-Apr,May-Jun,Jul-Aug,Sept-Oct,Nov-Dec
0,avatar,12,0,0,0,0,0,1
1,pirates of the caribbean: at world's end,05,0,0,1,0,0,0
2,spectre,10,0,0,0,0,1,0
3,the dark knight rises,07,0,0,0,1,0,0
4,john carter,03,0,1,0,0,0,0
5,spider-man 3,05,0,0,1,0,0,0
6,tangled,11,0,0,0,0,0,1
7,avengers: age of ultron,04,0,1,0,0,0,0
8,harry potter and the half-blood prince,07,0,0,0,1,0,0
9,batman v superman: dawn of justice,03,0,1,0,0,0,0


# IMDB Weighted Ratings

In [21]:
# Renamed data from title.ratings.tsv.gz->data.tsv
ratings = pd.read_csv('title.ratings.tsv.gz',delimiter='\t')
# Renamed data from title.basics.tsv.gz->data.tsv
basics = pd.read_csv('title.basics.tsv.gz', delimiter='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
# Remove all things that arent movies
basics = basics[basics['titleType'] == "movie"]
# Remove all adult titles
basics = basics[basics['isAdult'] == 0]
# Remove unnecessary columns
basics = basics.drop(['titleType','originalTitle','isAdult','startYear','endYear','runtimeMinutes','genres'],axis=1)

In [23]:
# Do a inner join on movies that have both ratings and basic info
df = pd.merge(basics, ratings, on='tconst', how='inner')
# Dont drop the IMDB id just in case for matching
#df = df.drop(['tconst'], axis=1)

In [24]:
# Drop movies with too little votes
# df = df[df['numVotes']>50000]
# Add a new column with weighted ratings based on minimum votes
minVote = 30000

# Scroll down to the bottom of the following link to check how the weighted rating was calculated
# https://help.imdb.com/article/imdb/track-movies-tv/faq-for-imdb-ratings/G67Y87TFYYP6TWAV?ref_=helpsect_pro_2_4#
df = df.assign(weighted_ratings=((df['numVotes']/(df['numVotes']+minVote))*df['averageRating'])+(minVote/(df['numVotes']+minVote))*df['averageRating'].mean())

df.sort_values(by=['weighted_ratings'], ascending=False)

Unnamed: 0,tconst,primaryTitle,averageRating,numVotes,weighted_ratings
57541,tt0111161,The Shawshank Redemption,9.3,1928712,9.253596
33745,tt0068646,The Godfather,9.2,1319020,9.134848
121542,tt0468569,The Dark Knight,9.0,1899597,8.957560
35590,tt0071562,The Godfather: Part II,9.0,911219,8.912994
57397,tt0110912,Pulp Fiction,8.9,1506704,8.848662
72331,tt0167260,The Lord of the Rings: The Return of the King,8.9,1376743,8.843919
56012,tt0108052,Schindler's List,8.9,993617,8.822929
204075,tt5813916,The Mountain II,9.6,94942,8.800498
28288,tt0060196,"The Good, the Bad and the Ugly",8.9,571561,8.768856
21197,tt0050083,12 Angry Men,8.9,533701,8.760048


# Production Awards

In [25]:
awards_df = pd.read_csv("awards.csv")

# For some reason the dataset switched the name/film columns starting at the 3rd ceremony
# We had to compensate for this by moving the name of the film to the correct column for the first 3 ceremonies
for index, row in awards_df.iterrows():
    if(row["Ceremony"] < 3):
        awards_df.set_value(index,'Name',row["Film"])


# Drop the Ceremony and Year category since they aren't important
awards_df.drop(["Ceremony", "Year", "Film"], axis=1, inplace=True)
awards_df.rename(columns={'Name': 'Film'}, inplace=True)
# Strip whitespace and tolower for consistency
awards_df["Film"]=awards_df["Film"].str.strip()
awards_df["Film"]=awards_df["Film"].str.lower()

awards_df = awards_df.fillna(0)



awards = ["Film Editing", "Cinematography", "Makeup", "Production Design", "Art Direction",
         "Sound Editing", "Sound Mixing", "Special Effects" "Special Visual Effects", 
          "Special Achievement Award (Visual Effects)", "Visual Effects", "Engineering Effects"]


# Remove all of the rows that do not pertain to any of the production related awards
awards_df = awards_df[awards_df["Award"].isin(awards)]

# Number of nominations for production related awards 
nominations = awards_df["Film"].value_counts()

# Remove the nominations and only get the winenrs
# awards_df = awards_df[awards_df["Winner"] == 1]

# Numer of winners for production related awards
winners_count = awards_df["Film"].value_counts()



In [26]:
# Make a new DF indexed by film name
prod_awards=pd.DataFrame(columns=['Film','Nominated Production',"Film Editing", "Cinematography", "Makeup", "Production Design", "Art Direction",
         "Sound Editing", "Sound Mixing", "Special Effects" "Special Visual Effects", 
          "Special Achievement Award (Visual Effects)", "Visual Effects", "Engineering Effects"])

In [27]:
prod_awards['Film'] = awards_df['Film'].unique()
# Initialize all fields to 0
prod_awards=prod_awards.fillna(0)
prod_awards

Unnamed: 0,Film,Nominated Production,Film Editing,Cinematography,Makeup,Production Design,Art Direction,Sound Editing,Sound Mixing,Special EffectsSpecial Visual Effects,Special Achievement Award (Visual Effects),Visual Effects,Engineering Effects
0,sunrise,0,0,0,0,0,0,0,0,0,0,0,0
1,the dove; tempest,0,0,0,0,0,0,0,0,0,0,0,0
2,7th heaven,0,0,0,0,0,0,0,0,0,0,0,0
3,the devil dancer; the magic flame; sadie thompson,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0
5,wings,0,0,0,0,0,0,0,0,0,0,0,0
6,the patriot,0,0,0,0,0,0,0,0,0,0,0,0
7,the bridge of san luis rey,0,0,0,0,0,0,0,0,0,0,0,0
8,dynamite,0,0,0,0,0,0,0,0,0,0,0,0
9,alibi; and the awakening,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# Removing specific movie names that put more than one movie in the same name
prod_awards = prod_awards[prod_awards['Film']!="the devil dancer; the magic flame; sadie thompson"]
prod_awards = prod_awards[prod_awards['Film']!="the dove; tempest"]
prod_awards = prod_awards[prod_awards['Film']!="alibi; and the awakening"]
prod_awards = prod_awards[prod_awards['Film']!="four devils; and street angel"]
# Removing specific movie names that put more than one movie in the same name
awards_df = awards_df[awards_df['Film']!="the devil dancer; the magic flame; sadie thompson"]
awards_df = awards_df[awards_df['Film']!="the dove; tempest"]
awards_df = awards_df[awards_df['Film']!="alibi; and the awakening"]
awards_df = awards_df[awards_df['Film']!="four devils; and street angel"]

In [29]:
# Manually insert all the names with multiple production awards
prod_awards.loc[884,"Film"]="the dove"
prod_awards.loc[885,"Film"]="tempest"
prod_awards.loc[886,"Film"]="the devil dancer"
prod_awards.loc[887,"Film"]="the magic flame"
prod_awards.loc[888,"Film"]="saddie thompson"
prod_awards.loc[889,"Film"]="alibi"
prod_awards.loc[890,"Film"]="the awakening"
prod_awards.loc[891,"Film"]="four devils"
prod_awards=prod_awards.fillna(0)
# Every film was nominated
prod_awards["Nominated Production"]=1

# Set the awards for these two films that won
prod_awards.loc[884,"Art Direction"]=1.0
prod_awards.loc[885,"Art Direction"]=1.0

In [30]:
# Temporarily set index based on film name
prod_awards.set_index('Film',inplace=True)
prod_awards

Unnamed: 0_level_0,Nominated Production,Film Editing,Cinematography,Makeup,Production Design,Art Direction,Sound Editing,Sound Mixing,Special EffectsSpecial Visual Effects,Special Achievement Award (Visual Effects),Visual Effects,Engineering Effects
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
sunrise,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7th heaven,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wings,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the patriot,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the bridge of san luis rey,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dynamite,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
street angel,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
our dancing daughters,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
white shadows in the south seas,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
for index,row in awards_df.iterrows():
    if row['Award']=='Film Editing':
            prod_awards.loc[row["Film"],'Film Editing']=row["Winner"]
    elif row['Award']=='Art Direction':
            prod_awards.loc[row["Film"],'Art Direction']=row['Winner']
    elif row['Award']=='Cinematography':
            prod_awards.loc[row["Film"],'Cinematography']=row['Winner']
    elif row['Award']=='Visual Effects':
            prod_awards.loc[row["Film"],'Visual Effects']=row['Winner']
    elif row['Award']=='Makeup':
            prod_awards.loc[row["Film"],'Makeup']=row['Winner']
    elif row['Award']=='Sound Editing':
            prod_awards.loc[row["Film"],'Sound Editing']=row['Winner']
    elif row['Award']=='Sound Mixing':
            prod_awards.loc[row["Film"],'Sound Mixing']=row['Winner']
    elif row['Award']=='Production Design':
            prod_awards.loc[row["Film"],'Production Design']=row['Winner']
    elif row['Award']=='Special Achievement Award (Visual Effects)':
            prod_awards.loc[row["Film"],'Special Achievement Award (Visual Effects)']=row['Winner']
    elif row['Award']=='Engineering Effects':
            prod_awards.loc[row["Film"],'Engineering Effects']=row['Winner']

In [32]:
# Set index back to numbers and not film name
prod_awards=prod_awards.reset_index()
prod_awards

Unnamed: 0,Film,Nominated Production,Film Editing,Cinematography,Makeup,Production Design,Art Direction,Sound Editing,Sound Mixing,Special EffectsSpecial Visual Effects,Special Achievement Award (Visual Effects),Visual Effects,Engineering Effects
0,sunrise,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7th heaven,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,wings,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,the patriot,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,the bridge of san luis rey,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,dynamite,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,street angel,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,our dancing daughters,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,white shadows in the south seas,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Merging

In [33]:
def toLower(string):
    return string.lower()
df.rename(columns={'primaryTitle':'title'}, inplace=True)
df['title'] = df['title'].apply(toLower)
df.rename(columns={'primaryTitle':'title'}, inplace=True)
prod_awards.rename(columns={'Film':'title'}, inplace=True)
good_bye_list = ['column_1', 'column_2', 'column_3']
df.drop(['tconst'], axis=1, inplace=True)

In [34]:
features_df = pd.merge(df, dfm, on='title')

In [35]:
final_df = pd.merge(features_df,prod_awards,on='title')

In [36]:
#All Y labels
y_film = final_df['Film Editing'].copy()
y_cin = final_df['Cinematography'].copy()
y_makeup = final_df['Makeup'].copy()
y_prod = final_df['Production Design'].copy()
y_art = final_df['Art Direction'].copy()
y_sounded = final_df['Sound Editing'].copy()
y_soundmix = final_df['Sound Mixing'].copy()
y_ssesve = final_df['Special EffectsSpecial Visual Effects'].copy()
y_saave = final_df['Special Achievement Award (Visual Effects)'].copy()
y_visual = final_df['Visual Effects'].copy()
y_eng = final_df['Engineering Effects'].copy()

In [37]:
#X labels
x_feat = final_df.iloc[:,1:11]
x_feat = x_feat.drop('release_date',1)

# Machine Learning Part

In [38]:
#All sets to run algorithm on
XY_film = pd.concat([x_feat,y_film],axis = 1).values
XY_cin = pd.concat([x_feat,y_cin],axis = 1).values
XY_makeup = pd.concat([x_feat,y_makeup],axis = 1).values
XY_prod = pd.concat([x_feat,y_prod],axis = 1).values
XY_art = pd.concat([x_feat,y_art],axis = 1).values
XY_sounded = pd.concat([x_feat,y_sounded],axis = 1).values
XY_soundmix = pd.concat([x_feat,y_soundmix],axis = 1).values
XY_ssesve = pd.concat([x_feat,y_ssesve],axis = 1).values
XY_saave = pd.concat([x_feat,y_saave],axis = 1).values
XY_visual = pd.concat([x_feat,y_visual],axis = 1).values
XY_eng = pd.concat([x_feat,y_eng],axis = 1).values

In [39]:
#Shuffle
np.random.shuffle(XY_film)
np.random.shuffle(XY_cin)
np.random.shuffle(XY_makeup)
np.random.shuffle(XY_prod)  
np.random.shuffle(XY_art) 
np.random.shuffle(XY_sounded) 
np.random.shuffle(XY_soundmix)  
np.random.shuffle(XY_ssesve) 
np.random.shuffle(XY_saave)
np.random.shuffle(XY_visual)  
np.random.shuffle(XY_eng)

In [40]:
#Each xy
film_X = XY_film[:,0:9]
film_Y = XY_film[:,9:10]

cin_X = XY_cin[:,0:9]
cin_Y = XY_cin[:,9:10]

makeup_X = XY_makeup[:,0:9]
makeup_Y = XY_makeup[:,9:10]

prod_X = XY_prod[:,0:9]
prod_Y = XY_prod[:,9:10]

art_X = XY_art[:,0:9]
art_Y = XY_art[:,9:10]

sounded_X = XY_sounded[:,0:9]
sounded_Y = XY_sounded[:,9:10]

soundmix_X = XY_soundmix[:,0:9]
soundmix_Y = XY_soundmix[:,9:10]

ssesve_X = XY_ssesve[:,0:9]
ssesve_Y = XY_ssesve[:,9:10]

saave_X = XY_saave[:,0:9]
saave_Y = XY_saave[:,9:10]

visual_X = XY_visual[:,0:9]
visual_Y = XY_visual[:,9:10]

eng_X = XY_eng[:,0:9]
eng_Y = XY_eng[:,9:10]

In [41]:
#Split
num_training = int(0.8*film_X.shape[0])
num_testing = int(0.2*film_X.shape[0])

film_X_train = film_X[:num_training] 
film_Y_train = film_Y[:num_training] 
film_X_test = film_X[num_training:] 
film_Y_test = film_Y[num_training:] 

cin_X_train = cin_X[:num_training] 
cin_Y_train = cin_Y[:num_training] 
cin_X_test = cin_X[num_training:] 
cin_Y_test = cin_Y[num_training:] 

makeup_X_train = makeup_X[:num_training] 
makeup_Y_train = makeup_Y[:num_training] 
makeup_X_test = makeup_X[num_training:] 
makeup_Y_test = makeup_Y[num_training:] 

prod_X_train = prod_X[:num_training] 
prod_Y_train = prod_Y[:num_training] 
prod_X_test = prod_X[num_training:] 
prod_Y_test = prod_Y[num_training:] 

art_X_train = art_X[:num_training] 
art_Y_train = art_Y[:num_training] 
art_X_test = art_X[num_training:] 
art_Y_test = art_Y[num_training:] 

sounded_X_train = sounded_X[:num_training] 
sounded_Y_train = sounded_Y[:num_training] 
sounded_X_test = sounded_X[num_training:] 
sounded_Y_test = sounded_Y[num_training:] 

soundmix_X_train = soundmix_X[:num_training] 
soundmix_Y_train = soundmix_Y[:num_training] 
soundmix_X_test = soundmix_X[num_training:] 
soundmix_Y_test = soundmix_Y[num_training:] 

ssesve_X_train = ssesve_X[:num_training] 
ssesve_Y_train = ssesve_Y[:num_training] 
ssesve_X_test = ssesve_X[num_training:] 
ssesve_Y_test = ssesve_Y[num_training:] 

saave_X_train = saave_X[:num_training] 
saave_Y_train = saave_Y[:num_training] 
saave_X_test = saave_X[num_training:] 
saave_Y_test = saave_Y[num_training:] 

visual_X_train = visual_X[:num_training] 
visual_Y_train = visual_Y[:num_training] 
visual_X_test = visual_X[num_training:] 
visual_Y_test = visual_Y[num_training:] 

eng_X_train = eng_X[:num_training] 
eng_Y_train = eng_Y[:num_training] 
eng_X_test = eng_X[num_training:] 
eng_Y_test = eng_Y[num_training:] 

# Random Forest for film

In [42]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_film = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_film.fit(film_X_train, film_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [43]:
predictions = grid_rf_film.predict(film_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == film_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.896774193548387


# Random Forest for cin

In [44]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_cin = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_cin.fit(cin_X_train, cin_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [45]:
predictions = grid_rf_cin.predict(cin_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == cin_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9419354838709677


# Random Forest for makeup

In [46]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_makeup = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_makeup.fit(makeup_X_train, makeup_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [47]:
predictions = grid_rf_makeup.predict(makeup_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == makeup_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9806451612903225


# Random Forest for production

In [48]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=3,
                                    random_state=0)
grid_rf_prod = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 3)
grid_rf_prod.fit(prod_X_train, prod_Y_train.ravel())

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [49]:
predictions = grid_rf_prod.predict(prod_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == prod_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9935483870967742


# Random Forest for art

In [50]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_art = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_art.fit(art_X_train, art_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [51]:
predictions = grid_rf_art.predict(art_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == art_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9290322580645162


# Random Forest for sound

In [52]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_sound = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_sound.fit(sounded_X_train, sounded_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [53]:
predictions = grid_rf_sound.predict(sounded_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == sounded_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.967741935483871


# Random Forest for sound mix

In [54]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_soundmix = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_soundmix.fit(soundmix_X_train, soundmix_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [55]:
predictions = grid_rf_soundmix.predict(soundmix_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == soundmix_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9741935483870968


# Random Forest for ssesve

In [56]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_ssesve = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_ssesve.fit(ssesve_X_train, ssesve_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [57]:
predictions = grid_rf_ssesve.predict(ssesve_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == ssesve_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
1.0


# Random Forest for saave

In [58]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_saave = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_saave.fit(saave_X_train, saave_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [59]:
predictions = grid_rf_saave.predict(saave_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == saave_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9935483870967742


# Random Forest for visual

In [60]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_visual = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_visual.fit(visual_X_train, visual_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [61]:
predictions = grid_rf_visual.predict(visual_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == visual_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
0.9354838709677419


# Random Forest for eng

In [62]:
depth_list = [1, 2, 3, 4, 5]
params = {"max_depth": depth_list}
classifier = RandomForestClassifier(max_depth=5,
                                    random_state=0)
grid_rf_eng = GridSearchCV(classifier, params, 
                            return_train_score = True, cv = 10)
grid_rf_eng.fit(eng_X_train, eng_Y_train.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [63]:
predictions = grid_rf_eng.predict(eng_X_test)
correct = []
size = predictions.shape[0]
for i in range(0,size):
    if predictions[i] == eng_Y_test[i]:
        correct.append(predictions[i])
test_accuracy = len(correct)/predictions.shape[0]
print("Test Accuracy:")
print(test_accuracy)

Test Accuracy:
1.0
