In [1]:
# Imports for project

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import patsy
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest

  from pandas.core import datetools


# Oscars Best Picture

In [None]:
# Import database
df = pd.read_csv('awards.csv')

# Drop unnecessary columns
df = df[['Award','Winner','Name','Film']]

df['Award'].unique()

In [None]:
# Replace all the variant names of Best Picture into Best Picture
bp_variants = {'Outstanding Picture', 'Outstanding Production', 'Outstanding Motion Picture', 'Best Motion Picture', 'Best Picture'}

for index, row in df.iterrows():
    if(row['Award'] in bp_variants):
        df.loc[index,'Award'] = 'Best Picture'

In [None]:
# Only keep best picture awards
df = df[df['Award']=='Best Picture']

In [None]:
print((df[df['Winner'] == 1]).to_string())

We noticed that there are 89 pictures instead of the expected 88. We went through them by hand and found that M*A*S*H was put under winning when it should not have. Also there are movies in which the producer is mistakenly put as film name.

In [None]:
# M*A*S*H was not a winner
df.loc[4852,'Winner']=np.nan
# Change to correct film names
df.loc[21,'Name']="Wings"
df.loc[64,'Name']="The Broadway Melody"

Several of the nominations had wrong names too...

In [None]:
df.loc[19,'Name']="The Racket"
df.loc[20,'Name']="7th Heaven"
df.loc[62,'Name']="Alibi"
df.loc[63,'Name']="In Old Arizona"
df.loc[65,'Name']="Hollywood Revue"
df.loc[66,'Name']="The Patriot"

In [None]:
# Now we can drop the name row to only have film name
df = df[['Name','Winner']]

In [None]:
df['Name'].unique().size

In [None]:
df['Name'].value_counts()

This is because there are movies of the same name that got nominated/won for Best Picture in different years! But that's OK because it's just 5 entires out of 528!!

In [None]:
# Strip whitespace and lowercase the movie titles for consistency
df['Name']=df['Name'].str.strip()
df['Name']=df['Name'].str.lower()
df['Winner'].fillna(0,inplace=True)

In [None]:
df['Nominated']=1

In [None]:
df

# Month of Release

In [None]:
dfm = pd.read_csv('../the-movies-dataset/movies_metadata.csv')
dfm.dropna(subset=['release_date'], inplace = True)

In [None]:
def replace(date):
    output = date[5:7]
    return output



dfm['release_date']=dfm['release_date'].apply(replace)
dfm=dfm[["title","imdb_id","release_date"]]

In [None]:
# Strip whitespace and tolower for consistency
dfm["title"]=dfm["title"].str.strip()
dfm["title"]=dfm["title"].str.lower()

In [None]:
# There seems to be 3 movies with blank months. Remove them!
dfm['release_date'].value_counts()

In [None]:
# Remove all rows with blank release dates
dfm = dfm[dfm["release_date"]!=""]

In [None]:
# Set all month columns as 0 by default
dfm['Jan-Feb']=0
dfm['Mar-Apr']=0
dfm['May-Jun']=0
dfm['Jul-Aug']=0
dfm['Sept-Oct']=0
dfm['Nov-Dec']=0

# Set each month category with repective truth value
for index,row in dfm.iterrows():
    month_num = int(row['release_date'])
    if(month_num <=2):
        dfm.loc[index,'Jan-Feb']=1
    elif(month_num<=4):
        dfm.loc[index,'Mar-Apr']=1
    elif(month_num<=6):
        dfm.loc[index,'May-Jun']=1
    elif(month_num<=8):
        dfm.loc[index,'Jul-Aug']=1
    elif(month_num<=10):
        dfm.loc[index,'Sept-Oct']=1
    elif(month_num<=12):
        dfm.loc[index,'Nov-Dec']=1

In [None]:
dfm

# IMDB Weighted Ratings

In [None]:
# Renamed data from title.ratings.tsv.gz->data.tsv
ratings = pd.read_csv('../ratings.tsv',delimiter='\t')
# Renamed data from title.basics.tsv.gz->data.tsv
basics = pd.read_csv('../basics.tsv', delimiter='\t')

In [None]:
# Remove all things that arent movies
basics = basics[basics['titleType'] == "movie"]
# Remove all adult titles
basics = basics[basics['isAdult'] == 0]
# Remove unnecessary columns
basics = basics.drop(['titleType','originalTitle','isAdult','startYear','endYear','runtimeMinutes','genres'],axis=1)

In [None]:
# Do a inner join on movies that have both ratings and basic info
df = pd.merge(basics, ratings, on='tconst', how='inner')
# Dont drop the IMDB id just in case for matching
#df = df.drop(['tconst'], axis=1)

In [None]:
# Drop movies with too little votes
# df = df[df['numVotes']>50000]
# Add a new column with weighted ratings based on minimum votes
minVote = 30000

# Scroll down to the bottom of the following link to check how the weighted rating was calculated
# https://help.imdb.com/article/imdb/track-movies-tv/faq-for-imdb-ratings/G67Y87TFYYP6TWAV?ref_=helpsect_pro_2_4#
df = df.assign(weighted_ratings=((df['numVotes']/(df['numVotes']+minVote))*df['averageRating'])+(minVote/(df['numVotes']+minVote))*df['averageRating'].mean())

df.sort_values(by=['weighted_ratings'], ascending=False)

# Production Awards

In [2]:
awards_df = pd.read_csv("awards.csv")

# For some reason the dataset switched the name/film columns starting at the 3rd ceremony
# We had to compensate for this by moving the name of the film to the correct column for the first 3 ceremonies
for index, row in awards_df.iterrows():
    if(row["Ceremony"] < 3):
        awards_df.set_value(index,'Name',row["Film"])


# Drop the Ceremony and Year category since they aren't important
awards_df.drop(["Ceremony", "Year", "Film"], axis=1, inplace=True)
awards_df.rename(columns={'Name': 'Film'}, inplace=True)
# Strip whitespace and tolower for consistency
awards_df["Film"]=awards_df["Film"].str.strip()
awards_df["Film"]=awards_df["Film"].str.lower()

awards_df = awards_df.fillna(0)



awards = ["Film Editing", "Cinematography", "Makeup", "Production Design", "Art Direction",
         "Sound Editing", "Sound Mixing", "Special Effects" "Special Visual Effects", 
          "Special Achievement Award (Visual Effects)", "Visual Effects", "Engineering Effects"]


# Remove all of the rows that do not pertain to any of the production related awards
awards_df = awards_df[awards_df["Award"].isin(awards)]

# Number of nominations for production related awards 
nominations = awards_df["Film"].value_counts()

# Remove the nominations and only get the winenrs
# awards_df = awards_df[awards_df["Winner"] == 1]

# Numer of winners for production related awards
winners_count = awards_df["Film"].value_counts()



  import sys


In [3]:
# Make a new DF indexed by film name
prod_awards=pd.DataFrame(columns=['Film','Nominated Production',"Film Editing", "Cinematography", "Makeup", "Production Design", "Art Direction",
         "Sound Editing", "Sound Mixing", "Special Effects" "Special Visual Effects", 
          "Special Achievement Award (Visual Effects)", "Visual Effects", "Engineering Effects"])

In [4]:
prod_awards['Film'] = awards_df['Film'].unique()
# Initialize all fields to 0
prod_awards=prod_awards.fillna(0)


In [5]:
# Removing specific movie names that put more than one movie in the same name
prod_awards = prod_awards[prod_awards['Film']!="the devil dancer; the magic flame; sadie thompson"]
prod_awards = prod_awards[prod_awards['Film']!="the dove; tempest"]
prod_awards = prod_awards[prod_awards['Film']!="alibi; and the awakening"]
prod_awards = prod_awards[prod_awards['Film']!="four devils; and street angel"]
# Removing specific movie names that put more than one movie in the same name
awards_df = awards_df[awards_df['Film']!="the devil dancer; the magic flame; sadie thompson"]
awards_df = awards_df[awards_df['Film']!="the dove; tempest"]
awards_df = awards_df[awards_df['Film']!="alibi; and the awakening"]
awards_df = awards_df[awards_df['Film']!="four devils; and street angel"]

In [6]:
# Manually insert all the names with multiple production awards
prod_awards.loc[884,"Film"]="the dove"
prod_awards.loc[885,"Film"]="tempest"
prod_awards.loc[886,"Film"]="the devil dancer"
prod_awards.loc[887,"Film"]="the magic flame"
prod_awards.loc[888,"Film"]="saddie thompson"
prod_awards.loc[889,"Film"]="alibi"
prod_awards.loc[890,"Film"]="the awakening"
prod_awards.loc[891,"Film"]="four devils"
prod_awards=prod_awards.fillna(0)
# Every film was nominated
prod_awards["Nominated Production"]=1

# Set the awards for these two films that won
prod_awards.loc[884,"Art Direction"]=1.0
prod_awards.loc[885,"Art Direction"]=1.0

In [11]:
# Temporarily set index based on film name
prod_awards.set_index('Film',inplace=True)

Unnamed: 0_level_0,Nominated Production,Film Editing,Cinematography,Makeup,Production Design,Art Direction,Sound Editing,Sound Mixing,Special EffectsSpecial Visual Effects,Special Achievement Award (Visual Effects),Visual Effects,Engineering Effects
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
sunrise,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7th heaven,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wings,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the patriot,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the bridge of san luis rey,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dynamite,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
street angel,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
our dancing daughters,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
white shadows in the south seas,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
for index,row in awards_df.iterrows():
    if row['Award']=='Film Editing':
            prod_awards.loc[row["Film"],'Film Editing']=row["Winner"]
    elif row['Award']=='Art Direction':
            prod_awards.loc[row["Film"],'Art Direction']=row['Winner']
    elif row['Award']=='Cinematography':
            prod_awards.loc[row["Film"],'Cinematography']=row['Winner']
    elif row['Award']=='Visual Effects':
            prod_awards.loc[row["Film"],'Visual Effects']=row['Winner']
    elif row['Award']=='Makeup':
            prod_awards.loc[row["Film"],'Makeup']=row['Winner']
    elif row['Award']=='Sound Editing':
            prod_awards.loc[row["Film"],'Sound Editing']=row['Winner']
    elif row['Award']=='Sound Mixing':
            prod_awards.loc[row["Film"],'Sound Mixing']=row['Winner']
    elif row['Award']=='Production Design':
            prod_awards.loc[row["Film"],'Production Design']=row['Winner']
    elif row['Award']=='Special Achievement Award (Visual Effects)':
            prod_awards.loc[row["Film"],'Special Achievement Award (Visual Effects)']=row['Winner']
    elif row['Award']=='Engineering Effects':
            prod_awards.loc[row["Film"],'Engineering Effects']=row['Winner']

In [17]:
# Set index back to numbers and not film name
prod_awards=prod_awards.reset_index()

# Actor Awards

In [25]:
actor_awards = pd.read_csv('awards.csv')
actor_awards = actor_awards.loc[((actor_awards['Award'] == "Actor") | (actor_awards['Award'] == "Actress")), 'Film']



0                             The Noose
1                      The Last Command
2                       A Ship Comes In
3                            7th Heaven
4                        Sadie Thompson
35                          Thunderbolt
36                       In Old Arizona
37                                Alibi
38                          The Valiant
39                          The Patriot
40                             Madame X
41                           The Barker
42                           The Letter
43                      The Divine Lady
44                  The Broadway Melody
45                             Coquette
73                             Disraeli
74                    The Green Goddess
75                        The Big House
76                         The Big Pond
77                     Bulldog Drummond
78                       The Rogue Song
79                  The Devil's Holiday
80                        Sarah and Son
81                        Anna Christie


In [26]:
arr_film_one = []
for i in range(0,actor_awards.shape[0]):
    try:
        if actor_awards[i] not in arr_film_one:
            arr_film_one.append(actor_awards[i])
    except KeyError:
        pass
    
total_movies = []
for i in range(0,df.shape[0]):
    try:
        if df.Film[i] not in total_movies:
            total_movies.append(df.Film[i])
    except KeyError:
        pass
    
arr_film_zero = list(arr_film_one)
for i in range(0,len(total_movies)):
    if total_movies[i] not in arr_film_zero:
        arr_film_zero.append(total_movies[i])

NameError: name 'df' is not defined