In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# create dataset for best picture

In [2]:
oscar_df = pd.read_csv("data/the_oscar_award.csv")
oscar_df = oscar_df[oscar_df['year_ceremony'] >= 1937]
oscar_df['category'] = oscar_df['category'].str.lower() 
bp_mask = oscar_df['category'].str.contains('best|outstanding (motion)? picture')

  bp_mask = oscar_df['category'].str.contains('best|outstanding (motion)? picture')


## Feature Engineering
### Add Nominations and win count

In [3]:
#create all masks
actor_mask = oscar_df['category'].str.contains('actor( in a leading role)?$')
actress_mask = oscar_df['category'].str.contains('actress( in a leading role)?$')
director_mask = oscar_df['category'].str.contains('directing')
writer_mask = oscar_df['category'].str.contains('writing.*screenplay')
sup_actor_mask = oscar_df['category'].str.contains('actor in a supporting role')
sup_actress_mask = oscar_df['category'].str.contains('actress in a supporting role')

mask_dct = {
    'actor': actor_mask,
    'actress': actress_mask,
    'director': director_mask,
    'writer': writer_mask,
    'sup_actor': sup_actor_mask,
    'sup_actress': sup_actress_mask,
    'best_picture': bp_mask,
}

for k,v in mask_dct.items():
    oscar_df['category'] = np.where(v,k,oscar_df['category'])



big5_mask = actor_mask | actress_mask | bp_mask | director_mask | writer_mask
atl_mask = big5_mask | sup_actor_mask | sup_actress_mask

  actor_mask = oscar_df['category'].str.contains('actor( in a leading role)?$')
  actress_mask = oscar_df['category'].str.contains('actress( in a leading role)?$')


In [4]:
noms = oscar_df.groupby(['year_film', 'film']).size().rename('noms')
wins = oscar_df[oscar_df['winner']==True & ~bp_mask].groupby(['year_film', 'film'])['category'].nunique().rename('wins')

big5_noms = oscar_df[big5_mask].groupby(['year_film', 'film'])['category'].nunique().rename('big5_noms')
big5_wins = oscar_df[~bp_mask & big5_mask & oscar_df['winner']==True].groupby(['year_film', 'film'])['category'].nunique().rename('big5_wins')

atl_noms = oscar_df[atl_mask].groupby(['year_film', 'film'])['category'].nunique().rename('atl_noms')
atl_wins = oscar_df[~bp_mask & atl_mask & oscar_df['winner']==True].groupby(['year_film', 'film'])['category'].nunique().rename('atl_wins')

for ser in (noms,wins,big5_noms,big5_wins,atl_noms,atl_wins):
    oscar_df = pd.merge(oscar_df,ser,left_on=['year_film', 'film'],right_index=True,how='left')

oscar_df[['wins','big5_wins','atl_wins']] = oscar_df[['wins','big5_wins','atl_wins']].fillna(0)


In [5]:
noms_df = (~oscar_df[atl_mask].astype({'winner':int}).pivot_table(index=['year_ceremony','film'],columns='category',values='winner').isna())
wins_df = (~oscar_df[atl_mask&(oscar_df['winner'])].astype({'winner':int}).pivot_table(index=['year_ceremony','film'],columns='category',values='wins').isna())

#remove best_film from both datasets
noms_df = noms_df.drop('best_picture',axis=1)
wins_df = wins_df.drop('best_picture',axis=1)

In [6]:
oscar_df = pd.merge(oscar_df,noms_df,left_on=['year_ceremony','film'],right_index=True,how='left',suffixes=('','_noms'))
oscar_df = pd.merge(oscar_df,wins_df,left_on=['year_ceremony','film'],right_index=True,how='left',suffixes=('_noms','_wins'))


In [7]:
oscar_df.isna().sum()
oscar_df[['atl_noms','big5_noms']] = oscar_df[['atl_noms','big5_noms']].fillna(0)
noms_wins_cols = oscar_df.columns[oscar_df.columns.str.contains('_wins|_noms$')]
oscar_df[noms_wins_cols] = oscar_df[noms_wins_cols].fillna(False)

## Filter Best Picture, remove data leakage, split data

In [8]:
model_df = oscar_df[oscar_df['category']=='best_picture'].set_index(['year_ceremony','film']).drop('category',axis=1)
model_df.drop(['year_film','ceremony','name'],axis=1,inplace=True)
model_df = model_df.astype(int)


#remove data leakage
model_df[model_df['winner']==1][['wins','big5_wins','atl_wins']] = model_df[model_df['winner']==1][['wins','big5_wins','atl_wins']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df[model_df['winner']==1][['wins','big5_wins','atl_wins']] = model_df[model_df['winner']==1][['wins','big5_wins','atl_wins']]


In [9]:
test_mask = model_df.index.get_level_values(0) >= 2007

train_df = model_df[~test_mask]
test_df = model_df[test_mask]

# Train Model

In [10]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
model = GaussianNB()
model.fit(train_df.drop('winner',axis=1),train_df['winner'])

test_df['predict'] = model.predict_proba(test_df.drop('winner',axis=1))[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predict'] = model.predict_proba(test_df.drop('winner',axis=1))[:,1]


In [11]:
predictions = test_df.groupby('year_ceremony')['predict'].idxmax()
actual = test_df.groupby('year_ceremony')['winner'].idxmax()

In [12]:

(predictions == actual).sum()/len(predictions)

0.6470588235294118

## Create model based on actors


In [13]:
model_df

Unnamed: 0_level_0,Unnamed: 1_level_0,winner,noms,wins,big5_noms,big5_wins,atl_noms,atl_wins,actor_noms,actress_noms,director_noms,sup_actor_noms,sup_actress_noms,writer_noms,actor_wins,actress_wins,director_wins,sup_actor_wins,sup_actress_wins,writer_wins
year_ceremony,film,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1942,Blossoms in the Dust,0,4,2,2,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0
1942,Citizen Kane,0,9,2,4,1,4,1,1,0,1,0,0,1,0,0,0,0,0,1
1942,Here Comes Mr. Jordan,0,7,3,4,1,5,1,1,0,1,1,0,1,0,0,0,0,0,1
1942,Hold Back the Dawn,0,6,1,3,0,3,0,0,1,0,0,0,1,0,0,0,0,0,0
1942,How Green Was My Valley,1,10,4,3,1,5,2,0,0,1,1,1,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,The Fabelmans,0,7,1,4,0,5,0,0,1,1,1,0,1,0,0,0,0,0,0
2023,Tár,0,6,1,4,0,4,0,0,1,1,0,0,1,0,0,0,0,0,0
2023,Top Gun: Maverick,0,6,2,2,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0
2023,Triangle of Sadness,0,3,1,3,0,3,0,0,0,1,0,0,1,0,0,0,0,0,0


In [14]:
#load database of cast
cast_pkl = open("cast_info.pickle", "rb")
cast_info = pickle.load(cast_pkl)
cast_db = pd.DataFrame(cast_info).T

#database is indexed by tmdb id, so load tmdb to get mapping
tmdb = pd.read_csv('data/tmdb.csv',index_col=0)

#create multiIndex based on nested list
new_index = pd.MultiIndex.from_tuples([(int(yr),title) for (title,yr) in tmdb.index.str.split('_')])
tmdb.index = new_index


#create a df just mapping the cast
cast_df = pd.merge(tmdb['id'],cast_db,left_on='id',right_index=True,how='left')['cast']

#only keep ids and then explode with id representing a 
cast_df = cast_df[~cast_df.isna()].apply(lambda x: [i['id'] for i in x])

In [15]:
#create dataframe
t = cast_df[~cast_df.isna()].explode().to_frame()

#remove duplicates within a movie
t = t.reset_index().drop_duplicates(subset=['level_0','level_1','cast'])#.set_index(['level_0','level_1'])
#pivot list to dataframe where cast value is a column and 1 represents that cast member was in that movie
t = t.groupby(['level_0','level_1','cast']).size().unstack(fill_value=0)
t.index.get_level_values(0).astype(int)
t.index.rename(['year_film','film'],inplace=True)


In [30]:
#create actor df for modelling
actor_df = oscar_df[bp_mask].merge(t,right_index=True,left_on=('year_film','film'),how='left')
actor_df = actor_df.set_index(['year_ceremony','film'])


# remove unnessecary columns
cols = ['winner'] + t.columns.to_list()
actor_df = actor_df[cols]
actor_df['winner'] = actor_df['winner'].astype(int)

# remove columns less than a certain amount
threshold = 1
cols = actor_df.columns[(actor_df.sum()>threshold)|actor_df.columns.str.contains('winner')]
actor_df = actor_df[cols]

In [31]:
actor_df =actor_df[~actor_df.isna().any(axis=1)]
#not sure why 4 films have no data but will remove for now


test_mask = actor_df.index.get_level_values(0) >= 2007

train_df = actor_df[~test_mask]
test_df = actor_df[test_mask]

In [32]:
model = GaussianNB()
model.fit(train_df.drop('winner',axis=1),train_df['winner'])

test_df['predict'] = model.predict_proba(test_df.drop('winner',axis=1))[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predict'] = model.predict_proba(test_df.drop('winner',axis=1))[:,1]


In [33]:
predictions = test_df.groupby('year_ceremony')['predict'].idxmax()
actual = test_df.groupby('year_ceremony')['winner'].idxmax()

In [34]:
(predictions == actual).sum()/len(predictions)

0.17647058823529413

Cast model is awful. Very rare that a actor has a high likelihood of being part of an academy award so biases lesser known actors / casts.

## Create model based on film description

In [51]:
#add overview column from TMBD to oscars dataframe
overview_df = pd.merge(oscar_df[bp_mask][['year_film','film','winner']],tmdb['overview'],left_on=['year_film','film'],right_index=True,how='left').set_index(['year_film','film'])


In [38]:
test_df['predict'].tail(30)

year_ceremony  film                             
2020           1917                                 0.0
               Once upon a Time...in Hollywood      0.0
               Parasite                             1.0
2021           The Father                           0.0
               Mank                                 0.0
               Minari                               0.0
               Nomadland                            0.0
               Promising Young Woman                0.0
               Sound of Metal                       0.0
               The Trial of the Chicago 7           0.0
2022           Belfast                              0.0
               CODA                                 0.0
               Don't Look Up                        0.0
               Drive My Car                         1.0
               Dune                                 0.0
               King Richard                         0.0
               Licorice Pizza                       0.0

In [22]:
predictions

year_ceremony
2007                             (2007, Babel)
2008                         (2008, Atonement)
2009               (2009, Slumdog Millionaire)
2010                    (2010, The Blind Side)
2011                     (2011, Winter's Bone)
2012                        (2012, The Artist)
2013       (2013, Beasts of the Southern Wild)
2014                           (2014, Gravity)
2015                   (2015, American Sniper)
2016                              (2016, Room)
2017                    (2017, Hidden Figures)
2018              (2018, Call Me by Your Name)
2019                     (2019, Black Panther)
2020                          (2020, Parasite)
2021                        (2021, The Father)
2022                      (2022, Drive My Car)
2023    (2023, All Quiet on the Western Front)
Name: predict, dtype: object

Unnamed: 0,Unnamed: 1,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
1927,The Noose,,,,,,,,,,,,,,
1927,The Last Command,,,,,,,,,,,,,,
1927,A Ship Comes In,,,,,,,,,,,,,,
1927,7th Heaven,False,/qRDUrxnMjLGXBqooHvfT9zzICZ0.jpg,"[18, 10749]",82474.0,en,7th Heaven,A dejected Parisian sewer worker feels his pra...,5.476,/3ETXMu5PrcVMRJpfN2Z3MwcfXzy.jpg,1927-09-10,7th Heaven,False,7.4,77.0
1927,Sadie Thompson,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,Ivalu,False,/572w5U3T7CiyAswyshiS48vO7uR.jpg,[18],1042171.0,kl,Ivalu,Ivalu is gone. Her little sister is desperate ...,1.814,/qxu1Rv9BSTMlVUKrQy2tPWEFRv9.jpg,2022-01-01,Ivalu,False,0.0,0.0
2022,Le Pupille,False,/aHkjDwtkPkuc88X7FBQuN1EMjis.jpg,"[10751, 35]",974586.0,it,Le pupille,A facetious coming-of-age fable that ends with...,8.162,/tuee2So5p4zWbEznpL8EKRT4cre.jpg,2022-05-27,Le Pupille,False,6.9,21.0
2022,Night Ride,False,,[18],1043141.0,hr,Noćna vožnja,"Through a sequence of dreams and nightmares, D...",1.473,/keukc9pxXxGOKX8l40x4WJ5pydl.jpg,2022-10-26,Night Ride,False,0.0,0.0
2022,The Red Suitcase,False,/lF1bmtWOfwqlWcu7ybJYvdqzQPD.jpg,[18],1032734.0,fa,La valise rouge,A veiled 16 year old Iranian teenager is terri...,3.487,/erJFQ6iVz73HoDG99GcwNNQN5up.jpg,2022-09-28,The Red Suitcase,False,0.0,0.0
