In [37]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

In [3]:
oscar_winners = pd.read_csv('/Users/paulettedvm/Downloads/the_oscar_award.csv')
oscar_winners

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10884,2023,2024,96,WRITING (Original Screenplay),Written by Celine Song,Past Lives,False
10885,2023,2024,96,JEAN HERSHOLT HUMANITARIAN AWARD,,,True
10886,2023,2024,96,HONORARY AWARD,"To Angela Bassett, who has inspired audiences ...",,True
10887,2023,2024,96,HONORARY AWARD,"To Mel Brooks, for his comedic brilliance, pro...",,True


In [4]:
# cut the data so that 'year_film' is 2000-2023

oscar_winners_2000_2023 = oscar_winners[oscar_winners['year_film'] >= 2000]
oscar_winners_2000_2023

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
8007,2000,2001,73,ACTOR IN A LEADING ROLE,Javier Bardem,Before Night Falls,False
8008,2000,2001,73,ACTOR IN A LEADING ROLE,Russell Crowe,Gladiator,True
8009,2000,2001,73,ACTOR IN A LEADING ROLE,Tom Hanks,Cast Away,False
8010,2000,2001,73,ACTOR IN A LEADING ROLE,Ed Harris,Pollock,False
8011,2000,2001,73,ACTOR IN A LEADING ROLE,Geoffrey Rush,Quills,False
...,...,...,...,...,...,...,...
10884,2023,2024,96,WRITING (Original Screenplay),Written by Celine Song,Past Lives,False
10885,2023,2024,96,JEAN HERSHOLT HUMANITARIAN AWARD,,,True
10886,2023,2024,96,HONORARY AWARD,"To Angela Bassett, who has inspired audiences ...",,True
10887,2023,2024,96,HONORARY AWARD,"To Mel Brooks, for his comedic brilliance, pro...",,True


In [5]:
# delete null values

oscar_winners_2000_2023 = oscar_winners_2000_2023.dropna()

In [6]:
oscar_winners_2000_2023 = oscar_winners_2000_2023.drop(columns=['ceremony', 'category', 'name'])

In [7]:
# group by film and drop necessary columns

oscar_winners_2000_2023 = oscar_winners_2000_2023.groupby('film').agg(
    year_film=('year_film','first'), 
    year_ceremony=('year_ceremony','first'),
    wins=('winner','sum'),
    nominations=('winner','count')
).reset_index()

In [8]:
# adding boolean column winner

oscar_winners_2000_2023['winner'] = np.where(oscar_winners_2000_2023['wins'] > 0, True, False)

In [9]:
oscar_winners_2000_2023[oscar_winners_2000_2023['film'] == 'Inception']

Unnamed: 0,film,year_film,year_ceremony,wins,nominations,winner
493,Inception,2010,2011,4,8,True


In [10]:
# now, read the other imdb dataset to join with the kaggle
oscar_winners_2000_2023_imdb = pd.read_csv('/Users/paulettedvm/oscars_2000_2023_nominations_imdb.csv')

In [11]:
# join the two datasets (or concat ?? don't know)
# inner join to avoid NaN??
# or really i need to merge these two datasets based on the title column
oscar_winners_2000_2023_imdb = oscar_winners_2000_2023_imdb.fillna('Unknown')

In [12]:
oscar_winners_2000_2023_imdb = oscar_winners_2000_2023_imdb.rename(columns={'title':'film'})

In [13]:
oscar_winners_2000_2023_imdb

Unnamed: 0,film,year,runtime,age_rating,star_rating,rating_count
0,The Dark Knight,2008,2h 32m,PG-13,9.0,3000000
1,Inception,2010,2h 28m,PG-13,8.8,2600000
2,Interstellar,2014,2h 49m,PG-13,8.7,2300000
3,The Lord of the Rings: The Fellowship of the Ring,2001,2h 58m,PG-13,8.9,2100000
4,The Lord of the Rings: The Return of the King,2003,3h 21m,PG-13,9.0,2100000
...,...,...,...,...,...,...
972,Scottsboro: An American Tragedy,2000,1h 24m,Unknown,7.5,369
973,Watani: My Homeland,2016,1h 16m,Unknown,7.4,305
974,Open Heart,2013,40m,Unknown,7.1,173
975,Legacy,2000,1h 30m,Not Rated,6.9,161


In [14]:
# now merge oscar wins + imdb data 
# make it an inner join

merged_oscars_2000_2023 = pd.merge(oscar_winners_2000_2023, oscar_winners_2000_2023_imdb, on='film', how='inner')

In [15]:
# drop year column since it's duplicated w/ 'year_film'

merged_oscars_2000_2023 = merged_oscars_2000_2023.drop(columns=['year'])

In [16]:
merged_oscars_2000_2023

Unnamed: 0,film,year_film,year_ceremony,wins,nominations,winner,runtime,age_rating,star_rating,rating_count
0,102 Dalmatians,2000,2001,0,1,False,1h 40m,G,4.8,40000
1,12,2007,2008,0,1,False,2h 39m,PG-13,7.5,16000
2,12 Years a Slave,2013,2014,3,9,True,2h 14m,R,8.1,758000
3,127 Hours,2010,2011,0,6,False,1h 34m,R,7.5,411000
4,13 Hours: The Secret Soldiers of Benghazi,2016,2017,0,1,False,2h 24m,R,7.3,169000
...,...,...,...,...,...,...,...,...,...,...
910,Youth,2015,2016,0,1,False,2h 4m,R,7.3,85000
911,Zelary,2003,2004,0,1,False,2h 30m,R,7.5,3800
912,Zero Dark Thirty,2012,2013,1,5,True,2h 37m,R,7.4,328000
913,Zootopia,2016,2017,1,1,True,1h 48m,PG,8.0,571000


In [None]:
# okay now that this is done, we can deal with the 2025 nominations dataset

In [17]:
oscar_2025_noms = pd.read_csv('/Users/paulettedvm/oscars_2025_nominations_imdb_filled.csv')

In [19]:
oscar_2025_noms = oscar_2025_noms.fillna('Unrated')

In [20]:
oscar_2025_noms.head()

Unnamed: 0,title,year_film,runtime,age_rating,star_rating,rating_count,nominations,year_ceremony
0,Dune: Part Two,2024,2h 46m,PG-13,8.5,592000,5,2025
1,The Substance,2024,2h 21m,R,7.3,232000,5,2025
2,Alien: Romulus,2024,1h 59m,R,7.1,222000,1,2025
3,Inside Out 2,2024,1h 36m,PG,7.6,203000,1,2025
4,Gladiator II,2024,2h 28m,R,6.6,189000,1,2025


In [196]:
# try my hand at modeling
# fit/build the model using the 2000_2023 data, test it and then use it on the 2025 data

In [21]:
y = merged_oscars_2000_2023.winner

In [22]:
features = ['nominations', 'star_rating', 'rating_count']
X = merged_oscars_2000_2023[features]

# i want to use 'wins as a feature but i can't because when i go to test it
# with the oscars 2025 data (what i actually want to predict) i can't 
# because the dataset doesn't have that feature... because it HASNT HAPPENED
# it's part of what i want to PREDICTTT

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [24]:
# simple decision tree model

oscar_dt_model = DecisionTreeClassifier(random_state=1)
oscar_dt_model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1)

In [25]:
y_pred = oscar_dt_model.predict(X_test)

In [26]:
acc_dt = accuracy_score(y_test, y_pred)
acc_dt

0.7554585152838428

In [27]:
# decision tree model with limited leafs

leaf_limit_model = DecisionTreeClassifier(max_leaf_nodes=100)
leaf_limit_model.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=100)

In [28]:
leaf_limit_pred = leaf_limit_model.predict(X_test)

In [29]:
acc_leaf_limit = accuracy_score(y_test, leaf_limit_pred)
acc_leaf_limit

# slightly more accuracy

0.7816593886462883

In [39]:
ensemble_model = VotingClassifier(estimators=[
    ('xgb', XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10)),
    ('lr', LogisticRegression(max_iter=1000))
], voting='soft')
ensemble_model.fit(X_train, y_train)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_ra...
                                            max_cat_threshold=None,
    

In [40]:
ensemble_y_pred = ensemble_model.predict(X_test)

In [42]:
ensemble_y_pred

array([False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True, False,  True,
       False, False, False,  True,  True, False,  True,  True, False,
       False, False, False,  True, False,  True, False, False,  True,
       False, False, False, False, False, False,  True, False, False,
       False, False, False,  True, False, False, False, False,  True,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False, False,  True, False, False, False,  True,
       False, False, False, False,  True, False, False,  True,  True,
       False,  True,  True, False,  True, False,  True, False,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False,

In [44]:
X_train

Unnamed: 0,nominations,star_rating,rating_count
790,2,8.0,87000
185,1,7.6,319000
363,2,7.9,1200000
888,1,7.4,305
531,6,7.4,150000
...,...,...,...
767,3,7.2,257000
72,2,7.7,208000
908,1,7.5,2500
235,2,8.3,1100000


In [41]:
accuracy_score(y_test, ensemble_y_pred)

0.8253275109170306

In [None]:
## use trained ensemble model to predict 2025 winners

In [45]:
X_2025 = oscar_2025_noms[features]

In [46]:
y_2025_probs = ensemble_model.predict_proba(X_2025)[:, 1]

y_2025_preds = ensemble_model.predict(X_2025)

oscar_2025_noms['predicted_winner'] = y_2025_preds
oscar_2025_noms['winning_probability'] = y_2025_probs

In [48]:
oscar_2025_noms_sorted = oscar_2025_noms.sort_values(by='winning_probability', ascending=False)
oscar_2025_noms_sorted

Unnamed: 0,title,year_film,runtime,age_rating,star_rating,rating_count,nominations,year_ceremony,predicted_winner,winning_probability
1,The Substance,2024,2h 21m,R,7.3,232000,5,2025,True,0.718891
0,Dune: Part Two,2024,2h 46m,PG-13,8.5,592000,5,2025,True,0.698985
9,Conclave,2024,2h,PG,7.4,86000,8,2025,True,0.649418
8,Wicked,2024,2h 40m,PG,7.7,120000,10,2025,True,0.602095
11,Emilia Perez,2024,2h 12m,R,5.7,56000,13,2025,True,0.580223
7,The Wild Robot,2024,1h 42m,PG,8.2,128000,3,2025,True,0.54666
10,Anora,2024,2h 19m,R,7.8,83000,6,2025,True,0.53828
19,I'm Still Here,2024,2h 17m,PG-13,8.8,19000,3,2025,False,0.486215
20,The Brutalist,2024,3h 34m,R,8.0,17000,10,2025,False,0.455392
3,Inside Out 2,2024,1h 36m,PG,7.6,203000,1,2025,False,0.445076
