# Extracting Oscar winners into AllWinners

In [1]:
# For data processing
import numpy as np
import pandas as pd

# For API usage
import requests as r

# For progress bar
from tqdm import tqdm

In [3]:
# Read the raw Oscar awards table from disk, then preview its first five rows.
oscarData = pd.read_csv(filepath_or_buffer='datasets/the_oscar_award.csv')
oscarData.head(n=5)

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [4]:
# Sanity check on the load: confirm the object is a DataFrame and report
# its (rows, columns) shape.
print("Data type : ", type(oscarData))
print("Data dims : ", oscarData.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (10765, 7)


In [5]:
# List the dtype pandas inferred for each column of the loaded CSV.
print(oscarData.dtypes)

year_film         int64
year_ceremony     int64
ceremony          int64
category         object
name             object
film             object
winner             bool
dtype: object


In [6]:
# Boolean-mask selection: keep only the rows whose `winner` flag is True.
winners = oscarData.loc[oscarData['winner']]

In [7]:
# Discard every row that has a missing value in any column.
winners = winners.dropna(axis=0, how='any')

# Create two DataFrames: one with ALL winners, and another for further cleaning to select only some winners

In [8]:
# Remove the per-award columns so that only year_film, year_ceremony, film
# and winner remain, then collapse rows that became identical (keeping the
# first occurrence of each).
# NOTE(review): the displayed output contains the title 'The Dove;' with a
# trailing semicolon -- titles may need normalising in a later cleaning step;
# confirm against the raw CSV.
winners_ALL_clean = (
    winners
    .drop(['ceremony', 'category', 'name'], axis='columns')
    .drop_duplicates(keep='first')
)
winners_ALL_clean

Unnamed: 0,year_film,year_ceremony,film,winner
1,1927,1928,The Last Command,True
3,1927,1928,7th Heaven,True
6,1927,1928,The Dove;,True
9,1927,1928,Sunrise,True
11,1927,1928,Two Arabian Knights,True
...,...,...,...,...
10734,2022,2023,An Irish Goodbye,True
10743,2022,2023,Top Gun: Maverick,True
10745,2022,2023,Avatar: The Way of Water,True
10753,2022,2023,Women Talking,True


# Now that we have extracted all the winners (across every category, not just Best Actor, Best Actress and Best Picture) and dropped duplicates, reset the index

In [9]:
# Renumber the rows from 0 after filtering and de-duplication; drop=True
# discards the old row labels instead of keeping them as a new column.
winners_clean = winners_ALL_clean.reset_index(drop=True)

In [10]:
# Display the re-indexed frame to verify the new row numbering.
winners_clean

Unnamed: 0,year_film,year_ceremony,film,winner
0,1927,1928,The Last Command,True
1,1927,1928,7th Heaven,True
2,1927,1928,The Dove;,True
3,1927,1928,Sunrise,True
4,1927,1928,Two Arabian Knights,True
...,...,...,...,...
1325,2022,2023,An Irish Goodbye,True
1326,2022,2023,Top Gun: Maverick,True
1327,2022,2023,Avatar: The Way of Water,True
1328,2022,2023,Women Talking,True


# Export the DataFrame to a CSV file for use in data cleaning

In [11]:
# Write the de-duplicated winners table to disk. index=True also writes the
# row index as the first CSV column, so a reader will typically want
# `index_col=0` when loading this file back.
winners_clean.to_csv(path_or_buf='datasets/AllWinners.csv', index=True)