In [1]:
# import dependencies
import pandas as pd
import os

In [2]:
# Create the path and import the data
oscar_data = os.path.join('../Margo_group_work/the_oscar_award.csv')
oscar_df = pd.read_csv(oscar_data)

In [3]:
# View data
oscar_df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [4]:
# View tail of dataframe
oscar_df.tail()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
10754,2022,2023,95,Writing (Original Screenplay),Written by Martin McDonagh,The Banshees of Inisherin,
10755,2022,2023,95,Writing (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,
10756,2022,2023,95,Writing (Original Screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,
10757,2022,2023,95,Writing (Original Screenplay),Written by Todd Field,Tár,
10758,2022,2023,95,Writing (Original Screenplay),Written by Ruben Östlund,Triangle of Sadness,


In [5]:
# Check for null values
oscar_df.isnull().sum()

year_film          0
year_ceremony      0
ceremony           0
category           0
name               3
film             310
winner           120
dtype: int64

In [6]:
# Drop rows with null values and verify removal
oscar_df.dropna(axis=0,how="any",inplace=True)
oscar_df.isnull().sum()

year_film        0
year_ceremony    0
ceremony         0
category         0
name             0
film             0
winner           0
dtype: int64

In [7]:
# Check for duplicated rows
oscar_df.duplicated().sum()

6

In [8]:
# Drop duplicated rows and verify removal
oscar_df = oscar_df.drop_duplicates()
oscar_df.duplicated().sum()

0

In [26]:
# view head of cleaned oscar dataframe
oscar_df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [9]:
# Check data types
oscar_df.dtypes

year_film         int64
year_ceremony     int64
ceremony          int64
category         object
name             object
film             object
winner           object
dtype: object

In [10]:
# Display summary statistics 
oscar_df.describe()

Unnamed: 0,year_film,year_ceremony,ceremony
count,10323.0,10323.0,10323.0
mean,1975.710743,1976.710743,48.733314
std,26.667309,26.667309,26.628722
min,1927.0,1928.0,1.0
25%,1952.0,1953.0,25.0
50%,1975.0,1976.0,48.0
75%,1999.0,2000.0,72.0
max,2021.0,2022.0,94.0


In [11]:
# Examine the winner column
oscar_df['winner'].unique()

array([False, True], dtype=object)

In [12]:
oscar_df['winner'].value_counts()

False    8198
True     2125
Name: winner, dtype: int64

In [13]:
winner_oscar = oscar_df.loc[(oscar_df['winner'] == 'False')]
winner_oscar.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner


In [14]:
# Explore categories
oscar_df['category'].unique()

array(['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY',
       'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)',
       'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE',
       'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)',
       'WRITING (Original Story)', 'WRITING (Title Writing)', 'DIRECTING',
       'WRITING', 'OUTSTANDING PRODUCTION', 'SOUND RECORDING',
       'SHORT SUBJECT (Cartoon)', 'SHORT SUBJECT (Comedy)',
       'SHORT SUBJECT (Novelty)', 'ASSISTANT DIRECTOR', 'FILM EDITING',
       'MUSIC (Scoring)', 'MUSIC (Song)', 'DANCE DIRECTION',
       'WRITING (Screenplay)', 'ACTOR IN A SUPPORTING ROLE',
       'ACTRESS IN A SUPPORTING ROLE', 'SHORT SUBJECT (Color)',
       'SHORT SUBJECT (One-reel)', 'SHORT SUBJECT (Two-reel)',
       'MUSIC (Original Score)', 'CINEMATOGRAPHY (Black-and-White)',
       'CINEMATOGRAPHY (Color)', 'SPECIAL EFFECTS',
       'ART DIRECTION (Black-and-White)', 'ART DIRECTION (Color)',
       'WRITING (Original Screenplay)', 'DOCUME

In [15]:
# Count category column
oscar_df['category'].value_counts()

DIRECTING                                    459
FILM EDITING                                 440
ACTRESS IN A SUPPORTING ROLE                 430
ACTOR IN A SUPPORTING ROLE                   430
DOCUMENTARY (Short Subject)                  378
                                            ... 
DIRECTING (Comedy Picture)                     2
SPECIAL ACHIEVEMENT AWARD (Sound Editing)      1
SPECIAL ACHIEVEMENT AWARD (Sound Effects)      1
WRITING (Title Writing)                        1
ENGINEERING EFFECTS                            1
Name: category, Length: 104, dtype: int64

In [16]:
# create dataframe for sound effects category
sound_effects = oscar_df.loc[(oscar_df['category'] == 'SOUND EFFECTS EDITING')]
sound_effects.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
6130,1982,1983,55,SOUND EFFECTS EDITING,Mike Le-Mare,Das Boot,False
6131,1982,1983,55,SOUND EFFECTS EDITING,"Charles L. Campbell, Ben Burtt",E.T. The Extra-Terrestrial,True
6132,1982,1983,55,SOUND EFFECTS EDITING,"Stephen Hunter Flick, Richard L. Anderson",Poltergeist,False
6237,1983,1984,56,SOUND EFFECTS EDITING,Ben Burtt,Return of the Jedi,False
6238,1983,1984,56,SOUND EFFECTS EDITING,Jay Boekelheide,The Right Stuff,True


In [17]:
# count sound effects dataframe
sound_effects['winner'].value_counts()

False    31
True     16
Name: winner, dtype: int64

In [18]:
# Explore category column 'ACTOR'
actor_df = oscar_df.loc[(oscar_df['category'] == 'ACTOR')]
actor_df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
35,1928,1929,2,ACTOR,George Bancroft,Thunderbolt,False
36,1928,1929,2,ACTOR,Warner Baxter,In Old Arizona,True
37,1928,1929,2,ACTOR,Chester Morris,Alibi,False


In [19]:
# count actor dataframe
actor_df['winner'].value_counts()

False    183
True      49
Name: winner, dtype: int64

In [20]:
# Explore category column 'ACTOR'
actress_df = oscar_df.loc[(oscar_df['category'] == 'ACTRESS')]
actress_df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
40,1928,1929,2,ACTRESS,Ruth Chatterton,Madame X,False
41,1928,1929,2,ACTRESS,Betty Compson,The Barker,False


In [21]:
# count actress dataframe
actress_df['winner'].value_counts()

False    187
True      49
Name: winner, dtype: int64

In [25]:
# count actress 
actress_df.count()

year_film        236
year_ceremony    236
ceremony         236
category         236
name             236
film             236
winner           236
dtype: int64

In [27]:
oscar_df.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [33]:
by_film_df = oscar_df.iloc[0:,[5,3,4,6]]
by_film_df

Unnamed: 0,film,category,name,winner
0,The Noose,ACTOR,Richard Barthelmess,False
1,The Last Command,ACTOR,Emil Jannings,True
2,A Ship Comes In,ACTRESS,Louise Dresser,False
3,7th Heaven,ACTRESS,Janet Gaynor,True
4,Sadie Thompson,ACTRESS,Gloria Swanson,False
...,...,...,...,...
10630,Belfast,WRITING (Original Screenplay),Written by Kenneth Branagh,True
10631,Don't Look Up,WRITING (Original Screenplay),Screenplay by Adam McKay; Story by Adam McKay ...,False
10632,King Richard,WRITING (Original Screenplay),Written by Zach Baylin,False
10633,Licorice Pizza,WRITING (Original Screenplay),Written by Paul Thomas Anderson,False


In [None]:
by_film_df