In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.ticker as mtick

In [None]:
# Load the Oscar awards CSV (path is relative to the notebook's working directory)
oscar_dataset = pd.read_csv("the_oscar_award.csv")

# Preview the first rows; as the last expression this is rendered as the cell output
oscar_dataset.head()


In [None]:

# Summarize the frame: column names, non-null counts, dtypes and memory usage
oscar_dataset.info()

In [None]:
# Missing values per column before any cleaning
null_counts_raw = oscar_dataset.isna().sum()
print(null_counts_raw)

# Discard rows that do not name a film; only the 'film' column is checked here
oscars_clean = oscar_dataset.dropna(subset=["film"])
null_counts_clean = oscars_clean.isna().sum()
print(null_counts_clean)

# Distinct award categories that remain after the drop
print(oscars_clean["category"].unique())


In [None]:

# Award categories to keep (the category list taken from the website)
filtered_categories = [
    'ACTOR IN A LEADING ROLE',
    'ACTRESS IN A LEADING ROLE',
    'ACTOR IN A SUPPORTING ROLE',
    'ACTRESS IN A SUPPORTING ROLE',
    'ANIMATED FEATURE FILM',
    'CINEMATOGRAPHY',
    'COSTUME DESIGN',
    'DIRECTING',
    'DOCUMENTARY FEATURE FILM',
    'FILM EDITING',
    'INTERNATIONAL FEATURE FILM',
    'MAKEUP AND HAIRSTYLING',
    'MUSIC (ORIGINAL SCORE)',
    'MUSIC (ORIGINAL SONG)',
    'BEST PICTURE',
    'PRODUCTION DESIGN',
    'ANIMATED SHORT FILM',
    'LIVE ACTION SHORT FILM',
    'SOUND',
    'VISUAL EFFECTS',
    'WRITING (ADAPTED SCREENPLAY)',
    'WRITING (ORIGINAL SCREENPLAY)',
]

# Film years 2012 through 2024 inclusive (13 values).
# Only the disabled year filter below refers to this list.
filtered_years = list(range(2012, 2025))

# Keep the cleaned rows whose category is one of the selected ones
category_mask = oscars_clean['category'].isin(filtered_categories)
filtered_data = oscars_clean[category_mask]
# The year restriction is currently switched off, so all years are retained:
#filtered_data = filtered_data[filtered_data['year_film'].isin(filtered_years)]

# Sanity check: preview the rows, then show remaining nulls per column
print(filtered_data.head())
filtered_data.isna().sum()


In [None]:

# Find the most frequent winner in each category.
# Only winning rows are counted: `winner` is applied as a boolean mask, the same way
# the previous-winners lookup in this notebook uses it. Without this restriction the
# tally would be nominations, contradicting the chart's "Number of Wins" axis.
winning_rows = filtered_data[filtered_data['winner']]
top_winners_by_category = winning_rows.groupby(['category', 'name']).size().reset_index(name='count')

# Within each category put the highest win count first, then keep that single row
top_winners_by_category = top_winners_by_category.sort_values(by=['category', 'count'], ascending=[True, False])
top_winners_by_category = top_winners_by_category.drop_duplicates(subset='category', keep='first')

# Plot the top winner of each category.
# NOTE(review): head(10) keeps the first ten categories in alphabetical order, not the
# ten largest counts - confirm that this is the intended selection.
plt.figure(figsize=(14, 8))
sns.barplot(x='count', y='name', hue='category', data=top_winners_by_category.head(10), palette='viridis')
plt.title('Top Oscar Winners by Category')
plt.xlabel('Number of Wins')
plt.ylabel('Winner')
plt.show()


In [None]:
# Count rows (one per nomination) for every film title, largest count first.
# NOTE(review): the grouping key is the title alone, so different films that share a
# title are added together - confirm whether the film year should be part of the key.
movie_nominations = (
    filtered_data
    .groupby('film')
    .size()
    .reset_index(name='total_nominations')
    .sort_values(by='total_nominations', ascending=False)
)

# Horizontal bar chart of the ten most-nominated titles
most_nominated = movie_nominations.head(10)
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x='total_nominations', y='film', data=most_nominated, palette='viridis', ax=ax)
ax.set_title('Movies with the Most Nominations')
ax.set_xlabel('Total Nominations')
ax.set_ylabel('Movie')
plt.show()



In [3]:
# Load the current (2024) Oscar nominations from the Excel workbook
oscars_2024 = pd.read_excel("Oscars_2024.xlsx")

# Inspect: dtypes and non-null counts first, then the first ten rows as the cell output
oscars_2024.info()
oscars_2024.head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   category              120 non-null    object 
 1   name                  120 non-null    object 
 2   film                  120 non-null    object 
 3   genre                 120 non-null    object 
 4   worldwide box office  120 non-null    int64  
 5   rotten tomatoes       105 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 5.8+ KB


Unnamed: 0,category,name,film,genre,worldwide box office,rotten tomatoes
0,Actor In A Leading Role,Bradley Cooper,Maestro,Romance,383532,0.79
1,Actor In A Leading Role,Colman Domingo,Rustin,Drama,0,0.84
2,Actor In A Leading Role,Paul Giamatti,The Holdovers,Comedy,42246660,0.97
3,Actor In A Leading Role,Cillian Murphy,Oppenheimer,Thriller,957700200,0.93
4,Actor In A Leading Role,Jeffrey Wright,American Fiction,Comedy,21983570,0.94
5,Actress In A Leading Role,Annette Bening,Nyad,Sports,16056,0.86
6,Actress In A Leading Role,Lily Gladstone,Killers of the Flower Moon,Western,156874211,0.93
7,Actress In A Leading Role,Sandra Huller,Anatomy of a Fall,Thriller,31054565,0.96
8,Actress In A Leading Role,Carey Mulligan,Maestro,Romance,383532,0.79
9,Actress In A Leading Role,Emma Stone,Poor Things,Sci-Fi,104571592,0.92


In [None]:
# Number of 2024 nominations per film, most-nominated first
nominations_per_film = oscars_2024.groupby('film').size()
movie_nominations_2024 = (
    nominations_per_film
    .reset_index(name='total_nominations')
    .sort_values(by='total_nominations', ascending=False)
)

# Bar chart of the ten films with the most nominations
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    x='total_nominations',
    y='film',
    data=movie_nominations_2024.head(10),
    palette='viridis',
    ax=ax,
)
ax.set_title('Movies with the Most Nominations')
ax.set_xlabel('Total Nominations')
ax.set_ylabel('Movie')
plt.show()


In [None]:
# Nominee names with their spaces removed. The word clouds below no longer read this
# column; it is still created so the frame keeps the same columns as before.
oscars_2024['Full Name'] = oscars_2024['name'].str.replace(' ', '')


def generate_wordcloud(text=None, frequencies=None):
    """Build an 800x400 word cloud on a white background.

    Parameters
    ----------
    text : str, optional
        Free text handed to ``WordCloud.generate``, which splits it into words itself.
    frequencies : mapping of str to number, optional
        Pre-computed label weights handed to ``WordCloud.generate_from_frequencies``.
        Labels are drawn as given, so multi-word or hyphenated labels stay intact.
        Takes precedence over ``text`` when both are supplied.

    Returns
    -------
    WordCloud
        The generated cloud, ready for ``plt.imshow``.

    Raises
    ------
    ValueError
        If neither ``text`` nor ``frequencies`` is provided.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    if frequencies is not None:
        return cloud.generate_from_frequencies(frequencies)
    if text is None:
        raise ValueError("generate_wordcloud needs either text or frequencies")
    return cloud.generate(text)


# Count each label once per row so every name/genre is weighted by how often it occurs.
# Passing counts (instead of one joined string) keeps "Bradley Cooper" or "Sci-Fi" as a
# single label; editing `words_` after generation does not alter the rendered image.
# A row whose name lists several people stays one label.
name_counts = oscars_2024['name'].str.strip().value_counts().to_dict()
genre_counts = oscars_2024['genre'].str.strip().value_counts().to_dict()

# Plot the word clouds side by side
plt.figure(figsize=(12, 6))

# Left panel: nominee names
wordcloud_names = generate_wordcloud(frequencies=name_counts)
plt.subplot(1, 2, 1)
plt.imshow(wordcloud_names, interpolation='bilinear')
plt.title('Word Cloud for Names')
plt.axis('off')

# Right panel: genres
wordcloud_genre = generate_wordcloud(frequencies=genre_counts)
plt.subplot(1, 2, 2)
plt.imshow(wordcloud_genre, interpolation='bilinear')
plt.title('Word Cloud for Genre')
plt.axis('off')

plt.show()

In [None]:
# Names that occur both in the historical data (filtered_data) and in the 2024
# nominations, i.e. people who have been nominated before. The inner merge joins the
# two Series on their shared 'name' column.
# NOTE(review): this was described as "past decade", but no year filter is applied in
# this cell - confirm the intended time window.
common_names = pd.merge(filtered_data['name'], oscars_2024['name'], how='inner')

# Collapse repeated matches to one entry per name
unique_names = common_names['name'].unique()

# Discard the last three entries of the array before printing.
# NOTE(review): why these three are dropped is not stated, and which names they are
# depends on the merge's row order - confirm and replace with an explicit exclusion.
unique_names = unique_names[:-3]
print(unique_names)

# Historical rows for those names where `winner` is set, i.e. who has won before
common_names_winners = filtered_data[filtered_data['name'].isin(unique_names) & filtered_data['winner']]

print(common_names_winners)

In [None]:
# Mean Rotten Tomatoes score of the nominees in each category, highest average first
avg_scores = (
    oscars_2024
    .groupby('category')['rotten tomatoes']
    .mean()
    .sort_values(ascending=False)
)

# Show the per-category averages
print(avg_scores)

In [None]:
# Keep only rows that have a Rotten Tomatoes score, ordered from best to worst.
# Rows are per nomination, so a film can appear more than once in this frame.
top_movies_by_rt = (
    oscars_2024
    .dropna(subset=['rotten tomatoes'])
    .sort_values(by='rotten tomatoes', ascending=False)
)

# Horizontal bars: one per film, score on the x-axis
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='rotten tomatoes', y='film', data=top_movies_by_rt, palette='viridis', ax=ax)

# Scores are stored as fractions of 1.0; render the ticks as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

ax.set_title('Top Movies with Highest Rotten Tomatoes Percentages (Excluding NA)')
ax.set_xlabel('Rotten Tomatoes Score')
ax.set_ylabel('Film')
plt.show()

In [None]:
# One row per film: its highest Rotten Tomatoes score, best-rated films first
best_rt_per_film = oscars_2024.groupby('film')['rotten tomatoes'].max()
movies_rt = (
    best_rt_per_film
    .reset_index()
    .sort_values(by='rotten tomatoes', ascending=False)
)

# Chart the ten best-rated films
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x='rotten tomatoes',
    y='film',
    data=movies_rt.head(10),
    palette='viridis',
    ax=ax,
)

# Scores are fractions of 1.0; label the x-axis ticks as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

ax.set_title('Top Movies with Highest Rotten Tomatoes Scores (Unique Movies)')
ax.set_xlabel('Rotten Tomatoes Score')
ax.set_ylabel('Film')
plt.show()

In [None]:
# One row per film: its largest worldwide box office figure, biggest earners first
max_gross_per_film = oscars_2024.groupby('film')['worldwide box office'].max()
movies_bo = (
    max_gross_per_film
    .reset_index()
    .sort_values(by='worldwide box office', ascending=False)
)

# Chart the ten highest-grossing films
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x='worldwide box office',
    y='film',
    data=movies_bo.head(10),
    palette='viridis',
    ax=ax,
)

# Tick labels are whole dollar amounts with thousands separators (no rescaling to billions)
ax.xaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))

ax.set_title('Top Movies with Highest Box Office Numbers (Unique Movies)')
ax.set_xlabel('Worldwide Box Office')
ax.set_ylabel('Film')
plt.show()

In [7]:
# Rows without a Rotten Tomatoes score, kept aside for inspection
missing_rt_mask = oscars_2024['rotten tomatoes'].isna()
na_values = oscars_2024[missing_rt_mask]

# Drop every row that has a missing value in any column
oscars_2024_cleaned = oscars_2024.dropna(how='any')

# Show the cleaned frame
print(oscars_2024_cleaned)

                          category                           name  \
0          Actor In A Leading Role                 Bradley Cooper   
1          Actor In A Leading Role                 Colman Domingo   
2          Actor In A Leading Role                  Paul Giamatti   
3          Actor In A Leading Role                 Cillian Murphy   
4          Actor In A Leading Role                 Jeffrey Wright   
..                             ...                            ...   
115  Writing (Original Screenplay)  Justine Triet , Arthur Harari   
116  Writing (Original Screenplay)                David Hemingson   
117  Writing (Original Screenplay)    Bradley Cooper, Josh Singer   
118  Writing (Original Screenplay)      Samy Burch, Alex Mechanik   
119  Writing (Original Screenplay)                    Celine Song   

                  film     genre  worldwide box office  rotten tomatoes  
0              Maestro   Romance                383532             0.79  
1               Rustin 

In [14]:
# Columns needed for the ranking
features = ['category', 'name', 'film', 'worldwide box office', 'rotten tomatoes']

# Rows with a value in every selected column. The explicit copy makes `data` an
# independent frame, so adding the rank columns below never writes into a selection
# of oscars_2024_cleaned.
data = oscars_2024_cleaned[features].dropna().copy()

# Rank nominees inside each category: rank 1 is the highest box office / highest score,
# and tied values share the smallest rank of their group (method='min').
data['box office rank'] = data.groupby('category')['worldwide box office'].rank(ascending=False, method='min')
data['rotten tomatoes rank'] = data.groupby('category')['rotten tomatoes'].rank(ascending=False, method='min')

# Combined rank is the plain sum of the two ranks, so lower is better
data['combined rank'] = data['box office rank'] + data['rotten tomatoes rank']

# Per category, keep the first row after sorting by combined rank (the lowest sum).
# The rank columns are cast to int in the same chain, which yields a new frame instead
# of assigning column by column into the groupby/head selection (the pattern that
# triggers pandas' SettingWithCopyWarning).
rank_columns = ['box office rank', 'rotten tomatoes rank', 'combined rank']
top_nominees = (
    data
    .sort_values(by=['category', 'combined rank'])
    .groupby('category')
    .head(1)
    .astype({column: int for column in rank_columns})
)

# Display the top nominee of each category without the index column
print(top_nominees.to_string(index=False))



                     category                                                          name                                          film  worldwide box office  rotten tomatoes  box office rank  rotten tomatoes rank  combined rank
      Actor In A Leading Role                                                 Paul Giamatti                                 The Holdovers              42246660             0.97                2                     1              3
   Actor In A Supporting Role                                             Robert Downey Jr.                                   Oppenheimer             957700200             0.93                2                     2              4
    Actress In A Leading Role                                                Lily Gladstone                    Killers of the Flower Moon             156874211             0.93                1                     2              3
 Actress In A Supporting Role                                               