In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.ticker as mtick

In [None]:
# Load the historical Oscar nominations table; the path is resolved relative
# to the notebook's working directory.
oscar_dataset = pd.read_csv(filepath_or_buffer="the_oscar_award.csv")

# Preview the first five rows as the cell output.
oscar_dataset.head(n=5)


In [None]:

# Summarize the raw table: prints the column names, non-null counts and dtypes.
oscar_dataset.info()

In [None]:
# How many missing values does each column have before cleaning?
print(oscar_dataset.isnull().sum())

# Keep only the rows that name a film; missing values in other columns are
# left in place, which the second tally below makes visible.
oscars_clean = oscar_dataset.dropna(subset=["film"])
print(oscars_clean.isnull().sum())

# List every distinct award category that survives the cleaning step.
print(oscars_clean.loc[:, "category"].unique())


In [None]:

# Award categories to keep (the ones listed on the website).
filtered_categories = [
    'ACTOR IN A LEADING ROLE',
    'ACTRESS IN A LEADING ROLE',
    'ACTOR IN A SUPPORTING ROLE',
    'ACTRESS IN A SUPPORTING ROLE',
    'ANIMATED FEATURE FILM',
    'CINEMATOGRAPHY',
    'COSTUME DESIGN',
    'DIRECTING',
    'DOCUMENTARY FEATURE FILM',
    'FILM EDITING',
    'INTERNATIONAL FEATURE FILM',
    'MAKEUP AND HAIRSTYLING',
    'MUSIC (ORIGINAL SCORE)',
    'MUSIC (ORIGINAL SONG)',
    'BEST PICTURE',
    'PRODUCTION DESIGN',
    'ANIMATED SHORT FILM',
    'LIVE ACTION SHORT FILM',
    'SOUND',
    'VISUAL EFFECTS',
    'WRITING (ADAPTED SCREENPLAY)',
    'WRITING (ORIGINAL SCREENPLAY)',
]

# Recent film years, 2012 through 2024 inclusive. Only the disabled year
# filter below refers to this list, so it currently has no effect.
filtered_years = list(range(2012, 2025))

# Restrict the cleaned table to the selected categories.
filtered_data = oscars_clean.loc[oscars_clean['category'].isin(filtered_categories)]
# Year filter, intentionally left disabled:
#filtered_data = filtered_data[filtered_data['year_film'].isin(filtered_years)]

# Sanity check: preview the result, then show remaining nulls per column
# as the cell output.
print(filtered_data.head())
filtered_data.isna().sum()


In [None]:

# Find the most frequent winners for each category.
# Only rows flagged as winners are counted: counting every row of
# filtered_data would measure nominations, which contradicts the chart's
# title and "Number of Wins" axis. `.eq(True)` also treats a missing flag
# as "did not win" instead of failing on it.
winning_rows = filtered_data[filtered_data['winner'].eq(True)]
top_winners_by_category = winning_rows.groupby(['category', 'name']).size().reset_index(name='count')
# Within each category put the name with the most wins first...
top_winners_by_category = top_winners_by_category.sort_values(by=['category', 'count'], ascending=[True, False])
# ...then keep that single leading row per category.
top_winners_by_category = top_winners_by_category.drop_duplicates(subset='category', keep='first')

# Plot the top winners by category.
# NOTE(review): head(10) keeps the first ten categories in alphabetical
# order, not the ten largest counts — confirm this is the intended selection.
plt.figure(figsize=(14, 8))
sns.barplot(x='count', y='name', hue='category', data=top_winners_by_category.head(10), palette='viridis')
plt.title('Top Oscar Winners by Category')
plt.xlabel('Number of Wins')
plt.ylabel('Winner')
plt.show()


In [None]:
# Find the total nominations for each movie.
# Group on title AND film year: distinct films that share a title (for
# example remakes) would otherwise be merged into one inflated total.
movie_nominations = (
    filtered_data
    .groupby(['film', 'year_film'])
    .size()
    .reset_index(name='total_nominations')
    .sort_values(by='total_nominations', ascending=False)
)

# Label each of the ten most-nominated films as "Title (year)" so that
# same-titled films stay distinguishable on the axis.
top_movie_nominations = movie_nominations.head(10).assign(
    film_label=lambda top: top['film'] + ' (' + top['year_film'].astype(str) + ')'
)

# Plot the top movies with the most nominations
plt.figure(figsize=(14, 8))
sns.barplot(x='total_nominations', y='film_label', data=top_movie_nominations, palette='viridis')
plt.title('Movies with the Most Nominations')
plt.xlabel('Total Nominations')
plt.ylabel('Movie')
plt.show()



In [None]:
# Load the current nominees from the Excel workbook; the path is resolved
# relative to the notebook's working directory.
oscars_2024 = pd.read_excel(io="Oscars_2024.xlsx")

# Inspect: print the column summary, then show the first ten rows as the
# cell output.
oscars_2024.info()
oscars_2024.head(n=10)


In [None]:
# Visual: which of the current films appear most often in the nominee table?
# One row per film with its row count, largest count first.
# NOTE(review): presumably each row of oscars_2024 is one nomination — confirm.
movie_nominations_2024 = (
    oscars_2024
    .groupby('film')
    .size()
    .reset_index(name='total_nominations')
    .sort_values(by='total_nominations', ascending=False)
)

# Horizontal bar chart of the ten films with the highest counts.
plt.figure(figsize=(14, 8))
ax = sns.barplot(data=movie_nominations_2024.head(10), x='total_nominations', y='film', palette='viridis')
ax.set(
    title='Movies with the Most Nominations',
    xlabel='Total Nominations',
    ylabel='Movie',
)
plt.show()


In [None]:
# Combine first and last names with an underscore so each nominee is a single
# token. The previous code replaced spaces with '' and never inserted the
# underscore. Later cells drop this column, so it must keep existing.
oscars_2024['Full Name'] = oscars_2024['name'].str.replace(' ', '_')


def generate_wordcloud(text):
    """Build an 800x400 white-background word cloud from free text.

    Parameters
    ----------
    text : str
        Whitespace-separated text; WordCloud tokenizes it and sizes each
        word by how often it occurs.

    Returns
    -------
    WordCloud
        The generated cloud, ready to pass to ``plt.imshow``.
    """
    return WordCloud(width=800, height=400, background_color='white').generate(text)


# Plot the word clouds side by side
plt.figure(figsize=(12, 6))

# Word cloud for names.
# Build it from per-name counts rather than from joined text, so multi-word
# names are kept whole and drawn with their real spaces. Renaming keys in
# `words_` after generate() did not change the image that imshow renders.
# Missing names are skipped.
name_counts = oscars_2024['name'].dropna().astype(str).value_counts().to_dict()
wordcloud_names = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(name_counts)
plt.subplot(1, 2, 1)
plt.imshow(wordcloud_names, interpolation='bilinear')
plt.title('Word Cloud for Names')
plt.axis('off')

# Word cloud for genres. Missing genres are dropped first because
# ' '.join raises TypeError on non-string (NaN) entries.
wordcloud_genre = generate_wordcloud(' '.join(oscars_2024['genre'].dropna().astype(str)))
plt.subplot(1, 2, 2)
plt.imshow(wordcloud_genre, interpolation='bilinear')
plt.title('Word Cloud for Genre')
plt.axis('off')

plt.show()

In [None]:
# Find names that occur both in the historical table and among the current
# nominees, i.e. who has been nominated before. The two Series are joined on
# their shared column name, 'name'.
# NOTE(review): the original note said "past decade", but the year filter on
# filtered_data is commented out where that frame is built, so every year in
# the table is matched — confirm which was intended.
common_names = pd.merge(filtered_data['name'], oscars_2024['name'], how='inner')

# Collapse the join result to one entry per name, in order of first appearance.
unique_names = common_names['name'].unique()

# Drop the last three entries of that array.
# NOTE(review): the reason is not visible in this notebook, and the slice is
# positional, so it silently removes different names if the input data or its
# row order changes — confirm which entries were meant to be excluded.
unique_names = unique_names[:-3]
print(unique_names)

# Check who has won before: historical rows for those names, with the
# `winner` column used as a boolean mask.
common_names_winners = filtered_data[filtered_data['name'].isin(unique_names) & filtered_data['winner']]

# Print the full frame without truncation.
print(common_names_winners.to_string())

In [None]:
# Mean Rotten Tomatoes score per award category, highest average first.
avg_scores = (
    oscars_2024
    .groupby('category')['rotten tomatoes']
    .mean()
    .sort_values(ascending=False)
)

# Show the resulting Series.
print(avg_scores)

In [None]:
# Keep only the rows that have a Rotten Tomatoes score, highest score first.
top_movies_by_rt = (
    oscars_2024
    .dropna(subset=['rotten tomatoes'])
    .sort_values(by='rotten tomatoes', ascending=False)
)

# Horizontal bars by film; every scored row is passed in (no top-N cut).
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=top_movies_by_rt, x='rotten tomatoes', y='film', palette='viridis')

# Render x ticks as percentages, with a tick value of 1.0 shown as 100%.
# NOTE(review): this assumes scores are stored as fractions in [0, 1] — confirm.
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

ax.set(
    title='Top Movies with Highest Rotten Tomatoes Percentages (Excluding NA)',
    xlabel='Rotten Tomatoes Score',
    ylabel='Film',
)
plt.show()

In [None]:
# One row per film carrying its highest Rotten Tomatoes score, best first.
movies_rt = (
    oscars_2024
    .groupby('film')['rotten tomatoes']
    .max()
    .reset_index()
    .sort_values(by='rotten tomatoes', ascending=False)
)

# Horizontal bar chart of the ten best-scored films.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=movies_rt.head(10), x='rotten tomatoes', y='film', palette='viridis')

# Render x ticks as percentages, with a tick value of 1.0 shown as 100%.
# NOTE(review): this assumes scores are stored as fractions in [0, 1] — confirm.
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

ax.set(
    title='Top Movies with Highest Rotten Tomatoes Scores (Unique Movies)',
    xlabel='Rotten Tomatoes Score',
    ylabel='Film',
)
plt.show()

In [None]:
# One row per film carrying its largest worldwide box office value,
# biggest first.
movies_bo = (
    oscars_2024
    .groupby('film')['worldwide box office']
    .max()
    .reset_index()
    .sort_values(by='worldwide box office', ascending=False)
)

# Horizontal bar chart of the ten highest-grossing films.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=movies_bo.head(10), x='worldwide box office', y='film', palette='viridis')

# Format x ticks as whole amounts with a leading "$" and thousands
# separators; the values are not rescaled (e.g. not converted to billions).
ax.xaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))

ax.set(
    title='Top Movies with Highest Box Office Numbers (Unique Movies)',
    xlabel='Worldwide Box Office',
    ylabel='Film',
)
plt.show()

In [None]:
# Rows with no Rotten Tomatoes score, kept aside for separate scoring later.
# Take an explicit copy: later cells add columns to this frame, and doing
# that on a boolean-mask slice triggers pandas' SettingWithCopyWarning.
na_values = oscars_2024[oscars_2024['rotten tomatoes'].isna()].copy()

#print(na_values)

# Remove rows with an NA value in any column, then drop the helper
# 'Full Name' column. A chained drop replaces the in-place one, so no derived
# frame is mutated and re-running the cell gives the same result.
# errors='ignore' keeps the cell working if that column is absent.
oscars_2024_cleaned = oscars_2024.dropna().drop(columns=['Full Name'], errors='ignore')

# Print the full cleaned frame without truncation
print(oscars_2024_cleaned.to_string())

In [None]:
# Columns needed for the ranking.
features = ['category', 'name', 'film', 'worldwide box office', 'rotten tomatoes']

# Build the ranking table in one pass:
#   1. keep the feature columns and drop rows missing any of them;
#   2. rank nominees inside each category on box office and on Rotten
#      Tomatoes (rank 1 = highest value, ties share the lowest rank);
#   3. add both ranks into a combined rank, where lower is better.
data = (
    oscars_2024_cleaned[features]
    .dropna()
    .assign(**{
        'box office rank': lambda frame: frame.groupby('category')['worldwide box office'].rank(method='min', ascending=False),
        'rotten tomatoes rank': lambda frame: frame.groupby('category')['rotten tomatoes'].rank(method='min', ascending=False),
    })
    .assign(**{
        'combined rank': lambda frame: frame['box office rank'] + frame['rotten tomatoes rank'],
    })
)

# The first row of each category after sorting is its best combined rank;
# the rank columns are cast to integers for display.
top_nominees = (
    data
    .sort_values(by=['category', 'combined rank'])
    .groupby('category')
    .head(1)
    .astype({
        'box office rank': int,
        'rotten tomatoes rank': int,
        'combined rank': int,
    })
)

# Show the top nominee of every category without the index column.
print(top_nominees.to_string(index=False))



In [None]:
# Drop the columns that are not used for nominees without a Rotten Tomatoes
# score, then attach the IMDb and Letterboxd scores.
# The earlier drop call did nothing: with no axis/columns argument it looked
# for *row* labels, and its result was discarded. `columns=` plus
# re-assignment makes the drop take effect. errors='ignore' keeps the cell
# re-runnable once the columns are gone.
# `assign` returns a fresh frame, so the new columns are not written into a
# slice of oscars_2024.
# NOTE(review): the score lists are positional and must have exactly one
# entry per row of na_values, in row order — confirm against the source data.
na_values = (
    na_values
    .drop(columns=['genre', 'worldwide box office', 'rotten tomatoes', 'Full Name'], errors='ignore')
    .assign(**{
        "imdb score": [6.3,6.5,6.1,7.3,7.1,6.6,7.4,6.7,7.0,6.3,6.2,7.1,7.1,7.1,7.4],
        "letterboxd score": [2.9,3.3,3.1,3.8,3.8,3.1,3.8,3.3,3.5,2.6,2.5,3.6,3.6,3.5,3.8],
    })
)

# Show the updated frame as the cell output.
na_values

In [None]:
# Add per-category ranks to na_values: rank 1 is the highest IMDb or
# Letterboxd score, and ties share the lowest rank. The two ranks are then
# added into a combined rank, where lower is better.
na_values = na_values.assign(**{
    'imdb rank': lambda frame: frame.groupby('category')['imdb score'].rank(method='min', ascending=False),
    'letterboxd rank': lambda frame: frame.groupby('category')['letterboxd score'].rank(method='min', ascending=False),
    'combined rank': lambda frame: frame['imdb rank'] + frame['letterboxd rank'],
})

# The first row of each category after sorting is its best combined rank;
# the rank columns are cast to integers for display.
top_nominees_sf = (
    na_values
    .sort_values(by=['category', 'combined rank'])
    .groupby('category')
    .head(1)
    .astype({
        'imdb rank': int,
        'letterboxd rank': int,
        'combined rank': int,
    })
)

# Show the top nominee of every category without the index column.
print(top_nominees_sf.to_string(index=False))