In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

# An Analysis of Movie Performance, Part 2: Data Analysis and Presentation

In this part, you’ll use the datasets you gathered in Part 1 to explore what makes a movie successful. You’ll perform exploratory data analysis, create visualizations, and, where appropriate, fit simple statistical models.

The main goal is to discover patterns and communicate them clearly, not just to crunch numbers.

**Learning Objectives**  
By the end of this part, you should be able to:  
* Combine and clean multiple real-world datasets from different sources.  
* Conduct exploratory data analysis (EDA) with pandas and visualization libraries.  
* Apply basic statistical modeling to test hypotheses.  
* Communicate insights effectively to a non-technical audience.

1. **Exploratory Analysis**  
Begin by understanding your dataset as a whole. Look for trends, outliers, and relationships.  

    Examples of questions to explore:  
    * How popular is each genre over the last decade?
    * How do budget and revenue vary by year or by genre?  
    * What is the average profit margin (revenue - budget) across genres or years?  
    * How do movie ratings (vote average) relate to box-office performance?  
    * Adjust all financial metrics (budget and revenue) for inflation to 2024 dollars using CPI data.

    Deliverables:  
    * At least 3-4 clearly labeled visualizations (bar charts, scatter plots, or boxplots).  

2. **Performance Comparisons**  
    Use your merged dataset to examine whether recognition and awards are associated with better performance.  
    Investigate:  
    * Do Best Picture nominees or winners tend to earn higher box-office revenue than non-nominated movies?  
    * Are certain genres or types of roles more common among award-winning films?
    * (If you did the optional scrape) Do movies featuring a Best Actor or Best Actress winner in their cast tend to perform better?  
  
    Deliverables:  
    * At least one regression model 

In [4]:
# read in csv files
best_picture = pd.read_csv('best_picture.csv')
movies_2015_2024 = pd.read_csv('movies_2015_2024.csv')

In [6]:
best_picture.shape

(611, 3)

In [7]:
movies_2015_2024.shape

(1000, 9)

In [8]:
best_picture.head()

Unnamed: 0,Year,Film,Winner
0,1927,Wings,Yes
1,1927,7th Heaven,No
2,1927,The Racket,No
3,1928,The Broadway Melody,Yes
4,1928,Alibi,No


In [14]:
best_picture_winners = best_picture[best_picture['Winner'] == 'Yes']
best_picture_winners.head()

Unnamed: 0,Year,Film,Winner
0,1927,Wings,Yes
3,1928,The Broadway Melody,Yes
8,1929,All Quiet on the Western Front,Yes
13,1930,Cimarron,Yes
18,1931,Grand Hotel,Yes


In [9]:
movies_2015_2024.head()

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year
0,#Alive,7.227,1955,6300000,13416285,tt10620868,614696,"['Action', 'Horror', 'Science Fiction']",2020
1,10 Cloverfield Lane,6.994,8359,15000000,110216998,tt1179933,333371,"['Thriller', 'Science Fiction', 'Drama', 'Horr...",2016
2,12 Strong,6.3,3096,35000000,67450815,tt1413492,429351,"['War', 'Drama', 'Action', 'History']",2018
3,13 Hours: The Secret Soldiers of Benghazi,7.269,3789,50000000,69411370,tt4172430,300671,"['War', 'Action', 'History', 'Drama', 'Thriller']",2016
4,1917,7.986,13091,100000000,446064352,tt8579674,530915,"['War', 'History', 'Drama', 'Action']",2019


In [15]:
# rename columns so keys match
best_picture_rename = best_picture.rename(columns={
    "Year": "release_year",
    "Film": "title"
})

# merge dataframes and keep all movies
merged = movies_2015_2024.merge(
    best_picture_rename[["release_year", "title", "Winner"]],
    on=["release_year", "title"],
    how="left"
)

# fill in missing winner entries with no
merged["Winner"] = merged["Winner"].fillna("No")

In [18]:
# display merged data
merged.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year,Winner
0,#Alive,7.227,1955,6300000,13416285,tt10620868,614696,"['Action', 'Horror', 'Science Fiction']",2020,No
1,10 Cloverfield Lane,6.994,8359,15000000,110216998,tt1179933,333371,"['Thriller', 'Science Fiction', 'Drama', 'Horr...",2016,No
2,12 Strong,6.3,3096,35000000,67450815,tt1413492,429351,"['War', 'Drama', 'Action', 'History']",2018,No
3,13 Hours: The Secret Soldiers of Benghazi,7.269,3789,50000000,69411370,tt4172430,300671,"['War', 'Action', 'History', 'Drama', 'Thriller']",2016,No
4,1917,7.986,13091,100000000,446064352,tt8579674,530915,"['War', 'History', 'Drama', 'Action']",2019,No


In [22]:
# create dataframe with award winning movies
award_winners = merged[merged['Winner'] == 'Yes']
award_winners.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year,Winner
40,All Quiet on the Western Front,7.722,4376,20000000,0,tt1016150,49046,"['War', 'History', 'Drama']",2022,Yes
58,Anora,7.0,2729,6000000,56286295,tt28607951,1064213,"['Drama', 'Comedy', 'Romance']",2024,Yes
138,CODA,7.903,2426,10000000,1905058,tt10366460,776503,"['Drama', 'Music', 'Romance']",2021,Yes
236,Everything Everywhere All at Once,7.728,7441,25000000,139200000,tt6710474,545611,"['Action', 'Adventure', 'Science Fiction']",2022,Yes
305,Green Book,8.224,12370,23000000,321752656,tt6966692,490132,"['Drama', 'Comedy', 'History']",2018,Yes


In [23]:
# create dataframe with non-award winning movies
non_award_winners = merged[merged['Winner'] == 'No']
non_award_winners.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year,Winner
0,#Alive,7.227,1955,6300000,13416285,tt10620868,614696,"['Action', 'Horror', 'Science Fiction']",2020,No
1,10 Cloverfield Lane,6.994,8359,15000000,110216998,tt1179933,333371,"['Thriller', 'Science Fiction', 'Drama', 'Horr...",2016,No
2,12 Strong,6.3,3096,35000000,67450815,tt1413492,429351,"['War', 'Drama', 'Action', 'History']",2018,No
3,13 Hours: The Secret Soldiers of Benghazi,7.269,3789,50000000,69411370,tt4172430,300671,"['War', 'Action', 'History', 'Drama', 'Thriller']",2016,No
4,1917,7.986,13091,100000000,446064352,tt8579674,530915,"['War', 'History', 'Drama', 'Action']",2019,No


In [30]:
# generate percent difference in average revenue
(award_winners['revenue'].mean() - non_award_winners['revenue'].mean()) / award_winners['revenue'].mean()

np.float64(0.011985465683477832)

> Award winning movies generate **1.2%** more annual revenue than non-award winning movies.

3. **Presentation**  
    Prepare a 10-12 minute presentation of your findings.  
    Your presentation should:  
    * Focus on findings and insights, not code.  
    * Tell a clear story: the questions you asked and what you found.  
    * Include visuals such as charts, summary tables, or model results.
    * Avoid screenshots of code or raw output. Think of your audience as studio executives or film analysts, not programmers.

    Your presentation should be done using PowerPoint/Google Slides or other presentation software.

In [None]:
# genre_popularity['movie_genres'] = genre_popularity['movie_genres'].apply(literal_eval)
# genre_popularity.explode('movie_genres')