### Exploratory Data Analysis

In [20]:
import pandas as pd
import ast
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.pyplot
import seaborn as sns

#### Importing data

In [16]:
# Read in TMDB movie data
tmdb_data = pd.read_csv(f'../data/movies_2015_2024.csv').drop_duplicates(subset=['Title','Year'])

# Import best picture data
best_picture = pd.read_csv('../data/best_picture.csv')

In [18]:
# Merge best picture data and TMDB API data into a single DataFrame
movies_df = best_picture.merge(tmdb_data, how='right', on=['Title','Year'], validate='1:1')

# Fill NaN values 
values = {'Winner': 'No', 'Nominated': 'No'}
movies_df = movies_df.fillna(value=values)

#### Research question: convert budget and revenue data to 2024 dollars using CPI data

In [24]:
# Store CPI values 
cpi_data = pd.read_csv('../data/CPI_data.csv').set_index('Year')

FileNotFoundError: [Errno 2] No such file or directory: '../data/CPI_data.csv'

In [None]:
budget_2024 = []
for year,budget in zip(movies_df['Year'], movies_df['Budget']):
    budget_adjusted = budget*(cpi_data.loc[2024,'CPI'] / cpi_data.loc[year,'CPI'])
    budget_2024.append(budget_adjusted)

# Reassign adjusted budget numbers to the Budget column
movies_df['Budget'] = budget_2024

In [None]:
revenue_2024 = []
for year,revenue in zip(movies_df['Year'], movies_df['Revenue']):
    revenue_adjusted = revenue*(cpi_data.loc[2024,'CPI'] / cpi_data.loc[year,'CPI'])
    revenue_2024.append(revenue_adjusted)

# Reassign adjusted budget numbers to the Revenue column
movies_df['Revenue'] = revenue_2024

In [None]:
# Rename the Budget and Revenue columns to reflect the conversion to 2024 dollars
movies_df = movies_df.rename(columns={'Budget': 'Budget (2024 dollars)', 'Revenue': 'Revenue (2024 dollars)'})

#### Research question: How popular is each genre over the last decade?

##### Attempt #1: Creating a linear model for vote average based on year and genre

In [22]:
# Convert Genre column to list type so that it can be exploded
movies_df['Genre'] = movies_df['Genre'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else x)

NameError: name 'movies_2015_2024_df' is not defined

In [None]:
# Explode the movies_df DataFrame by genre
movies_df_exploded = movies_2015_2024_df.explode('Genre')

In [None]:
# Create an indicator variable for each genre
genre_dummies = pd.get_dummies(movies_df_exploded['Genre'], prefix='genre')

In [None]:
# Group the genre indicator variables so that we have 1 row per movie
genre_dummies_grouped = genre_dummies.groupby(level=0).max()

# Replace spaces in column names with underscores
genre_dummies_grouped.columns = [column.replace(' ','_') for column in genre_dummies_grouped.columns]

In [25]:
# Join the genre indicator variables to the movies_df DataFrame
movies_genre_indicator = movies_2015_2024_df.join(genre_dummies_grouped)

NameError: name 'movies_2015_2024_df' is not defined

In [None]:
# Fitting a linear regression model for vote_average using years and each of the genre indicator variables

# Extract all of the genre indicator variables and concatenate them with plus signs to be used in the linear model
genre_indicators = [column for column in movies_genre_indicator.columns if 'genre_' in column]
genre_indicators = ' + '.join(genre_indicators)

popularity_model_with_genres = smf.ols(f'Vote_Average ~ Year + {genre_indicators}', data = movies_genre_indicator).fit()
popularity_model_with_genres.summary()

##### Attempt #2: Bar chart plotting vote average by genre

In [None]:
# Creating bar chart to show average vote count by genre
avg_vote_by_genre = movies_df_exploded.groupby('Genre').agg({'Vote_Average': 'mean'}).sort_values(by='Vote_Average')
sns.barplot(data=avg_vote_by_genre, x='Genre', y='Vote_Average')
plt.xticks(rotation=70) 
plt.show()

##### Attempt #3: Evaluating popularity by revenue

In [None]:
# Creating bar chart to show average revenue by genre
avg_vote_by_genre = movies_df_exploded.groupby('Genre').agg({'Revenue (2024 dollars)': 'mean'}).sort_values(by='Revenue (2024 dollars)')
sns.barplot(data=avg_vote_by_genre, x='Genre', y='Revenue (2024 dollars)')
plt.xticks(rotation=70) 
plt.show()