# Exploring Missing Values

In [7]:
# import the stuff I might need
import pandas as pd
import numpy as np
import plotly.express as px
import os

In [11]:
# Load the raw inputs.
# Both paths are relative, so they resolve against the kernel's current working
# directory. DATA_DIR is the single place to repoint if the data folder moves
# (this replaces a commented-out os.chdir to a machine-specific absolute path).
DATA_DIR = "../data"

movies_to_model = pd.read_csv(f"{DATA_DIR}/movies_15_to_19.csv")
genre_df = pd.read_csv(f"{DATA_DIR}/movie_genre_database.csv")

In [12]:
# Drop the columns that are not needed for this analysis.
# errors="ignore" keeps the cell re-runnable: the result is assigned back to the
# same name, so a second run would otherwise raise KeyError for columns that are
# already gone.
unused_columns = ["backdrop_path", "video", "poster_path"]
movies_to_model = movies_to_model.drop(columns=unused_columns, errors="ignore")

# Preview the first three rows, transposed so each column reads as one line.
movies_to_model.head(3).T

Unnamed: 0,0,1,2
Unnamed: 0,0,1,2
adult,False,False,False
genre_ids,"[10749, 18]","[16, 878, 28]","[18, 14, 53]"
id,271039,296917,312849
original_language,en,ja,tr
original_title,"Something, Anything",劇場版 PSYCHO-PASS サイコパス,Sarmaşık
overview,When a tragedy shatters her plans for domestic...,"In a futuristic Japan, the Sibyl System is cha...","After the owner's bankruptcy, the crew is stra..."
popularity,12.0941,9.3462,8.6511
release_date,2015-01-09,2015-01-09,2015-01-26
title,"Something, Anything",PSYCHO-PASS: The Movie,Ivy


## Missing Budgets and Missing Revenues

In [10]:
# Is there a correlation between the release year and the budget value being NaN?

# Fail fast if the loaded frame lacks the inputs. Without this check the cell
# rewrites release_date and adds release_year first, and only then stops with a
# bare KeyError: 'budget', leaving the frame half-modified.
absent_columns = [
    col for col in ("release_date", "budget") if col not in movies_to_model.columns
]
if absent_columns:
    raise KeyError(
        f"movies_to_model has no column(s) {absent_columns}; "
        "load data that includes them before running this cell"
    )

# Convert release_date to datetime objects; unparseable values become NaT
movies_to_model['release_date'] = pd.to_datetime(movies_to_model['release_date'], errors='coerce')

# Get the year (numeric) to use for correlation
movies_to_model['release_year'] = movies_to_model['release_date'].dt.year

# Create a binary column: 1 if budget is NaN, 0 if it is present
# NOTE(review): only NaN counts as missing here; if the source encodes unknown
# budgets as 0 they are treated as present -- confirm against the data.
movies_to_model['budget_is_missing'] = movies_to_model['budget'].isna().astype(int)

# Calculate the correlation between the year and the 0/1 indicator
correlation = movies_to_model['release_year'].corr(movies_to_model['budget_is_missing'])

print(f"Correlation between Release Year and Missing Budget: {correlation:.4f}")

# Visualize the relationship
# This helps see if missing data is clustered around specific time periods
fig = px.histogram(
    movies_to_model,
    x="release_year",
    color="budget_is_missing",
    barmode="overlay",
    title="Distribution of Missing Budgets over Time",
    labels={"budget_is_missing": "Budget is NaN"}
)
fig.show()

KeyError: 'budget'

In [None]:
# Is there a correlation between the release year and the revenue value being NaN?

# Fail fast if the loaded frame lacks the inputs, so the cell never rewrites
# release_date / release_year and then stops halfway on a bare KeyError.
absent_columns = [
    col for col in ("release_date", "revenue") if col not in movies_to_model.columns
]
if absent_columns:
    raise KeyError(
        f"movies_to_model has no column(s) {absent_columns}; "
        "load data that includes them before running this cell"
    )

# Convert release_date to datetime objects; unparseable values become NaT
movies_to_model['release_date'] = pd.to_datetime(movies_to_model['release_date'], errors='coerce')

# Get the year (numeric) to use for correlation
movies_to_model['release_year'] = movies_to_model['release_date'].dt.year

# Create a binary column: 1 if revenue is NaN, 0 if it is present
# NOTE(review): only NaN counts as missing here; if the source encodes unknown
# revenue as 0 it is treated as present -- confirm against the data.
movies_to_model['revenue_is_missing'] = movies_to_model['revenue'].isna().astype(int)

# Calculate the correlation between the year and the 0/1 indicator
correlation = movies_to_model['release_year'].corr(movies_to_model['revenue_is_missing'])

print(f"Correlation between Release Year and Missing Revenue: {correlation:.4f}")

# Visualize the relationship
# This helps see if missing data is clustered around specific time periods
fig = px.histogram(
    movies_to_model,
    x="release_year",
    color="revenue_is_missing",
    barmode="overlay",
    title="Distribution of Missing Revenue over Time",
    labels={"revenue_is_missing": "Revenue is NaN"}
)
fig.show()

Correlation between Release Year and Missing Revenue: 0.0028


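## Accounting for the Number of Movies per Year

The histograms above show raw counts, which depend on how many movies were released each year. Compare the proportion of movies with a missing budget per year instead.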

In [None]:
# Per-year summary of the missing-budget flag. The mean of a 0/1 column is the
# share of rows flagged, and the per-year row count is kept alongside it so that
# thinly populated years are easy to spot.
yearly_stats = (
    movies_to_model
    .groupby("release_year")["budget_is_missing"]
    .agg(missing_rate="mean", total_movies="count")
    .reset_index()
)

# Optional noise filter: keep only years with more than 10 movies
# yearly_stats = yearly_stats[yearly_stats['total_movies'] > 10]

# Bar chart of the yearly missing rate; the movie count appears on hover
fig = px.bar(
    yearly_stats,
    x="release_year",
    y="missing_rate",
    title="Percentage of Movies with Missing Budgets by Year",
    labels=dict(missing_rate="Proportion Missing (0.0 - 1.0)", release_year="Year"),
    hover_data=["total_movies"],
)

# Overlay a line through the same yearly rates to make the direction easier to read
fig.add_scatter(
    x=yearly_stats["release_year"],
    y=yearly_stats["missing_rate"],
    mode="lines",
    name="Trend",
)

fig.show()