#### Question 2: Gathering Movie Data via TMDB API

* Set up the API
    * Create a free TMDB account
    * Generate an API key and review their documentation, especially:
        * /discover/movie: https://developer.themoviedb.org/reference/discover-movie
        * /movie/{movie_id}: https://developer.themoviedb.org/reference/movie-details
        * /search/movie: https://developer.themoviedb.org/reference/search-movie
* Collect top movies (2015-2024)
    * For each year from 2015 to 2024:
        * Query TMDB for the top 100 movies (by vote count).
        * For each movie, gather:
            * Title
            * Release Year
            * Genre(s)
            * Vote Average
            * Vote Count
            * Budget
            * Revenue
            * TMDB ID
* Store all results in a single DataFrame and export to movies_2015_2024.csv.
* Hint: TMDB rate limits are generous for free accounts, but you should pause between requests (e.g., time.sleep(0.25)).
* Some Oscar films may not appear in the top 100 by vote count. For any that are missing, use the /search/movie endpoint to add them.

In [1]:
import requests
import json
import time
import pandas as pd
import datetime
from pathlib import Path 

In [2]:
# Read the TMDB credentials from keys.json (kept out of the notebook so the key
# is not committed alongside the code) and pull out the API key.
credentials = json.loads(Path('keys.json').read_text())

api_key = credentials['api_key']

In [3]:
# Download the 100 most-voted movies for each release year from 2015 through 2024.
# Title, release date, votes, and TMDB id come from /discover/movie; budget, revenue,
# and genre names are read from a follow-up /movie/{movie_id} request per movie.
# Each year is written to its own CSV in ../data/intermediate.
discover_endpoint = 'https://api.themoviedb.org/3/discover/movie'

# Seconds to wait for TMDB before giving up; requests has no timeout by default,
# so a stalled connection would otherwise hang the cell.
request_timeout = 30

# Column order of the per-year CSV files
output_columns = ['Title', 'Year', 'Genre', 'Vote_Average', 'Vote_Count',
                  'Budget', 'Revenue', 'TMDB_ID']

# Iterate through all years between 2015 and 2024
for year in range(2015, 2025):

    # Print status message
    print(f'Processing top 100 movies for {year}...')

    # One dict per movie, keyed by output column name
    movie_rows = []

    # Each page contains 20 results, so we need to iterate through 5 pages to get 100 results
    for page in range(1, 6):

        discover_params = {
            'api_key': api_key,
            'primary_release_year': year,
            'sort_by': 'vote_count.desc',
            'page': page,
        }

        response = requests.get(discover_endpoint, params=discover_params,
                                timeout=request_timeout)
        # Surface HTTP errors (bad key, rate limiting, outage) here instead of as a
        # confusing KeyError on 'results' further down.
        response.raise_for_status()
        top_movies = response.json()['results']

        # Sleep before next API call
        time.sleep(0.25)

        for movie in top_movies:
            # Bind the id first: indexing with the same quote character inside an
            # f-string is a SyntaxError on Python versions before 3.12.
            movie_id = movie['id']
            details_endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}'

            response = requests.get(details_endpoint, params={'api_key': api_key},
                                    timeout=request_timeout)
            response.raise_for_status()
            details = response.json()

            # Extract just the year from the release date. If the date is missing or
            # empty, fall back to the year the query was filtered on rather than
            # aborting after all the requests for this year have been made.
            try:
                release_year = datetime.datetime.strptime(
                    movie.get('release_date') or '', '%Y-%m-%d').year
            except ValueError:
                release_year = year

            movie_rows.append({
                'Title': movie['title'],
                'Year': release_year,
                'Genre': [genre['name'] for genre in details['genres']],
                'Vote_Average': movie['vote_average'],
                'Vote_Count': movie['vote_count'],
                'Budget': details['budget'],
                'Revenue': details['revenue'],
                'TMDB_ID': movie_id,
            })

            # Sleep before next API call
            time.sleep(0.25)

    # Convert movie data to a pandas DataFrame; passing columns explicitly keeps the
    # header and column order even if a year returned no rows.
    tmdb_movie_data_df = pd.DataFrame(movie_rows, columns=output_columns)

    # Write this year's movie data to a csv file in the data/intermediate folder
    filepath = Path(f'../data/intermediate/tmdb_movies_{year}.csv')
    filepath.parent.mkdir(parents=True, exist_ok=True)
    tmdb_movie_data_df.to_csv(filepath, index=False)

print('Finished processing movies!')

Processing top 100 movies for 2015...
Processing top 100 movies for 2016...
Processing top 100 movies for 2017...
Processing top 100 movies for 2018...
Processing top 100 movies for 2019...
Processing top 100 movies for 2020...
Processing top 100 movies for 2021...
Processing top 100 movies for 2022...
Processing top 100 movies for 2023...
Processing top 100 movies for 2024...
Finished processing movies!


In [8]:
# Read every per-year CSV back in (2015 first, 2024 last) and stack them into a
# single DataFrame with a fresh 0..n-1 index.
yearly_frames = []
for csv_year in range(2015, 2025):
    yearly_frames.append(pd.read_csv(f'../data/intermediate/tmdb_movies_{csv_year}.csv'))

combined_movies_df = pd.concat(yearly_frames, ignore_index=True)

In [9]:
# Load the Oscar film list from best_picture.csv and keep only the rows whose
# Year falls in 2015-2024 (both ends included).
best_picture = pd.read_csv('../data/best_picture.csv')
in_study_window = best_picture['Year'].between(2015, 2024)
oscar_films_2015_2024 = best_picture[in_study_window]

In [10]:
# Report every 2015 - 2024 Oscar film whose title does not appear anywhere in the
# TMDB movie data, and remember it as a (title, year) tuple for the follow-up search.
known_titles = combined_movies_df['Title'].values
missing_titles = []
for film_title, film_year in zip(oscar_films_2015_2024['Title'], oscar_films_2015_2024['Year']):
    if film_title in known_titles:
        continue
    print(f'{film_title} is NOT in the DataFrame!')
    missing_titles.append((film_title, film_year))

Fences is NOT in the DataFrame!
Once Upon a Time in Hollywood is NOT in the DataFrame!
Drive My Car is NOT in the DataFrame!
Tár is NOT in the DataFrame!
Women Talking is NOT in the DataFrame!
Maestro is NOT in the DataFrame!
Nickel Boys is NOT in the DataFrame!


In [12]:
# For the Oscar films that are missing from the DataFrame, use the /search/movie API
# endpoint to add them. Each film takes up to two requests: a title search limited to
# the film's year, then a /movie/{movie_id} lookup on the first search hit for
# budget, revenue, and genre names.
search_endpoint = 'https://api.themoviedb.org/3/search/movie'

# Seconds to wait for TMDB before giving up; requests has no timeout by default.
request_timeout = 30

# Same column names and order as the existing combined DataFrame rows built above
output_columns = ['Title', 'Year', 'Genre', 'Vote_Average', 'Vote_Count',
                  'Budget', 'Revenue', 'TMDB_ID']

# One dict per film that TMDB could find
found_rows = []

for film_title, film_year in missing_titles:
    # Print status message
    print(f'Gathering movie info for {film_title}...')

    search_params = {
        'api_key': api_key,
        'primary_release_year': film_year,
        'query': film_title,
    }

    response = requests.get(search_endpoint, params=search_params, timeout=request_timeout)
    # Surface HTTP errors here instead of as a KeyError on 'results'
    response.raise_for_status()
    matches = response.json()['results']

    # Sleep before next API call (also applies when the search found nothing)
    time.sleep(0.25)

    if not matches:
        print(f'{film_title} not found in TMDB database.')
        continue

    # The first search hit is taken as the match, as before
    best_match = matches[0]

    # Bind the id first: indexing with the same quote character inside an f-string
    # is a SyntaxError on Python versions before 3.12.
    movie_id = best_match['id']
    details_endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}'

    response = requests.get(details_endpoint, params={'api_key': api_key},
                            timeout=request_timeout)
    response.raise_for_status()
    details = response.json()

    # Store the year as an int so the Year column does not end up mixing integers
    # (rows loaded from the per-year CSVs) with strings. If the release date is
    # missing, fall back to the year the search was filtered on.
    release_date = best_match.get('release_date') or ''
    if release_date[:4].isdigit():
        release_year = int(release_date[:4])
    else:
        release_year = int(film_year)

    found_rows.append({
        'Title': best_match['title'],
        'Year': release_year,
        'Genre': [genre['name'] for genre in details['genres']],
        'Vote_Average': best_match['vote_average'],
        'Vote_Count': best_match['vote_count'],
        'Budget': details['budget'],
        'Revenue': details['revenue'],
        'TMDB_ID': movie_id,
    })

    # Sleep before next API call
    time.sleep(0.25)

# Create a DataFrame of all the new rows
new_rows = pd.DataFrame(found_rows, columns=output_columns)

# Concatenate the new rows with the existing DataFrame; skipped when nothing was
# found so an all-empty frame is never passed to concat.
if not new_rows.empty:
    combined_movies_df = pd.concat([combined_movies_df, new_rows], ignore_index=True)

Gathering movie info for Fences...
Gathering movie info for Once Upon a Time in Hollywood...
Gathering movie info for Drive My Car...
Gathering movie info for Tár...
Gathering movie info for Women Talking...
Gathering movie info for Maestro...
Gathering movie info for Nickel Boys...


In [13]:
# Export the combined 2015-2024 movie table (top-100 lists plus the added Oscar
# films) to the data folder, creating the folder first if it does not exist.
# The DataFrame index is left out of the file.
final_csv_path = Path('../data/movies_2015_2024.csv')
final_csv_path.parent.mkdir(parents=True, exist_ok=True)
combined_movies_df.to_csv(final_csv_path, index=False)