In [1]:
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

2. Gather Movie Data via TMDB API  
    a. Set up the API    
    * Create a free [TMDB account](https://developer.themoviedb.org/docs/getting-started)  
    * Generate an API key and review their documentation, especially:  
        * /discover/movie  
        * /movie/{movie_id}  
        * /search/movie  
    b. Collect top movies (2015-2024)  
    For each year from 2015 to 2024:  
        * Query TMDB for the top 100 movies (by vote count).  
        * For each movie, gather:  
            * Title  
            * Release Year  
            * Genre(s)  
            * Vote Average  
            * Vote Count  
            * Budget  
            * Revenue  
            * TMDB ID  
        * Store all results in a single DataFrame and export to movies_2015_2024.csv.
        * Hint: TMDB rate limits are generous for free accounts, but you should pause between requests (eg. time.sleep(0.25)). 
        * Some Oscar films may not appear in the top 100 by vote count. For any missing, use the /search/movie endpoint to add it.  

> ## TMDB API Call
>
> 
> Below we will make an API call to gather data from [TMDB](https://www.themoviedb.org/?language=en-US). **"The Movie Database"** is a reliable source for movie data. We will call the API and loop through the returned data to create a dataframe. This dataframe will contain **the top 100 movies (by vote count) for each year from 2015 to 2024.** We will clean and configure the data to provide Title, Release Year, Genre(s), Vote Average, Vote Count, Budget, Revenue and TMDB ID. Public API **Last Accessed:** November 17, 2025.
>
> 
>* Make API call with **Protected API key.**
>* **Compile Informative Data** from public API.
>* Clean, Sort and inspect collected data.


In [2]:
# load the private TMDB API key from a local JSON file so the key itself
# never appears in the notebook; the file must contain an "api_key" entry
# (keep keys.json out of version control)
with open('keys.json', encoding='utf-8') as fi:
    credentials = json.load(fi)
api_key = credentials['api_key']

In [3]:
# query /discover/movie for the most-voted movies of every release year
endpoint = 'https://api.themoviedb.org/3/discover/movie'

yearly_frames = []

for release_year in range(2015, 2025):
    page_frames = []
    # pages 1-5 of the vote-count ranking make up the yearly top 100
    for page in range(1, 6):
        params = {
            'page': page,
            'api_key': api_key,
            'sort_by': 'vote_count.desc',
            'primary_release_year': release_year,
        }
        api_response = requests.get(endpoint, params=params, timeout=30)
        # fail loudly on a bad key or rate limit instead of a KeyError on 'results'
        api_response.raise_for_status()
        data = api_response.json()

        page_data = pd.json_normalize(data['results'])
        page_frames.append(page_data)

        # pause between requests to stay well inside the TMDB rate limits
        time.sleep(0.25)

    # build the yearly frame once (last page first, matching the previous row order)
    yearly_movies = pd.concat(page_frames[::-1], ignore_index=True)
    # tag every row with the year that was queried
    yearly_movies["release_year"] = release_year
    yearly_frames.append(yearly_movies)

# single concat instead of growing a DataFrame inside the loop; newest year
# first so row positions stay the same as before
movie_data = pd.concat(yearly_frames[::-1], ignore_index=True)

In [4]:
# inspect shape of dataframe: the bare last expression displays (rows, columns)
movie_data.shape

(1000, 15)

In [5]:
# keep only the columns of interest, then order by year (ascending)
# and by vote count (descending) within each year
selected_columns = ['title', 'genre_ids', 'vote_average', 'vote_count', 'release_year', 'id']
first_api_call_movie_data = movie_data.loc[:, selected_columns]
first_api_call_movie_data_sorted = first_api_call_movie_data.sort_values(
    by=['release_year', 'vote_count'],
    ascending=[True, False],
)
first_api_call_movie_data_sorted.head()

Unnamed: 0,title,genre_ids,vote_average,vote_count,release_year,id
980,Avengers: Age of Ultron,"[28, 12, 878]",7.271,23855,2015,99861
981,Mad Max: Fury Road,"[28, 12, 878]",7.6,23511,2015,76341
982,Inside Out,"[16, 10751, 12, 18, 35]",7.91,22924,2015,150540
983,Jurassic World,"[28, 12, 878, 53]",6.699,21102,2015,135397
984,The Martian,"[18, 12, 878]",7.691,20590,2015,286217


> ## API Data Exploration
>
> We have successfully made our **API call.** Now that we have this information, we can query more specific fields for each movie. In the last loop we collected **Title, Vote Average, Vote Count and ID.** Below, we iterate through each movie ID to collect the **Revenue and Budget of the Top 100 movies** from each year. We then save our final sorted DataFrame to a CSV file for further exploration.
>
> 
>* Loop through API to **Collect Movie Data.**
>* **Clean and sort** information.
>* Save final **csv file** for further analysis.
>
>

In [6]:
# check API connection before loop: request the details of a single movie
movie_id = 1198426

endpoint1 = f'https://api.themoviedb.org/3/movie/{movie_id}'
# only the key is sent; this cell no longer depends on the `page` variable
# left over from the discover loop
params = {
    'api_key': api_key,
}
movie_response = requests.get(endpoint1, params=params, timeout=30)
# surface authentication / not-found errors instead of displaying an error payload
movie_response.raise_for_status()
movie_response.json()

{'adult': False,
 'backdrop_path': '/4damV6u8Za9p03SH9jvKr3TwHQC.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 16, 'name': 'Animation'}, {'id': 10751, 'name': 'Family'}],
 'homepage': '',
 'id': 1198426,
 'imdb_id': 'tt17423376',
 'origin_country': ['DO'],
 'original_language': 'es',
 'original_title': 'Capitán Avispa',
 'overview': "In Avispatropolis, the fearless Captain Avispa emerges as an exemplary protector of the city, whose courage and convictions always prevail over the forces of evil. His unbreakable power is nourished by absolute sincerity, as he would lose his strength if he dared to weave deceit. Only in pursuit of noble causes does he allow himself to deviate from the truth. As is common in stories of this kind, Captain Wasp is surrounded by a constellation of close friends and arch-enemies, whose stories have their origins in the artist's songs.",
 'popularity': 26.5199,
 'poster_path': '/zmthz3CuFljmBQcfuaz4hBNwbQ0.jpg',
 'production_companies':

In [7]:
# iterate through each movie id and fetch its full detail record
# (budget, revenue, genres, imdb id, ...)
movie_info_json_dicts = []

# every row of movie_data is requested, in order, so the detail rows line up
# one-to-one with the rows of the first API call
with requests.Session() as session:  # one session reuses the HTTP connection
    for movie_id in movie_data['id']:
        tmdb_endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}'
        params = {
            'api_key': api_key,
        }

        tmdb_response = session.get(tmdb_endpoint, params=params, timeout=30)
        # stop on a failed request rather than appending an error payload as a row
        tmdb_response.raise_for_status()

        tmdb_data = tmdb_response.json()
        movie_info_json_dicts.append(tmdb_data)

        # pause between requests to stay well inside the TMDB rate limits
        time.sleep(0.25)

all_movie_data = pd.json_normalize(movie_info_json_dicts)

second_api_call_movie_data = all_movie_data[['title','vote_average','vote_count','budget','revenue','imdb_id','genres', 'id']]
second_api_call_movie_data.head()

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,genres,id
0,We Live in Time,7.259,1103,20000000,37182814,tt27131358,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",1100099
1,No Way Up,6.3,1072,0,0,tt16253418,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",1096197
2,The Union,6.1,1058,0,0,tt12610390,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",704239
3,Fly Me to the Moon,6.806,1028,100000000,42260534,tt1896747,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",956842
4,The Bikeriders,6.69,1003,40000000,36110860,tt21454134,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",1008409


In [8]:
# inspect shape of the movie-details dataframe (second API call)
second_api_call_movie_data.shape

(1000, 8)

In [9]:
# inspect shape of the discover dataframe (first API call) for comparison
# with the movie-details dataframe
first_api_call_movie_data.shape

(1000, 6)

In [10]:
# checkpoint both API results on disk so they can be reloaded without
# repeating the requests
csv_checkpoints = (
    (first_api_call_movie_data, "movie_data_api_1.csv"),
    (second_api_call_movie_data, "movie_data_api_2.csv"),
)
for frame, csv_name in csv_checkpoints:
    frame.to_csv(csv_name, index=False)

In [11]:
# reload the first API call's results from the CSV checkpoint
movie_data_api_1 = pd.read_csv("movie_data_api_1.csv")

In [12]:
# reload the second API call's results (movie details) from the CSV checkpoint
movie_data_api_2 = pd.read_csv("movie_data_api_2.csv")

In [13]:
# store a string version of each movie's genre dictionaries in a separate column
movie_data_api_2 = movie_data_api_2.assign(
    genres_strings=movie_data_api_2['genres'].astype(str)
)

In [14]:
# pull every genre name out of the stringified dictionaries with a regex;
# the result is one list of names per movie
genre_name_pattern = r"'name': '(.*?)'"
movie_data_api_2['movie_genres'] = movie_data_api_2['genres_strings'].str.findall(genre_name_pattern)

In [15]:
# preview the first two rows to check the new genres_strings / movie_genres columns
movie_data_api_2.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,genres,id,genres_strings,movie_genres
0,We Live in Time,7.259,1103,20000000,37182814,tt27131358,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",1100099,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...","[Romance, Drama]"
1,No Way Up,6.3,1072,0,0,tt16253418,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",1096197,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...","[Thriller, Horror, Action]"


In [16]:
# order both dataframes alphabetically by title
movie_data_api_1_sorted = movie_data_api_1.sort_values(by='title')
movie_data_api_2_sorted = movie_data_api_2.sort_values(by='title')

In [17]:
# add release year by looking it up on the TMDB id, so the result does not
# depend on both frames happening to share identical row labels
release_year_by_id = (
    movie_data_api_1_sorted
    .drop_duplicates(subset='id')  # Series.map needs a unique lookup index
    .set_index('id')['release_year']
)
movie_data_api_2_sorted['release_year'] = movie_data_api_2_sorted['id'].map(release_year_by_id)

In [18]:
# remove the raw genre columns; movie_genres already holds the genre names
unneeded_columns = ['genres', 'genres_strings']
movies_2015_2024 = movie_data_api_2_sorted.drop(unneeded_columns, axis='columns')

In [19]:
# display the first two rows of the final dataframe as a sanity check before export
movies_2015_2024.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year
452,#Alive,7.228,1956,6300000,13416285,tt10620868,614696,"[Action, Horror, Science Fiction]",2020
861,10 Cloverfield Lane,6.994,8359,15000000,110216998,tt1179933,333371,"[Thriller, Science Fiction, Drama, Horror]",2016


In [20]:
# export the final dataframe to movies_2015_2024.csv (index=False leaves the
# row labels out of the file)
movies_2015_2024.to_csv('movies_2015_2024.csv', index=False)