In [26]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import time

Gather Movie Data via TMDB API
a. Set up the API
Create a free TMDB account
Generate an API key are review their documentation, especially:
/discover/movie
/movie/{movie_id}
/search/movie
b. Collect top movies (2015-2024)
For each year from 2015 to 2024:
Query TMDB for the top 100 movies (by vote count).
For each movie, gather:
Title
Release Year
Genre(s)
Vote Average
Vote Count
Budget
Revenue
TMDB ID
Store all results in a single DataFrame and export to movies_2015_2024.csv.
Hint: TMDB rate limits are generous for free accounts, but you should pause between requests (eg. time.sleep(0.25)).
Some Oscar films may not appear in the top 100 by vote count. For any missing, use the /search/movie endpoint to add it.
TMDB API Call
Below we will make an API call to gather data from TMDB. "The Movie Data Base" is a reliable source for movie data. We will make an API call and loop through the compiled data to create a dataframe. This dataframe will contain The Top 100 movies from 2015-2024. We will clean the configure data to provide Title, Release Year, Genre(s), Vote Average, Vote Count, Budget, Revenue and TMDB ID. Public API Last Accessed: November 17, 2025.

Make API call with Protected API key.
Compile Informative Data from public API.
Clean, Sort and inspect collected data.

In [2]:
# create private API key
import json
with open('keys.json') as fi:
    credentials = json.load(fi)
api_key = credentials['api_key']

In [3]:
# loop through API to compile data
endpoint = 'https://api.themoviedb.org/3/discover/movie'

movie_data_1 = pd.DataFrame()

for release_year in range(2015,2025):
    yearly_movies = pd. DataFrame()
    for page in range(1,6):
        params = {
        'page' :page,
        'api_key' : api_key,
        'sort_by' : 'vote_count.desc',
        #'release_date.gte' : f"{release_year}-01-01",
        #'release_date.lte' : f"{release_year}-12-31"
        'primary_release_year' : release_year
        }
        api_response = requests.get(endpoint, params=params)

        data = api_response.json()
        data
        
        page_data= pd.json_normalize(data['results'])
        yearly_movies = pd.concat([page_data,yearly_movies], ignore_index=True)
        
        yearly_movies["release_year"] = pd.to_datetime(yearly_movies["release_date"], errors="coerce").dt.year
        yearly_movies["release_year"] = release_year
    movie_data_1= pd.concat([yearly_movies,movie_data_1], ignore_index=True)
        
        #print(movie_data[:10])

In [4]:
# inspect shape of dataframe
movie_data_1.shape

(1000, 15)

In [5]:
# clean and sort data 
movie_data_1=movie_data_1[['title','genre_ids','vote_average','vote_count','release_year','id']]
movie_data_1 = movie_data_1.sort_values(['release_year', 'vote_count'], ascending=True)
movie_data_1.head()

Unnamed: 0,title,genre_ids,vote_average,vote_count,release_year,id
919,Circle,"[27, 878, 9648, 53]",6.014,2070,2015,335866
918,The Boy Next Door,[53],4.919,2089,2015,241251
917,Aloha,"[18, 35, 10749]",5.5,2110,2015,222936
916,Blackhat,"[80, 53, 28]",5.514,2118,2015,201088
915,Youth,"[35, 18, 10749]",6.9,2144,2015,310593


API Data Exploration
We have successfully made our API call. Now that we have this information we can loop through more specific variables. We are going to use the data to make more exact extractions of information. In the last loop we collected Title, Vote Average, Vote Count and ID. Below, we itterate through each movie id to collect the Revenue and Budget of Top 100 movies from each Year. We then save our final sorted data rame to a csv file for further exploration.

Loop through API to Collect Movie Data.
Clean and sort information.
Save final csv file for further analysis.

In [6]:
# check API connection before loop
movie_id = 1198426

endpoint1 = f'https://api.themoviedb.org/3/movie/{movie_id}'
params = {
        'page' :page,
        'api_key' : api_key,
        
        }
movie_response = requests.get(endpoint1,params=params)
movie_response.json()

{'adult': False,
 'backdrop_path': '/4damV6u8Za9p03SH9jvKr3TwHQC.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 16, 'name': 'Animation'}, {'id': 10751, 'name': 'Family'}],
 'homepage': '',
 'id': 1198426,
 'imdb_id': 'tt17423376',
 'origin_country': ['DO'],
 'original_language': 'es',
 'original_title': 'Capitán Avispa',
 'overview': "In Avispatropolis, the fearless Captain Avispa emerges as an exemplary protector of the city, whose courage and convictions always prevail over the forces of evil. His unbreakable power is nourished by absolute sincerity, as he would lose his strength if he dared to weave deceit. Only in pursuit of noble causes does he allow himself to deviate from the truth. As is common in stories of this kind, Captain Wasp is surrounded by a constellation of close friends and arch-enemies, whose stories have their origins in the artist's songs.",
 'popularity': 26.5199,
 'poster_path': '/zmthz3CuFljmBQcfuaz4hBNwbQ0.jpg',
 'production_companies':

In [7]:
## Assuming movie_data_1 is defined somewhere and api_key is set
movie_info_json_dicts = []

## Get the top 1000 movie IDs
movie_ids = movie_data_1['id'].head(100)

for movie_id in movie_ids:
    tmdb_endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}' 
    params = {
        'api_key': api_key,
    }

    ## Make the API call
    tmdb_response = requests.get(tmdb_endpoint, params=params)

    tmdb_data = tmdb_response.json()
    movie_info_json_dicts.append(tmdb_data)
    ##else:
        ##print(f"Error fetching data for movie ID {movie_id}: {tmdb_response.text}")

    time.sleep(0.25)

## Normalize the collected data into a DataFrame
all_movie_data = pd.json_normalize(movie_info_json_dicts)

## Select relevant columns
movie_data_2 = all_movie_data[['title', 'vote_average', 'vote_count', 'budget', 'revenue', 'imdb_id', 'genres', 'id']]

## Optional: Display the first few rows of the resulting DataFrame
movie_data_2.head(5)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,genres,id
0,Circle,6.014,2070,250000,0,tt3118452,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",335866
1,The Boy Next Door,4.919,2089,4000000,52425855,tt3181822,"[{'id': 53, 'name': 'Thriller'}]",241251
2,Aloha,5.5,2110,37000000,26250020,tt1243974,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",222936
3,Blackhat,5.514,2118,70000000,17752940,tt2717822,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",201088
4,Youth,6.9,2144,13360000,2000000,tt3312830,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",310593


In [8]:
# save each api call to csv for instant access
movie_data_1.to_csv("movie_data_api_1.csv", index=False) 
movie_data_2.to_csv("movie_data_api_2.csv", index=False)

In [9]:
movie_data_api_1_df = pd.read_csv("movie_data_api_1.csv")

In [10]:
movie_data_api_2_df = pd.read_csv('movie_data_api_2.csv').copy()

In [11]:
movie_data_api_2_df['genres_strings'] = movie_data_api_2_df['genres'].astype(str)

In [12]:
# use regex to find all occurences of genre names
movie_data_api_2_df['movie_genres'] = (movie_data_api_2_df['genres_strings'].str.findall(r"'name': '(.*?)'"))

In [13]:
movie_data_api_2_df.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,genres,id,genres_strings,movie_genres
0,Circle,6.014,2070,250000,0,tt3118452,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",335866,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...","[Horror, Science Fiction, Mystery, Thriller]"
1,The Boy Next Door,4.919,2089,4000000,52425855,tt3181822,"[{'id': 53, 'name': 'Thriller'}]",241251,"[{'id': 53, 'name': 'Thriller'}]",[Thriller]


In [14]:
# sort dataframes
movie_data_api_2_sorted = movie_data_api_2_df.sort_values('title', ascending=True)
movie_data_api_1_sorted = movie_data_api_1_df.sort_values('title', ascending=True)

In [15]:
movie_data_api_2_sorted

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,genres,id,genres_strings,movie_genres
2,Aloha,5.500,2110,37000000,26250020,tt1243974,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",222936,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...","[Drama, Comedy, Romance]"
26,American Ultra,6.090,2876,28000000,27100000,tt3316948,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",261392,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...","[Action, Comedy]"
94,Ant-Man,7.067,20422,130000000,519311965,tt0478970,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",102899,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...","[Science Fiction, Action, Adventure]"
99,Avengers: Age of Ultron,7.271,23855,365000000,1405403694,tt2395427,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",99861,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[Action, Adventure, Science Fiction]"
38,Black Mass,6.523,3487,53000000,99975678,tt1355683,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",261023,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...","[Drama, Crime, History]"
...,...,...,...,...,...,...,...,...,...,...
20,Trainwreck,5.797,2656,35000000,140795793,tt3152624,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",271718,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...","[Comedy, Romance]"
42,Vacation,6.304,3863,31000000,104400000,tt1524930,"[{'id': 35, 'name': 'Comedy'}, {'id': 12, 'nam...",296099,"[{'id': 35, 'name': 'Comedy'}, {'id': 12, 'nam...","[Comedy, Adventure]"
5,Victor Frankenstein,5.948,2202,40000000,34200000,tt1976009,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",228066,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...","[Drama, Science Fiction, Thriller]"
11,We Are Your Friends,6.786,2402,6000000,11100000,tt3787590,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",301351,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...","[Drama, Music, Romance, Comedy]"


In [16]:
movie_data_api_1_sorted['release_year'] = movie_data_api_1_sorted['release_year']

In [17]:
movie_data_api_1_sorted

Unnamed: 0,title,genre_ids,vote_average,vote_count,release_year,id
547,#Alive,"[28, 27, 878]",7.228,1956,2020,614696
178,10 Cloverfield Lane,"[53, 878, 18, 27]",6.994,8359,2016,333371
330,12 Strong,"[10752, 18, 28, 36]",6.300,3096,2018,429351
115,13 Hours: The Secret Soldiers of Benghazi,"[10752, 28, 36, 18, 53]",7.269,3789,2016,300671
492,1917,"[10752, 36, 18, 28]",7.986,13092,2019,530915
...,...,...,...,...,...,...
464,Zombieland: Double Tap,"[27, 35]",6.912,6022,2019,338967
191,Zootopia,"[16, 12, 10751, 35]",7.751,17070,2016,269149
260,mother!,"[18, 27]",6.967,6911,2017,381283
638,"tick, tick... BOOM!","[18, 10402]",7.612,2214,2021,537116


In [18]:
movie_data_api_2_sorted

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,genres,id,genres_strings,movie_genres
2,Aloha,5.500,2110,37000000,26250020,tt1243974,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",222936,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...","[Drama, Comedy, Romance]"
26,American Ultra,6.090,2876,28000000,27100000,tt3316948,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",261392,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...","[Action, Comedy]"
94,Ant-Man,7.067,20422,130000000,519311965,tt0478970,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",102899,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...","[Science Fiction, Action, Adventure]"
99,Avengers: Age of Ultron,7.271,23855,365000000,1405403694,tt2395427,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",99861,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[Action, Adventure, Science Fiction]"
38,Black Mass,6.523,3487,53000000,99975678,tt1355683,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",261023,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...","[Drama, Crime, History]"
...,...,...,...,...,...,...,...,...,...,...
20,Trainwreck,5.797,2656,35000000,140795793,tt3152624,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",271718,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...","[Comedy, Romance]"
42,Vacation,6.304,3863,31000000,104400000,tt1524930,"[{'id': 35, 'name': 'Comedy'}, {'id': 12, 'nam...",296099,"[{'id': 35, 'name': 'Comedy'}, {'id': 12, 'nam...","[Comedy, Adventure]"
5,Victor Frankenstein,5.948,2202,40000000,34200000,tt1976009,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",228066,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...","[Drama, Science Fiction, Thriller]"
11,We Are Your Friends,6.786,2402,6000000,11100000,tt3787590,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",301351,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...","[Drama, Music, Romance, Comedy]"


In [19]:
movie_data_api_2_sorted['release_year'] = movie_data_api_1_sorted['release_year']

In [20]:
# drop unnessecary columns
movies_2015_2024 = movie_data_api_2_sorted.drop(
    columns=['genres', 'genres_strings'])

In [21]:
movies_2015_2024.head(2)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year
2,Aloha,5.5,2110,37000000,26250020,tt1243974,222936,"[Drama, Comedy, Romance]",2015
26,American Ultra,6.09,2876,28000000,27100000,tt3316948,261392,"[Action, Comedy]",2015


In [22]:
movies_2015_2024.to_csv('movies_2015_2024.csv', index=False)

In [23]:
movies_2015_2024_df = pd.read_csv('movies_2015_2024.csv')

In [24]:
movies_2015_2024_df.head(5)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year
0,Aloha,5.5,2110,37000000,26250020,tt1243974,222936,"['Drama', 'Comedy', 'Romance']",2015
1,American Ultra,6.09,2876,28000000,27100000,tt3316948,261392,"['Action', 'Comedy']",2015
2,Ant-Man,7.067,20422,130000000,519311965,tt0478970,102899,"['Science Fiction', 'Action', 'Adventure']",2015
3,Avengers: Age of Ultron,7.271,23855,365000000,1405403694,tt2395427,99861,"['Action', 'Adventure', 'Science Fiction']",2015
4,Black Mass,6.523,3487,53000000,99975678,tt1355683,261023,"['Drama', 'Crime', 'History']",2015


In [25]:
movies_2015_2024_df.tail(5)

Unnamed: 0,title,vote_average,vote_count,budget,revenue,imdb_id,id,movie_genres,release_year
95,Trainwreck,5.797,2656,35000000,140795793,tt3152624,271718,"['Comedy', 'Romance']",2015
96,Vacation,6.304,3863,31000000,104400000,tt1524930,296099,"['Comedy', 'Adventure']",2015
97,Victor Frankenstein,5.948,2202,40000000,34200000,tt1976009,228066,"['Drama', 'Science Fiction', 'Thriller']",2015
98,We Are Your Friends,6.786,2402,6000000,11100000,tt3787590,301351,"['Drama', 'Music', 'Romance', 'Comedy']",2015
99,Youth,6.9,2144,13360000,2000000,tt3312830,310593,"['Comedy', 'Drama', 'Romance']",2015
