# read_tmdb_data.ipynb

This notebook reads JSON data from TMDB's API and stores it as CSV files:
- **movies.csv**: Data on approx. 2800 movies from United States from 2000-2023
- **persons.csv**: Data on the directors and top 10 cast in these movies.
- POSSIBLE EXTENSION: **credits.csv**: Data on credits for all persons in these movies (both cast and crew).


In [1]:
import os
import requests
from dotenv import load_dotenv
from time import sleep
import pandas as pd

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

REQUEST_DELAY_SECONDS = 0.02

First we request movie_ids for all movies that live up to these requirements:
- From the United States
- From the years 2000-2023
- With original_langauge = english
- With TMDB vote count ≥ 1000

In [2]:
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&with_original_language=en"
    "&with_origin_country=US"
    "&vote_count.gte=1000"
    "&primary_release_date.gte=2000-01-01"
    "&primary_release_date.lte=2023-12-31"
    "&sort_by=primary_release_date.asc"
)

movie_ids = []

# Loop through all pages (adjust max page if needed)
for page in range(1, 501):
    url = f"{base_url}&page={page}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract movie_results from the response
    movie_results = response.json().get("results") 
    
    # Stop if we have reached the last page
    if not movie_results:
        break

    # Extract movie_ids from the movie_results
    movie_ids.extend([movie["id"] for movie in movie_results])
    
    sleep(REQUEST_DELAY_SECONDS)

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 2801


Then we request movie data for all these movie_ids, and store that in a dataframe and a CSV file

In [3]:
all_movies = []

# Loop through all movie_ids and request movie data
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"movie_id: {movie_id} Status code: {response.status_code} Text: {response.text}")
    
    # Extract the movie data from the response as a dictionary
    movie = response.json()

    # Find the directors and put director_person_ids in the dictionary
    movie["director_person_ids"] = []
    for credit in movie["credits"]["crew"]:
        if(credit["job"]=="Director"):
            movie["director_person_ids"].append(credit["id"])

    # Simplify child dictionaries to list of ids
    movie["genre_ids"] = [genre["id"] for genre in movie["genres"]]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in movie["production_companies"]]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    movie["collection_id"] = movie["belongs_to_collection"]["id"] if movie["belongs_to_collection"] else pd.NA
    movie["cast_person_ids"] = [cast_member["id"] for cast_member in movie["credits"]["cast"]]
    movie["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in movie["credits"]["cast"]]
    movie["crew_person_ids"] = [crew_member["id"] for crew_member in movie["credits"]["crew"]]
    movie["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in movie["credits"]["crew"]]
    del movie['genres']
    del movie['production_companies']
    del movie["belongs_to_collection"]   
    del movie["credits"]
    
    all_movies.append(movie)
    sleep(REQUEST_DELAY_SECONDS)  

df_movies = pd.DataFrame(all_movies)
df_movies.rename(columns={"id": "movie_id"}, inplace=True)
df_movies.to_csv("../movie_data/movies.csv", index=False)
print(f"Number of movies collected: {len(df_movies)}")
df_movies

Number of movies collected: 2801


Unnamed: 0,adult,backdrop_path,budget,homepage,movie_id,imdb_id,origin_country,original_language,original_title,overview,...,vote_average,vote_count,director_person_ids,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,/n92EzFFg2cMkJiEhnMfD5sKHSH3.jpg,80000000,http://movies.disney.com/fantasia-2000,49948,tt0120910,[US],en,Fantasia 2000,"Blending lively music and brilliant animation,...",...,7.000,1324,"[5690, 56146, 74299, 65531, 74297, 12824, 5614...","[16, 10751, 10402]",[2],55427,"[67773, 166002, 13301, 73931, 15152, 37221, 74...","[52fe47b1c3a36847f814324f, 547f6942925141239b0...","[2066428, 5690, 56146, 74299, 65531, 74297, 12...","[624f173b24f2ce009ddf2543, 52fe47b1c3a36847f81..."
1,False,/rNOeiC5uruGnr5n7YW8hvnrbX9q.jpg,0,,10898,tt0240684,[US],en,The Little Mermaid II: Return to the Sea,"Set several years after the first film, Ariel ...",...,6.397,1747,[60725],"[16, 12, 10751, 35]","[3475, 5391]",33085,"[63978, 67392, 15762, 35232, 67393, 9601, 7133...","[52fe43cc9251416c7501e85d, 52fe43cc9251416c750...","[60725, 67394, 67395, 67396, 61386, 67397, 607...","[52fe43cc9251416c7501e873, 52fe43cc9251416c750..."
2,False,/AbFWty0o5nKGo4iLJaGRgqFtC8W.jpg,40000000,,4234,tt0134084,[US],en,Scream 3,While Sidney Prescott and her friends visit th...,...,5.995,3807,[5140],"[27, 9648]","[7405, 85, 1600]",2602,"[15234, 9206, 14405, 18352, 35595, 2714, 35598...","[52fe43b4c3a36847f806911d, 52fe43b4c3a36847f80...","[5140, 26458, 15244, 409, 35581, 26458, 7229, ...","[52fe43b4c3a36847f80690cb, 52fe43b4c3a36847f80..."
3,False,/mZGwhwIwYuF0G9XLS8j23dhNfUC.jpg,23000000,https://www.uphe.com/movies/pitch-black,2787,tt0134847,[US],en,Pitch Black,When their ship crash-lands on a remote planet...,...,6.851,4599,[28239],"[53, 878, 28]",[10201],2794,"[12835, 8329, 6614, 28099, 26054, 65827, 28098...","[52fe436dc3a36847f80535fd, 52fe436dc3a36847f80...","[28239, 13673, 28240, 9185, 63920, 63921, 1412...","[52fe436dc3a36847f805361f, 52fe436dc3a36847f80..."
4,False,/dA8CmAfzxVwNTcrUWTkunh7ZPqk.jpg,41300000,,2069,tt0190138,[US],en,The Whole Nine Yards,After a mobster agrees to cooperate with an FB...,...,6.465,1970,[21217],"[35, 80]","[53013, 53014]",103577,"[14408, 62, 57395, 2956, 61981, 2165, 7166, 16...","[52fe4331c3a36847f80418c5, 52fe4331c3a36847f80...","[21219, 21222, 14712, 21218, 21221, 21220, 212...","[52fe4331c3a36847f8041899, 52fe4331c3a36847f80..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796,False,/sRLC052ieEzkQs9dEtPMfFxYkej.jpg,83000000,https://www.netflix.com/title/81464239,848326,tt14998742,[US],en,Rebel Moon - Part One: A Child of Fire,When the ruthless forces of the Motherworld th...,...,6.242,2385,[15217],"[878, 28, 12]","[114152, 156880]",934765,"[568657, 91520, 1047649, 938, 21688, 1564557, ...","[61818dfd11386c002a9b6ed9, 6250b2765a07f50050f...","[3486271, 15217, 17285, 1005508, 2104243, 1830...","[624462cdc50ad20047008aad, 626038010792e1151e8..."
2797,False,/bq9FpkTw9I3s1cSRuZxQibM2xOx.jpg,15000000,https://a24films.com/films/the-zone-of-interest,467244,tt7160372,[US],en,The Zone of Interest,"The commandant of Auschwitz, Rudolf Höss, and ...",...,7.005,2304,[66728],"[18, 36, 10752]","[41077, 6705, 103376, 27543, 113962]",,"[71374, 7152, 4072267, 4452138, 4072269, 40722...","[635c3ad9f28838007b15e2fe, 635c3acb88c65900823...","[1341727, 4222, 70604, 1623709, 1389542, 41551...","[620956a6fab3fa0132ea9f79, 6078629118864b002ce..."
2798,False,/bckxSN9ueOgm0gJpVJmPQrecWul.jpg,205000000,https://www.aquamanmovie.com,572802,tt9663764,[US],en,Aquaman and the Lost Kingdom,Black Manta seeks revenge on Aquaman for his f...,...,6.600,3223,[2127],"[28, 12, 14]","[174, 11565, 76907, 128064, 216687]",573693,"[117642, 17178, 1639847, 79082, 55085, 2227, 1...","[65b4239557530e0147d981bc, 5f41dea381a7fc00360...","[52600, 2127, 36, 1354914, 1545447, 62813, 212...","[60774e1d19ab59004062d106, 6078ca802faf4d0078a..."
2799,False,/j9eOeLlTGoHoM8BNUJVNyWmIvCi.jpg,25000000,https://www.anyonebutyou.movie,1072790,tt26047818,[US],en,Anyone But You,"After an amazing first date, Bea and Ben’s fie...",...,6.837,2735,[82511],"[10749, 35]","[7291, 105052, 124283, 5]",,"[115440, 83271, 3085680, 2728596, 2495673, 125...","[63bf14f4df857c0089269b1a, 63bf1503df857c007cb...","[1963222, 1325188, 2761446, 4321486, 4321489, ...","[652be8e4f2883802a25e99af, 652be8fa358da700c6f..."


First we find director_person_ids list.

Then we find the top 10 highest credited actors for each movie.

And we combine these two lists to a unique_person_ids set

In [4]:
director_person_ids = df_movies['director_person_ids'].explode().dropna().tolist()

top_10_cast_person_ids = (
    df_movies['cast_person_ids']
    .apply(lambda ids: ids[:10])   # slice first 10 for each row
    .explode()                     # flatten into one Series
    .dropna()
    .tolist()                      # convert to plain list
)

unique_person_ids = set(top_10_cast_person_ids) | set(director_person_ids)
len(unique_person_ids)

12475

And then we request data for all these unique_person_ids, and store that in a dataframe and a CSV file

In [5]:
all_persons = []

for person_id in unique_person_ids:
    url = f"https://api.themoviedb.org/3/person/{person_id}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"person_id: {person_id} Status code: {response.status_code} Text: {response.text}")
    
    # Extract the person data from the response
    person = response.json()
    
    all_persons.append(person)
    
    sleep(REQUEST_DELAY_SECONDS)
    
    
df_persons = pd.DataFrame(all_persons)
df_persons.rename(columns={"id": "person_id"}, inplace=True)
df_persons.to_csv("../movie_data/persons.csv", index=False)
print(f"Number of persons collected: {len(df_persons)}")
df_persons


Number of persons collected: 12475


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,person_id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[George Walton Lucas Jr. , George Walton Lucas...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2,,1,nm0000184,Directing,George Lucas,"Modesto, California, USA",1.6016,/mDLDvsx8PaZoEThkBdyaG1JxPdf.jpg
1,False,"[Mark Hamil, Mark Richard Hamill, Patrick Will...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2,,2,nm0000434,Acting,Mark Hamill,"Oakland, California, USA",2.3873,/2ZulC2Ccq1yv3pemusks6Zlfy2s.jpg
2,False,[Harrison J. Ford],Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2,,3,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",3.1719,/zVnHagUvXkR2StdOtquEwsiwSVt.jpg
3,False,"[Carrie Frances Fisher , Кэрри Фрэнсис Фишер, ...","Carrie Frances Fisher (October 21, 1956 – Dece...",1956-10-21,2016-12-27,1,https://carriefisher.com/,4,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",0.8295,/awb4UqzT6meD3JiQlraIzAqcRtH.jpg
4,False,"[林-曼努尔·米兰达, لین-منوئل میراندا]",Lin-Manuel Miranda (/mænˈwɛl/; born January 16...,1980-01-16,,2,http://www.linmanuel.com/,1179651,nm0592135,Acting,Lin-Manuel Miranda,"Manhattan, New York City, New York, USA",1.1104,/r0wFwPa041pZ1QM66yJWuQXCkqx.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12470,False,"[Roxanna Ortega, Roxanne Ortega]",,,,1,http://roxanaortega.com/,98292,nm0650945,Acting,Roxana Ortega,"Los Angeles, California, USA",0.2401,/tAkMeufq6ruljugyHhjvtIe72H4.jpg
12471,False,[],,1968-12-21,,2,,65525,nm0564586,Directing,Howard McCain,,0.9515,
12472,False,[],"Cynthia Stevenson (born August 2, 1962 in Oakl...",1962-08-02,,1,,65528,nm0828906,Acting,Cynthia Stevenson,"Piedmont, California, USA",0.9425,/zdrsUVw9F0U4onvh1rS1AQor4wp.jpg
12473,False,[Egidio Cafarelli ],Gino Cafarelli (born 28 February 1969) is an A...,1969-02-28,,2,,1179643,nm1388699,Acting,Gino Cafarelli,"Queens, New York City, New York, USA",0.2527,/x9sMMZ9BSL21eYXCvYuLXgM3Glr.jpg


POSSIBLE EXTENSION: Now we get credits details for the top 10 actors

In [6]:
# top_10_cast_credit_ids = (
#     df_movies['cast_credit_ids']
#     .apply(lambda ids: ids[:10])   # slice first 10 for each row
#     .explode()                     # flatten into one Series
#     .tolist()                      # convert to plain list
# )
# len(top_10_cast_credit_ids)

In [7]:
# all_credits = []

# for credit_id in top_10_cast_credit_ids:
#     url = f"https://api.themoviedb.org/3/person/{person_id}"
#     response = requests.get(url, headers=headers)
#     if response.status_code != 200:
#         raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
#     # Extract the person data from the response
#     person = response.json()
    
#     all_persons.append(person)
    
#     sleep(REQUEST_DELAY_SECONDS)
    
    
# df_persons = pd.DataFrame(all_persons)
# df_persons.rename(columns={"id": "person_id"}, inplace=True)
# df_persons.to_csv("../movie_data/persons.csv", index=False)
# print(f"Number of persons collected: {len(df_persons)}")
# df_persons