# read_tmdb_data.ipynb

This notebook reads JSON data from TMDB's API and stores it as CSV files:
- **movies_min_10_votes.csv**: Data on approx. 27000 movies from United States from 2000-2023
- **persons_min_10_votes.csv**: Data on the directors and top 10 cast in these movies.
- POSSIBLE EXTENSION: **credits.csv**: Data on credits for all persons in these movies (both cast and crew).


In [1]:
import os
import requests
from dotenv import load_dotenv
from time import sleep
import pandas as pd

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

REQUEST_DELAY_SECONDS = 0.02

First we request movie_ids for all movies that live up to these requirements:
- From the United States
- From the years 2000-2023
- With original_langauge = english
- With TMDB vote count ≥ 10

In [2]:
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&with_original_language=en"
    "&with_origin_country=US"
    "&vote_count.gte=10"
    "&primary_release_date.gte=2000-01-01"
    "&primary_release_date.lte=2023-12-31"
    "&sort_by=primary_release_date.asc"
)

movie_ids = []

# Loop through all pages (TMDB has a maximum of 500 pages)
for page in range(1, 501):
    url = f"{base_url}&page={page}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"page: {page} Status code: {response.status_code} Text: {response.text}")
    
    # Extract movie_results from the response
    movie_results = response.json().get("results") 
    
    # Stop if we have reached the last page
    if not movie_results:
        break

    # Extend movie_ids list with the ones from movie_results
    movie_ids.extend([movie["id"] for movie in movie_results])
    
    sleep(REQUEST_DELAY_SECONDS)

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 10000


Then we request movie data for all these movie_ids, and store that in a dataframe and a CSV file

In [3]:
all_movies = []

# Loop through all movie_ids and request movie data
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"movie_id: {movie_id} Status code: {response.status_code} Text: {response.text}")
    
    # Extract the movie data from the response as a dictionary
    movie = response.json()

    # Find the directors and put director_person_ids in the dictionary
    movie["director_person_ids"] = []
    for credit in movie["credits"]["crew"]:
        if(credit["job"]=="Director"):
            movie["director_person_ids"].append(credit["id"])

    # Simplify child dictionaries to list of ids
    movie["genre_ids"] = [genre["id"] for genre in movie["genres"]]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in movie["production_companies"]]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    movie["collection_id"] = movie["belongs_to_collection"]["id"] if movie["belongs_to_collection"] else pd.NA
    movie["cast_person_ids"] = [cast_member["id"] for cast_member in movie["credits"]["cast"]]
    movie["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in movie["credits"]["cast"]]
    movie["crew_person_ids"] = [crew_member["id"] for crew_member in movie["credits"]["crew"]]
    movie["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in movie["credits"]["crew"]]
    del movie['genres']
    del movie['production_companies']
    del movie["belongs_to_collection"]   
    del movie["credits"]
    
    all_movies.append(movie)
    sleep(REQUEST_DELAY_SECONDS)  

df_movies = pd.DataFrame(all_movies)
df_movies.rename(columns={"id": "movie_id"}, inplace=True)
df_movies.to_csv("../movie_data/movies_min_10_votes.csv", index=False)
print(f"Number of movies collected: {len(df_movies)}")
df_movies

Number of movies collected: 10000


Unnamed: 0,adult,backdrop_path,budget,homepage,movie_id,imdb_id,origin_country,original_language,original_title,overview,...,vote_average,vote_count,director_person_ids,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,/dD90r6NQ8cFgYjjYGSLRQLCdJWN.jpg,0,,515728,tt0191181,[US],en,Hitch,Two friends are on a road trip and a one-sided...,...,4.900,10,[131388],[18],[],,"[1230580, 2030046]","[5ac28e640e0a260c140239f0, 5ae16159c3a36876ab0...","[131388, 1434896, 131388, 131388, 1360103, 270...","[6454f31187a27a011b145151, 6454f309c044290143e..."
1,False,/ifq88qw3vgoKlUyw0OAmPQCSqBc.jpg,0,,300236,tt0259233,[US],en,Carnage: The Legend of Quiltface,Four students set out for the barren Nevada de...,...,2.500,10,[103123],[27],"[4708, 110668]",,"[98740, 99106, 98276, 1685427, 1771744, 177174...","[58bf16e6925141608406b270, 6251f3b9a6c104320b2...","[98868, 103123, 103123, 103123, 1001648, 10016...","[62c5bce19ba86a00ee4ad67d, 62c5bceaf794ad00bf5..."
2,False,/c5UlEYHM2xuSrfxESg1gZSpVAEB.jpg,0,,96716,tt0128977,[US],en,The Bumblebee Flies Anyway,An amnesiac youth tries to piece together his ...,...,6.200,30,[126537],"[18, 10749]","[1596, 1363]",,"[109, 21197, 16407, 1223778, 38581, 303197, 56...","[52fe49be9251416c750d1f8f, 52fe49be9251416c750...","[23905, 1534680, 68126, 2556479, 35146, 128031...","[60c1e4b32c6b7b002a428454, 60c1e4cb39a45d0040c..."
3,False,,0,,71618,tt0198284,[US],en,After Sex,A group of attractive women get together for a...,...,5.000,27,[176312],"[35, 18, 10749]","[85165, 86531]",,"[170638, 12519, 15110, 3208, 51670, 61962, 117...","[53d9db080e0a2652f0001583, 52fe483ec3a368484e0...","[176312, 1470931, 33008, 17211, 65792, 17210, ...","[52fe483ec3a368484e0ef37f, 5564e59bc3a368740e0..."
4,False,/nKD4M8Oyuh5aE9EBR29A9WnJKxE.jpg,0,,66131,tt0346794,[US],en,A Constant Forge,"One of the great mavericks of cinema, John Cas...",...,6.600,13,[544690],[99],[],,"[11147, 5950, 10556, 10127, 856, 2314, 1629458...","[5a9dfc9e0e0a2671fb009de8, 5a9dfc11c3a36842820...","[544690, 2377945, 3399184, 3399177, 1899985, 9...","[52fe472bc3a368484e0b89d9, 61f15d931ad93b00702..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,/irZkfjoKwGwSCo8OD4bLcgKhDJ.jpg,0,,92398,tt1747967,[US],en,Cassadaga,A deaf girl attempts to contact her sister dur...,...,5.021,97,[111059],"[27, 53]","[49871, 49872]",,"[210305, 116474, 7071, 52483, 854818, 930994, ...","[52fe48fb9251416c750b9a8f, 52fe48fb9251416c750...","[1855417, 111059, 1093946, 66534, 1575848, 157...","[6053831c255dba00502a7e61, 52fe48fb9251416c750..."
9996,False,/9KM1CQhvA4pzf2LoUYw0KrKA70z.jpg,0,http://www.nick.com/shows/fred?navid=aLevel,84564,tt1927093,[US],en,Fred 2: Night of the Living Fred,When Fred suspects that his new music teacher ...,...,5.200,90,[58316],"[12, 35, 10751, 10770]","[21237, 6024, 20555, 6023, 1632, 5371]",433061,"[133978, 212290, 76313, 6751, 133980, 42160, 1...","[52fe48fd9251416c9109f003, 52fe48fd9251416c910...","[930974, 112593, 930976, 930982, 17313, 53177,...","[52fe48fd9251416c9109efd5, 52fe48fd9251416c910..."
9997,False,/2rvkAT18s7mSlVUn35YF5LuqKn7.jpg,300,http://www.553am.com/,78339,tt1980185,[US],en,Memory Lane,An orphaned war-veteran routinely travels betw...,...,4.500,20,[583766],"[53, 80, 878]",[],,"[583768, 583769]","[52fe4999c3a368484e133d33, 52fe4999c3a368484e1...","[583766, 583767]","[52fe4999c3a368484e133d29, 52fe4999c3a368484e1..."
9998,False,/fjGEAM5ozUznhOUyx5Q8DXCpkdU.jpg,0,,77068,tt0780645,[US],en,War of the Dead,Captain Martin Stone is leading a finely-train...,...,4.100,72,[1053404],"[27, 28, 12]",[1269],,"[17290, 108476, 34514, 147734, 231869, 61283, ...","[52fe495ac3a368484e127391, 52fe495ac3a368484e1...","[1053404, 1053404, 60704, 1101418, 18384, 1129...","[52fe495ac3a368484e12738d, 52fe495ac3a368484e1..."


First we find director_person_ids list.

Then we find the top 10 highest credited actors for each movie.

And we combine these two lists to a unique_person_ids set

In [4]:
director_person_ids = df_movies['director_person_ids'].explode().dropna().tolist()

top_10_cast_person_ids = (
    df_movies['cast_person_ids']
    .apply(lambda ids: ids[:10])   # slice first 10 for each row
    .explode()                     # flatten into one Series
    .dropna()
    .tolist()                      # convert to plain list
)

unique_person_ids = set(top_10_cast_person_ids) | set(director_person_ids)
len(unique_person_ids)

44736

And then we request data for all these unique_person_ids, and store that in a dataframe and a CSV file

In [5]:
all_persons = []

for person_id in unique_person_ids:
    url = f"https://api.themoviedb.org/3/person/{person_id}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"person_id: {person_id} Status code: {response.status_code} Text: {response.text}")
    
    # Extract the person data from the response
    person = response.json()
    
    all_persons.append(person)
    
    sleep(REQUEST_DELAY_SECONDS)
    
    
df_persons = pd.DataFrame(all_persons)
df_persons.rename(columns={"id": "person_id"}, inplace=True)
df_persons.to_csv("../movie_data/persons_min_10_votes.csv", index=False)
print(f"Number of persons collected: {len(df_persons)}")
df_persons

Number of persons collected: 44736


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,person_id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[George Walton Lucas Jr. , George Walton Lucas...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2,,1,nm0000184,Directing,George Lucas,"Modesto, California, USA",1.5020,/mDLDvsx8PaZoEThkBdyaG1JxPdf.jpg
1,False,"[Mark Hamil, Mark Richard Hamill, Patrick Will...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2,,2,nm0000434,Acting,Mark Hamill,"Oakland, California, USA",2.4240,/2ZulC2Ccq1yv3pemusks6Zlfy2s.jpg
2,False,[Harrison J. Ford],Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2,,3,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",3.4892,/zVnHagUvXkR2StdOtquEwsiwSVt.jpg
3,False,"[Carrie Frances Fisher , Кэрри Фрэнсис Фишер, ...","Carrie Frances Fisher (October 21, 1956 – Dece...",1956-10-21,2016-12-27,1,https://carriefisher.com/,4,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",1.1379,/awb4UqzT6meD3JiQlraIzAqcRtH.jpg
4,False,[],,,,2,,1835010,nm0529183,Acting,Thomas Lyons,,0.0886,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44731,False,[],,,,0,,1048564,nm2524646,Acting,Robert Axel,,0.0143,
44732,False,[Jewcy That],,,,0,,1048565,nm1188571,Acting,Gil Bar-Sela,,0.0071,/yONVBNtpVh454yq6qVZJtlZeZc4.jpg
44733,False,[],Theodore Bouloukos is a N​ew York-based actor ...,,,2,http://itheodore.com,1048566,nm1678610,Acting,Theodore Bouloukos,,0.2240,/7NbjfvcLyTmP22PRlj7bdd5qhbN.jpg
44734,False,[],,1972-04-14,,2,,131060,nm0195476,Writing,Chris D'Arienzo,"Hastings, Michigan, USA",0.0311,


POSSIBLE EXTENSION: Now we get credits details for the top 10 actors

In [6]:
# top_10_cast_credit_ids = (
#     df_movies['cast_credit_ids']
#     .apply(lambda ids: ids[:10])   # slice first 10 for each row
#     .explode()                     # flatten into one Series
#     .tolist()                      # convert to plain list
# )
# len(top_10_cast_credit_ids)

In [7]:
# all_credits = []

# for credit_id in top_10_cast_credit_ids:
#     url = f"https://api.themoviedb.org/3/person/{person_id}"
#     response = requests.get(url, headers=headers)
#     if response.status_code != 200:
#         raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
#     # Extract the person data from the response
#     person = response.json()
    
#     all_persons.append(person)
    
#     sleep(REQUEST_DELAY_SECONDS)
    
    
# df_persons = pd.DataFrame(all_persons)
# df_persons.rename(columns={"id": "person_id"}, inplace=True)
# df_persons.to_csv("../movie_data/persons.csv", index=False)
# print(f"Number of persons collected: {len(df_persons)}")
# df_persons