# read_tmdb_data.ipynb

This notebook reads JSON data from TMDB's API and stores it as CSV files:
- **movies.csv**: Data on XXXX movies from United States from 2000-2023
- **credits.csv**: Data on credits for all persons in these movies (both cast and crew).
- **persons.csv**: Data on all actors that appear in these movies.

In [1]:
import os
import requests
from dotenv import load_dotenv
from time import sleep
import pandas as pd

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

REQUEST_DELAY_SECONDS = 0.02

First we request movie_ids for all movies that live up to these requirements:
- From the United States
- From the years 2000-2023
- With original_langauge = english
- With TMDB vote count ≥ 1000

In [2]:
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&with_original_language=en"
    "&with_origin_country=US"
    "&vote_count.gte=30000"
    "&primary_release_date.gte=2000-01-01"
    "&primary_release_date.lte=2023-12-31"
    "&sort_by=primary_release_date.asc"
)

movie_ids = []

# Loop through all pages (adjust max page if needed)
for page in range(1, 501):
    url = f"{base_url}&page={page}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract movie_results from the response
    movie_results = response.json().get("results") 
    
    # Stop if we have reached the last page
    if not movie_results:
        break

    # Extract movie_ids from the movie_results
    movie_ids.extend([movie["id"] for movie in movie_results])
    
    sleep(REQUEST_DELAY_SECONDS)

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 7


Then we request movie data for all these movie_ids, and store that in a dataframe and a CSV file

In [3]:
all_movies = []

# Loop through all movie_ids and request movie data
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract the movie data from the response as a dictionary
    movie = response.json()

    # Find directors and put director_person_ids in the dictionary
    movie["director_person_ids"] = []
    for credit in movie["credits"]["crew"]:
        if(credit["job"]=="Director"):
            movie["director_person_ids"].append(credit["id"])

    # Simplify child dictionaries to list of ids
    movie["genre_ids"] = [genre["id"] for genre in movie["genres"]]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in movie["production_companies"]]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    movie["collection_id"] = movie["belongs_to_collection"]["id"] if movie["belongs_to_collection"] else pd.NA
    movie["cast_person_ids"] = [cast_member["id"] for cast_member in movie["credits"]["cast"]]
    movie["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in movie["credits"]["cast"]]
    movie["crew_person_ids"] = [crew_member["id"] for crew_member in movie["credits"]["crew"]]
    movie["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in movie["credits"]["crew"]]
    del movie['genres']
    del movie['production_companies']
    del movie["belongs_to_collection"]   
    del movie["credits"]
    
    all_movies.append(movie)
    sleep(REQUEST_DELAY_SECONDS)  

df_movies = pd.DataFrame(all_movies)
df_movies.rename(columns={"id": "movie_id"}, inplace=True)
df_movies.to_csv("../movie_data/movies.csv", index=False)
print(f"Total movies collected: {len(df_movies)}")

Total movies collected: 7


In [4]:
df_movies

Unnamed: 0,adult,backdrop_path,budget,homepage,movie_id,imdb_id,origin_country,original_language,original_title,overview,...,vote_average,vote_count,director_person_ids,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,/enNubozHn9pXi0ycTVYUWfpHZm.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,155,tt0468569,[US],en,The Dark Knight,Batman raises the stakes in his war on crime. ...,...,8.523,34459,[525],"[18, 28, 80, 53]","[174, 923, 9996, 429]",263.0,"[3894, 1810, 6383, 3895, 1579, 64, 192, 53651,...","[52fe4220c3a36847f8005d17, 52fe421fc3a36847f80...","[3904, 3893, 561, 10949, 10951, 559, 527, 525,...","[52fe4220c3a36847f8005d35, 52fe4220c3a36847f80..."
1,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,237000000,https://www.avatar.com/movies/avatar,19995,tt0499549,[US],en,Avatar,"In the 22nd century, a paraplegic Marine is di...",...,7.593,32647,[2710],"[28, 12, 14, 878]","[444, 574, 25, 290]",87096.0,"[65731, 8691, 10205, 32747, 17647, 1771, 59231...","[5602a8a7c3a3685532001c9a, 52fe48009251416c750...","[58871, 2710, 1721, 8529, 18265, 6347, 2710, 4...","[52fe48009251416c750aca1d, 52fe48009251416c750..."
2,False,/gqby0RhyehP3uRrzmdyUZ0CgPPe.jpg,160000000,https://www.warnerbros.com/movies/inception,27205,tt1375666,"[US, GB]",en,Inception,"Cobb, a skilled thief who commits corporate es...",...,8.37,38011,[525],"[28, 878, 12]","[923, 9996, 174]",,"[6193, 24045, 3899, 2524, 27578, 95697, 2037, ...","[52fe4534c3a368484e04de03, 52fe4534c3a368484e0...","[3904, 561, 525, 21984, 10958, 559, 11018, 969...","[52fe4534c3a368484e04de45, 52fe4534c3a368484e0..."
3,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,220000000,https://www.marvel.com/movies/the-avengers,24428,tt0848228,[US],en,The Avengers,When an unexpected enemy emerges and threatens...,...,7.812,33209,[12891],"[878, 28, 12]",[420],86311.0,"[3223, 16828, 103, 74568, 1245, 17604, 91606, ...","[52fe4495c3a368484e02b251, 52fe4495c3a368484e0...","[12891, 35176, 18866, 15277, 12891, 12891, 113...","[52fe4495c3a368484e02b1ab, 52fe4495c3a368484e0..."
4,False,/vgnoBSVzWAV9sNQUORaDGvDp7wx.jpg,165000000,http://www.interstellarmovie.net/,157336,tt0816692,[US],en,Interstellar,The adventures of a group of explorers who mak...,...,8.461,37933,[525],"[12, 18, 878]","[923, 9996, 13769]",,"[10297, 1813, 3895, 83002, 1893, 8210, 17052, ...","[52fe4bbf9251416c910e47cb, 57fe146fc3a368504a0...","[947, 2162, 525, 74401, 556, 15327, 7418, 561,...","[52fe4bbf9251416c910e4801, 52fe4bbf9251416c910..."
5,False,/en971MEXui9diirXlogOrPKmsEn.jpg,58000000,https://www.20thcenturystudios.com/movies/dead...,293660,tt1431045,[US],en,Deadpool,The origin story of former Special Forces oper...,...,7.623,31944,[55252],"[28, 12, 35]","[25, 431, 28788, 7505]",448150.0,"[10859, 54882, 1047649, 51990, 78452, 122750, ...","[57169b95925141695b00046f, 552426e29251417be20...","[1201060, 1935206, 1762860, 58910, 23819, 1441...","[60e07fdb071650005d9bfe73, 60e07c2dbdd568005e5..."
6,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,300000000,https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,[US],en,Avengers: Infinity War,As the Avengers and their allies have continue...,...,8.236,30977,"[19272, 19271]","[12, 28, 878]",[420],86311.0,"[3223, 16828, 74568, 16851, 103, 1245, 1896, 7...","[54a9cfa29251414d5b00553d, 54a9cfc0c3a3680c290...","[3019687, 3062791, 1370799, 1848821, 2798471, ...","[6057faf28c44b900749c7f40, 60810ef2f6787a00404..."


In [5]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   adult                   7 non-null      bool   
 1   backdrop_path           7 non-null      object 
 2   budget                  7 non-null      int64  
 3   homepage                7 non-null      object 
 4   movie_id                7 non-null      int64  
 5   imdb_id                 7 non-null      object 
 6   origin_country          7 non-null      object 
 7   original_language       7 non-null      object 
 8   original_title          7 non-null      object 
 9   overview                7 non-null      object 
 10  popularity              7 non-null      float64
 11  poster_path             7 non-null      object 
 12  production_countries    7 non-null      object 
 13  release_date            7 non-null      object 
 14  revenue                 7 non-null      int64 

Then we run through the movie dataframe and make a list of person ids from the top 10 highest credited actors for each movie.

And we combined that with director_person_ids to a unique_person_ids set

In [6]:
top_10_cast_person_ids = (
    df_movies['cast_person_ids']
    .apply(lambda ids: ids[:10])   # slice first 10 for each row
    .explode()                     # flatten into one Series
    .tolist()                      # convert to plain list
)

director_person_ids = df_movies['director_person_ids'].explode().tolist()

unique_person_ids = set(top_10_cast_person_ids) | set(director_person_ids)
len(unique_person_ids)

69

And then we request data for all these unique_person_ids, and store that in a dataframe and a CSV file

In [7]:
all_persons = []

for person_id in unique_person_ids:
    url = f"https://api.themoviedb.org/3/person/{person_id}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract the person data from the response
    person = response.json()
    
    all_persons.append(person)
    
    sleep(REQUEST_DELAY_SECONDS)
    
    
df_persons = pd.DataFrame(all_persons)
df_persons.to_csv("../movie_data/persons.csv", index=False)
print(f"Total persons collected: {len(df_persons)}")
df_persons


Total persons collected: 69


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[Christopher Edward Nolan, Chris Nolan, Sir Ch...","Sir Christopher Edward Nolan, CBE (born July 3...",1970-07-30,,2,,525,nm0634240,Directing,Christopher Nolan,"Westminster, London, England, UK",3.6998,/xuAIuYSmsUzKlUMBFGVZaWsY3DZ.jpg
1,False,"[ჰიტ ლეჯერი, Heath Andrew Ledger]","Heath Andrew Ledger (April 4, 1979 – January 2...",1979-04-04,2008-01-22,2,,1810,nm0005132,Acting,Heath Ledger,"Perth, Western Australia, Australia",3.2485,/p2z2bURSg7nuMsN9P2s61e2RvNz.jpg
2,False,"[Monique Curnen, مونیک گابریلا کورنن]",Monique Gabriela Curnen (born 7 September 1977...,1977-09-07,,1,,53651,nm1010931,Acting,Monique Gabriela Curnen,USA,0.9874,/lJgLQs7cfM49m8VzVviwxIByz76.jpg
3,False,"[웨스 벤틀리, وس بنتلی]","Wesley Cook ""Wes"" Bentley (born September 4, 1...",1978-09-04,,2,,8210,nm0004747,Acting,Wes Bentley,"Jonesboro, Arkansas, USA",1.5220,/voD93lzFZrr9xfAggwFcPRBi84i.jpg
4,False,"[C.C.H. Pounder, Carol Christine Hilaria Pound...",Carol Christine Hilaria Pounder (born December...,1952-12-25,,1,https://cchpounder.net/,30485,nm0001634,Acting,CCH Pounder,"Georgetown, British Guiana [now Guyana]",1.4028,/vJ5Swy2WDBC46zJrbJmwsGgTPJ2.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,False,"[Gina Joy Carano , Джина Карано, 吉娜·卡拉诺, 지나 카라...","Gina Joy Carano (born April 16, 1982) is an Am...",1982-04-16,,1,,78452,nm2442289,Acting,Gina Carano,"Dallas County, Texas, USA",1.3032,/6K89PVbt3v2N7cqbPyZiO74yuOj.jpg
65,False,"[คิลเลียน เมอร์ฟี, キリアン・マーフィー, 基利安·墨菲, 킬리언 머피,...","Cillian Murphy (born May 25, 1976) is an Irish...",1976-05-25,,2,,2037,nm0614165,Acting,Cillian Murphy,"Douglas, Cork, Ireland",4.1256,/llkbyWKwpfowZ6C8peBjIV9jj99.jpg
66,False,"[Стефан Капичич, Stefan Kapičić, Stefan Kapicic]",Stefan Kapičić is a Serbian actor. He is son o...,1978-12-01,,2,,80507,nm1292973,Acting,Stefan Kapičić,"Cologne, North-Rhine-Westphalia, West Germany",0.4891,/6qHO7ckiyjLNOmW0v5RZj1MpLBj.jpg
67,False,[],"Ron Dean (born August 15, 1938) is an American...",1938-08-15,,2,,57597,nm0212939,Acting,Ron Dean,"Chicago, Illinois, USA",0.5438,/mgqdr4VFrTVZatkki2suNLYxeDG.jpg


In [8]:
# df_persons.rename(columns={'id': 'actor_id'}, inplace=True)
# danish_actors = pd.merge(danish_actors, df_persons, on='actor_id')
# danish_actors

In [9]:
# danish_actors.to_csv("../movie_data/danish_actors.csv", index=False)