# read_tmdb_data.ipynb

This notebook reads JSON data from TMDB's API and stores it as CSV files:
- **movies.csv**: Data on 2,801 movies (count from the last recorded run) from the United States, released 2000-2023
- **credits.csv**: Data on credits for all persons in these movies (both cast and crew).
- **persons.csv**: Data on all actors that appear in these movies.

In [1]:
import os
import requests
from dotenv import load_dotenv
from time import sleep
import pandas as pd

# Pull variables from a local .env file (if there is one) into the environment,
# then read the TMDB API token from it.
load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

# Fail fast: os.getenv returns None when the variable is missing, which would
# otherwise be formatted into the header below as the literal "Bearer None" and
# only surface later as a failed API request.
if not tmdb_api_token:
    raise RuntimeError(
        "TMDB_API_TOKEN is not set. Define it in the environment or in a .env file."
    )

# Headers sent with every TMDB request: ask for JSON and authenticate with the
# bearer token.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}",
}

First we request the movie_ids of all movies that meet these criteria:
- From the United States
- Released in the years 2000-2023
- With original_language = English (`en`)
- With TMDB vote count ≥ 1000

In [2]:
# Discover query: US-origin, English-language movies released 2000-2023 with at
# least 1000 votes, ordered by release date. The page number is appended per request.
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&with_original_language=en"
    "&with_origin_country=US"
    "&vote_count.gte=1000"
    "&primary_release_date.gte=2000-01-01"
    "&primary_release_date.lte=2023-12-31"
    "&sort_by=primary_release_date.asc"
)

MAX_PAGES = 500         # highest page number this loop will request (adjust if needed)
REQUEST_TIMEOUT = 30    # seconds to wait on a single request before giving up
REQUEST_PAUSE = 0.02    # seconds to pause between requests to stay gentle on the API

movie_ids = []
total_results = None    # total match count as reported by the API, if it reports one

# One session for the whole loop so the HTTP connection is reused across pages.
with requests.Session() as session:
    session.headers.update(headers)

    for page in range(1, MAX_PAGES + 1):
        url = f"{base_url}&page={page}"
        # Without a timeout a stalled connection would block this cell forever.
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Status code: {response.status_code} Text: {response.text}")

        payload = response.json()
        movie_results = payload.get("results")

        # Stop if we have gone past the last page (empty or missing result list)
        if not movie_results:
            break

        # Extract movie_ids from the movie_results
        movie_ids.extend(movie["id"] for movie in movie_results)

        # When the response says this was the last page, stop here instead of
        # spending one more request on an empty page.
        total_results = payload.get("total_results", total_results)
        total_pages = payload.get("total_pages")
        if total_pages is not None and page >= total_pages:
            break

        sleep(REQUEST_PAUSE)

# Make truncation visible (e.g. when MAX_PAGES was hit before the last page)
# instead of silently continuing with a partial id list.
if total_results is not None and len(movie_ids) != total_results:
    print(
        f"Warning: collected {len(movie_ids)} ids but the API reported "
        f"{total_results} results - the list may be incomplete."
    )

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 2801


Then we request the movie data (including cast and crew credits) for each of these movie_ids, and store it in a dataframe and a CSV file.

In [3]:
REQUEST_TIMEOUT = 30                      # seconds to wait on a single request
MOVIES_CSV = "../movie_data/movies.csv"   # output file for the movie table


def _flatten_movie(movie):
    """Return a copy of a TMDB movie-details dict with nested objects reduced to ids/codes.

    Parameters
    ----------
    movie : dict
        Parsed JSON of a ``/movie/{id}`` response requested with
        ``append_to_response=credits``. Must contain the keys ``genres``,
        ``spoken_languages``, ``production_companies``, ``production_countries``,
        ``belongs_to_collection`` and ``credits`` (with ``cast`` and ``crew``).

    Returns
    -------
    dict
        A new dict; the input is left unmodified. ``spoken_languages`` and
        ``production_countries`` become lists of codes, id-list keys are added
        for genres, production companies, collection, cast and crew, and the
        nested ``genres``, ``production_companies``, ``belongs_to_collection``
        and ``credits`` entries are removed.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from ``movie``.
    """
    flat = dict(movie)
    cast = movie["credits"]["cast"]
    crew = movie["credits"]["crew"]
    collection = movie["belongs_to_collection"]

    # Assignment order matters: it determines the column order of the DataFrame
    # (and therefore of the CSV) built from these dicts.
    flat["genre_ids"] = [genre["id"] for genre in movie["genres"]]
    flat["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    flat["production_company_ids"] = [company["id"] for company in movie["production_companies"]]
    flat["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    # Movies that are not part of a collection have a falsy value here.
    flat["collection_id"] = collection["id"] if collection else pd.NA
    flat["cast_person_ids"] = [cast_member["id"] for cast_member in cast]
    flat["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in cast]
    flat["crew_person_ids"] = [crew_member["id"] for crew_member in crew]
    flat["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in crew]

    # Drop the nested originals now that their ids have been extracted.
    for nested_key in ("genres", "production_companies", "belongs_to_collection", "credits"):
        del flat[nested_key]

    return flat


all_movies = []

# One session for the whole loop so the HTTP connection is reused across requests.
with requests.Session() as session:
    session.headers.update(headers)

    # Loop through all movie_ids and request movie data (with credits appended)
    for movie_id in movie_ids:
        url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
        # Without a timeout a stalled connection would block this cell forever.
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Status code: {response.status_code} Text: {response.text}")

        all_movies.append(_flatten_movie(response.json()))
        sleep(0.02)  # Short pause between requests to stay gentle on the API

df_movies = pd.DataFrame(all_movies).rename(columns={"id": "movie_id"})

# Create the output directory first so a missing folder cannot throw away the
# result of the long download loop above.
os.makedirs(os.path.dirname(MOVIES_CSV), exist_ok=True)
df_movies.to_csv(MOVIES_CSV, index=False)
print(f"Total movies collected: {len(df_movies)}")

Total movies collected: 2801


In [4]:
# Display the collected movies; the notebook renders a truncated preview
# (first/last rows, middle columns elided).
df_movies

Unnamed: 0,adult,backdrop_path,budget,homepage,movie_id,imdb_id,origin_country,original_language,original_title,overview,...,video,vote_average,vote_count,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,/n92EzFFg2cMkJiEhnMfD5sKHSH3.jpg,80000000,http://movies.disney.com/fantasia-2000,49948,tt0120910,[US],en,Fantasia 2000,"Blending lively music and brilliant animation,...",...,False,7.000,1324,"[16, 10751, 10402]",[2],55427,"[67773, 166002, 13301, 73931, 15152, 37221, 74...","[52fe47b1c3a36847f814324f, 547f6942925141239b0...","[2066428, 5690, 56146, 74299, 65531, 74297, 12...","[624f173b24f2ce009ddf2543, 52fe47b1c3a36847f81..."
1,False,/rNOeiC5uruGnr5n7YW8hvnrbX9q.jpg,0,,10898,tt0240684,[US],en,The Little Mermaid II: Return to the Sea,"Set several years after the first film, Ariel ...",...,False,6.397,1747,"[16, 12, 10751, 35]","[3475, 5391]",33085,"[63978, 67392, 15762, 35232, 67393, 9601, 7133...","[52fe43cc9251416c7501e85d, 52fe43cc9251416c750...","[60725, 67394, 67395, 67396, 61386, 67397, 607...","[52fe43cc9251416c7501e873, 52fe43cc9251416c750..."
2,False,/AbFWty0o5nKGo4iLJaGRgqFtC8W.jpg,40000000,,4234,tt0134084,[US],en,Scream 3,While Sidney Prescott and her friends visit th...,...,False,5.995,3807,"[27, 9648]","[7405, 85, 1600]",2602,"[15234, 9206, 14405, 18352, 35595, 2714, 35598...","[52fe43b4c3a36847f806911d, 52fe43b4c3a36847f80...","[35594, 35581, 27226, 26458, 21641, 26458, 409...","[52fe43b4c3a36847f8069119, 52fe43b4c3a36847f80..."
3,False,/mZGwhwIwYuF0G9XLS8j23dhNfUC.jpg,23000000,https://www.uphe.com/movies/pitch-black,2787,tt0134847,[US],en,Pitch Black,When their ship crash-lands on a remote planet...,...,False,6.852,4597,"[53, 878, 28]",[10201],2794,"[12835, 8329, 6614, 28099, 26054, 65827, 28098...","[52fe436dc3a36847f80535fd, 52fe436dc3a36847f80...","[28239, 13673, 28240, 9185, 63920, 63921, 1412...","[52fe436dc3a36847f805361f, 52fe436dc3a36847f80..."
4,False,/dA8CmAfzxVwNTcrUWTkunh7ZPqk.jpg,41300000,,2069,tt0190138,[US],en,The Whole Nine Yards,After a mobster agrees to cooperate with an FB...,...,False,6.465,1970,"[35, 80]","[53013, 53014]",103577,"[14408, 62, 57395, 2956, 61981, 2165, 7166, 16...","[52fe4331c3a36847f80418c5, 52fe4331c3a36847f80...","[21219, 21222, 21218, 21217, 21220, 21223, 147...","[52fe4331c3a36847f8041899, 52fe4331c3a36847f80..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796,False,/sRLC052ieEzkQs9dEtPMfFxYkej.jpg,83000000,https://www.netflix.com/title/81464239,848326,tt14998742,[US],en,Rebel Moon - Part One: A Child of Fire,When the ruthless forces of the Motherworld th...,...,False,6.242,2384,"[878, 28, 12]","[114152, 156880]",934765,"[568657, 91520, 1047649, 938, 21688, 1564557, ...","[61818dfd11386c002a9b6ed9, 6250b2765a07f50050f...","[3486271, 15217, 17285, 1005508, 2104243, 1830...","[624462cdc50ad20047008aad, 626038010792e1151e8..."
2797,False,/bq9FpkTw9I3s1cSRuZxQibM2xOx.jpg,15000000,https://a24films.com/films/the-zone-of-interest,467244,tt7160372,[US],en,The Zone of Interest,"The commandant of Auschwitz, Rudolf Höss, and ...",...,False,7.006,2303,"[18, 36, 10752]","[41077, 6705, 103376, 27543, 113962]",,"[71374, 7152, 4072267, 4452138, 4072269, 40722...","[635c3ad9f28838007b15e2fe, 635c3acb88c65900823...","[1341727, 4222, 66728, 1389542, 1317874, 16237...","[620956a6fab3fa0132ea9f79, 6078629118864b002ce..."
2798,False,/bckxSN9ueOgm0gJpVJmPQrecWul.jpg,205000000,https://www.aquamanmovie.com,572802,tt9663764,[US],en,Aquaman and the Lost Kingdom,Black Manta seeks revenge on Aquaman for his f...,...,False,6.600,3223,"[28, 12, 14]","[174, 11565, 76907, 128064, 216687]",573693,"[117642, 17178, 1639847, 79082, 55085, 2227, 1...","[65b4239557530e0147d981bc, 5f41dea381a7fc00360...","[52600, 2127, 36, 1354914, 142686, 2127, 62813...","[60774e1d19ab59004062d106, 6078ca802faf4d0078a..."
2799,False,/j9eOeLlTGoHoM8BNUJVNyWmIvCi.jpg,25000000,https://www.anyonebutyou.movie,1072790,tt26047818,[US],en,Anyone But You,"After an amazing first date, Bea and Ben’s fie...",...,False,6.837,2735,"[10749, 35]","[7291, 105052, 124283, 5]",,"[115440, 83271, 3085680, 2728596, 2495673, 125...","[63bf14f4df857c0089269b1a, 63bf1503df857c007cb...","[4209661, 1963222, 2148336, 1325188, 2647144, ...","[652be8db358da7011dd8a612, 652be8e4f2883802a25..."


In [5]:
# Summarise each column's non-null count and dtype to spot missing values.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2801 entries, 0 to 2800
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   adult                   2801 non-null   bool   
 1   backdrop_path           2800 non-null   object 
 2   budget                  2801 non-null   int64  
 3   homepage                2801 non-null   object 
 4   movie_id                2801 non-null   int64  
 5   imdb_id                 2800 non-null   object 
 6   origin_country          2801 non-null   object 
 7   original_language       2801 non-null   object 
 8   original_title          2801 non-null   object 
 9   overview                2801 non-null   object 
 10  popularity              2801 non-null   float64
 11  poster_path             2800 non-null   object 
 12  production_countries    2801 non-null   object 
 13  release_date            2801 non-null   object 
 14  revenue                 2801 non-null   

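Then we run through the movie dataframe and find all unique actor_ids (the person ids in `cast_person_ids`).

And then we request data for all these actor_ids, and store that in a dataframe and a CSV file.

**Note:** the cells for this step are currently commented out, so running the notebook as-is does not fetch or write any person data.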

In [6]:
# # Flatten and combine the two columns
# # all_person_ids = df_movies['cast_person_ids'].explode().tolist() + df_movies['crew_person_ids'].explode().tolist()

# all_person_ids = df_movies['cast_person_ids'].explode().tolist()

# # Get unique person IDs
# unique_person_ids = set(all_person_ids)
# len(unique_person_ids)

In [7]:
# all_persons = []

# for person_id in unique_person_ids:
#     url = f"https://api.themoviedb.org/3/person/{person_id}"
#     response = requests.get(url, headers=headers)
#     if response.status_code != 200:
#         raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
#     # Extract the person data from the response
#     person = response.json()
    
#     all_persons.append(person)
#     sleep(0.02)  # Just to be sure we don't request too many requests
    
    
# df_persons = pd.DataFrame(all_persons)
# df_persons


In [8]:
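# NOTE(review): `danish_actors` is not defined in any cell visible in this notebook;
# this cell looks like a leftover from another notebook. Define it (or drop the merge)
# before re-enabling this code.
# df_persons.rename(columns={'id': 'actor_id'}, inplace=True)
# danish_actors = pd.merge(danish_actors, df_persons, on='actor_id')
# danish_actors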

In [9]:
# danish_actors.to_csv("../movie_data/danish_actors.csv", index=False)