# read_tmdb_data.ipynb

This notebook reads JSON data from TMDB's API and stores it as CSV files:
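- **danish_movies.csv**: Data on Danish-language movies released up to and including 2024-12-31. Note that the discover query below currently also sets a lower bound of 2024-01-01, so as written only movies released in 2024 are fetched.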
- **danish_actors.csv**: Data on all actors who appear in these movies (including non-Danish actors).

In [1]:
import os
import requests
from dotenv import load_dotenv
from time import sleep
import pandas as pd

# Load variables from a local .env file (if one exists) into the environment,
# then read the TMDB API token from it.
load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

# Fail fast with a clear message: without this check a missing token would be
# formatted into the header below as the literal string "Bearer None".
if not tmdb_api_token:
    raise RuntimeError(
        "TMDB_API_TOKEN is not set. Define it in the environment or in a .env file."
    )

# Headers sent with every TMDB request made in this notebook.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

First, we get the movie_ids of all movies that have Danish as their original_language.

In [2]:
# Query for TMDB's discover endpoint: movies whose original language is Danish
# and whose primary release date lies inside the window below, oldest first.
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&sort_by=primary_release_date.asc"
    "&with_original_language=da"
    "&primary_release_date.gte=2024-01-01"
    "&primary_release_date.lte=2024-12-31"
)

movie_ids = []

# Loop through all pages (adjust max page if needed)
for page in range(1, 501):
    url = f"{base_url}&page={page}"
    # The timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    # Extract movie_results from the response
    movie_results = response.json().get("results")

    # Stop if we have reached the last page
    if not movie_results:
        break

    # Extract movie_ids from the movie_results
    movie_ids.extend(movie["id"] for movie in movie_results)

    sleep(0.02)  # Just to be sure we don't send too many requests
else:
    # The loop used up every page without seeing an empty one, so there may be
    # more results than were fetched. Say so instead of truncating silently.
    print("Warning: page limit reached; the list of movie ids may be incomplete.")

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 207


Then we get the movie data for all of these movie_ids and store it in a dataframe and a CSV file.

In [3]:
all_movies = []

# Loop through all movie_ids and request movie data (details plus credits)
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    # The timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    # Extract the movie data from the response
    movie = response.json()

    # Simplify nested dictionaries to flat lists of ids / codes.
    # The assignment order is kept as-is because it determines the column order
    # of the resulting dataframe.
    movie["genre_ids"] = [genre["id"] for genre in movie["genres"]]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in movie["production_companies"]]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]

    # belongs_to_collection is falsy for movies that are not part of a collection
    collection = movie["belongs_to_collection"]
    movie["collection_id"] = collection.get("id") if collection else None

    movie["cast_person_ids"] = [cast_member["id"] for cast_member in movie["credits"]["cast"]]
    movie["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in movie["credits"]["cast"]]
    movie["crew_person_ids"] = [crew_member["id"] for crew_member in movie["credits"]["crew"]]
    movie["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in movie["credits"]["crew"]]

    # Drop the nested originals now that their ids have been extracted
    del movie["genres"]
    del movie["production_companies"]
    del movie["belongs_to_collection"]
    del movie["credits"]

    all_movies.append(movie)
    sleep(0.02)  # Just to be sure we don't send too many requests

df_movies = pd.DataFrame(all_movies)
print(f"Total movies collected: {len(df_movies)}")

# Make sure the output directory exists, so the download above is not lost to a
# failed write at the very end.
os.makedirs("../movie_data", exist_ok=True)
df_movies.to_csv("../movie_data/danish_movies.csv", index=False)

Total movies collected: 207


In [4]:
# Preview the collected movie data (the bare expression is shown as the cell output)
df_movies

Unnamed: 0,adult,backdrop_path,budget,homepage,id,imdb_id,origin_country,original_language,original_title,overview,...,video,vote_average,vote_count,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,/bqWIsqRMeu86BwGvQPs9rhlRE9u.jpg,0,,1247285,,[DK],da,Topmøde,The three club bosses meet and give their view...,...,False,0.0,0,[],[],,[],[],[],[]
1,False,/m6p6eBNSjiYtOaDCj3xQwgQXodd.jpg,0,,1237423,,[DK],da,Anne Bakland: Bullshit,"Shut up, there are a lot of things tumbling ar...",...,False,6.0,1,[35],[156967],,[2504733],[65b6d7318741c4014a9128ce],[],[]
2,False,/2XyDZJHoxfs7RHEVSQJ1uB5QOTZ.jpg,0,,1220696,tt28521863,[DK],da,Dressage,,...,False,10.0,2,[],[],,"[558626, 4441052, 4428845, 4478158]","[65836b80f1759c3fa210e891, 65836bb685867855daf...","[4428869, 4005580, 4666868, 2501422, 4666871, ...","[65a0582a96670e0129aba875, 65a0583f7caa470131c..."
3,False,/uoinIvWZmgKfUuEnZj4vXmRIejD.jpg,0,,1226857,,[DK],da,Grin til Gavn 2024,"One of Denmark's most hyped comedy duos, Marti...",...,False,7.0,2,[35],"[54911, 758]",,"[2934371, 2934372, 1556815, 2504733, 221190, 8...","[65984d0b89b56101a4c37160, 65984d160d11f201497...","[2934372, 2934371, 5422513, 5539604, 4994889, ...","[68651ca202ab41cc08fb8507, 68651caa8888a8af7d7..."
4,False,/cVryiKn7n6xtszH0X5Q0z7BV9Lt.jpg,0,,1227125,,[DK],da,Dronning Margrethe 2. - 52 år som Danmarks regent,Get up close to Queen Margrethe II as we paint...,...,False,0.0,0,[],[758],,"[1184201, 4470379]","[659be1270d11f2025caf1017, 659be14b89b561240fc...",[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,False,/MSP35R6y0ypmf3zU1kyUw1P4It.jpg,0,,1407803,,[DK],da,Verdensmænd - Bobos surprise,,...,False,10.0,1,[35],[758],,"[1828049, 234058, 1355004, 88356, 2208065]","[676d44614ca2fbbfca614283, 676d44784ca2fbbfca6...",[],[]
203,False,/2iVVzCEnX4QMEeyd3e52juYcsH8.jpg,0,,1410638,,[DK],da,Gud bevare Danmark,,...,False,0.0,0,[],[],,[],[],[],[]
204,False,/ej5uQVMJTzq8nZBCbf6pUMNF6jl.jpg,0,,1409581,,[DK],da,Frank Hvam - Nobody,Frank Hvam has lived in New Zealand with his f...,...,False,6.0,1,[35],[758],,[89973],[67724668d8dc22e6c6927586],"[89973, 5520795, 563909, 5227452, 4263643, 516...","[68650819af3c4b2566414b34, 686508769febd451ea8..."
205,False,,0,,1412034,,[DK],da,"Dan Andersen - Nedsat Hørelse, Nedsat Sædkvali...",,...,False,0.0,0,[35],[],,[149860],[677903662b097b15a274ab1e],[],[]


Run through the movie dataframe and create a list of unique actor_ids, each with the movie_ids of the movies that actor appears in.

In [5]:
# Step 1: Turn each cast_person_ids list into one row per cast member,
# repeating the movie id on every such row
df_exploded = df_movies.loc[:, ["id", "cast_person_ids"]].explode("cast_person_ids")
df_exploded

Unnamed: 0,id,cast_person_ids
0,1247285,
1,1237423,2504733
2,1220696,558626
2,1220696,4441052
2,1220696,4428845
...,...,...
202,1407803,2208065
203,1410638,
204,1409581,89973
205,1412034,149860


In [6]:
# Step 2: Give both columns names that say what they hold
df_exploded = df_exploded.rename(
    columns={"id": "movie_ids", "cast_person_ids": "actor_id"},
)
df_exploded

Unnamed: 0,movie_ids,actor_id
0,1247285,
1,1237423,2504733
2,1220696,558626
2,1220696,4441052
2,1220696,4428845
...,...,...
202,1407803,2208065
203,1410638,
204,1409581,89973
205,1412034,149860


In [7]:
# Step 3: Collect, for every actor_id, the set of movie ids that actor appears in.
# Rows without an actor_id (movies with an empty cast list) do not form a group,
# because groupby leaves out missing keys by default.
danish_actors = (
    df_exploded
    .groupby("actor_id")["movie_ids"]
    .agg(set)
    .reset_index()
)
danish_actors

Unnamed: 0,actor_id,movie_ids
0,1018,{1317159}
1,1023,"{1382539, 1359071}"
2,1024,{983465}
3,1181,"{1355578, 1466191}"
4,1184,"{1227608, 1290697}"
...,...,...
815,5652166,{1536618}
816,5652167,{1536618}
817,5654128,{1536970}
818,5669894,{1245001}


In [8]:
all_persons = []

# Request the person record for every actor found in the movies
for actor_id in danish_actors['actor_id']:
    url = f"https://api.themoviedb.org/3/person/{actor_id}"
    # The timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    # Extract the person data from the response
    person = response.json()

    all_persons.append(person)
    sleep(0.02)  # Just to be sure we don't send too many requests

# One row per person, one column per field returned by the API
df_persons = pd.DataFrame(all_persons)


In [9]:
# Align the person id column with the key used in danish_actors.
# Reassigning (rather than inplace=True) gives the same end state and is safe to re-run.
df_persons = df_persons.rename(columns={'id': 'actor_id'})

# Attach the person details to the actor -> movie_ids table.
# Selecting only the two key columns keeps this cell idempotent: on a re-run the
# person columns are not merged a second time (which would create *_x / *_y duplicates).
danish_actors = pd.merge(danish_actors[['actor_id', 'movie_ids']], df_persons, on='actor_id')
danish_actors

Unnamed: 0,actor_id,movie_ids,adult,also_known_as,biography,birthday,deathday,gender,homepage,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,1018,{1317159},False,[نیکلای لی کاس],Nikolaj Lie Kaas (born 22 May 1973) is a promi...,1973-05-22,,2,,nm0509263,Acting,Nikolaj Lie Kaas,"Glostrup, Denmark",1.1447,/zRu68e0GSwHJC7QOcbO77xIAM9J.jpg
1,1023,"{1382539, 1359071}",False,[],,1960-03-08,,2,,nm0647757,Acting,Niels Olsen,"Aarhus, Danmark",0.2028,/14DqKFpN2ElvzNjddvkMUly2rdT.jpg
2,1024,{983465},False,[],,1940-11-15,2024-10-28,2,,nm0683397,Acting,Ulf Pilgaard,"Skive, Danmark",0.0913,/108pcSiyuB6A6qO6ZmOWM3GxxQT.jpg
3,1181,"{1355578, 1466191}",False,[],Peter Gantzler is a Danish actor. He is marrie...,1958-09-28,,2,,nm0304644,Acting,Peter Gantzler,Denmark,0.5883,/1oL9VT7YLfaly3BwAcKj5xquoCo.jpg
4,1184,"{1227608, 1290697}",False,"[Anders Wodskou Bertelsen, Anders Bertelsen, A...",,1969-09-28,,2,,nm0077944,Acting,Anders W. Berthelsen,"Rødovre, Danmark",0.3129,/yEaRQLSOTcGsVHCkKEusb6PfUCE.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,5652166,{1536618},False,[],,,,0,,,Acting,Laura Amalie Bülow,,0.0000,
816,5652167,{1536618},False,[],,,,0,,,Acting,Sarah Halfani Madsen,,0.0000,
817,5654128,{1536970},False,[],,,,0,,,Acting,Hugo Rex Skov,,0.0000,
818,5669894,{1245001},False,[],,,,0,,,Acting,Janus Enevoldsen,,0.0143,


In [10]:
# Write the actor table (one row per actor) to CSV without the dataframe index
danish_actors.to_csv(
    "../movie_data/danish_actors.csv",
    index=False,
)