In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&sort_by=popularity.desc"
    "&with_original_language=da"
    "&primary_release_date.gte=2020-01-01"
    "&primary_release_date.lte=2024-12-31"
)

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

all_results = []

# Loop through all pages (adjust max page if needed)
for page in range(1, 289):
    url = f"{base_url}&page={page}"
    resp = requests.get(url, headers=headers).json()

    # Stop if we reach the last page
    if "results" not in resp or not resp["results"]:
        break

    all_results.extend(resp["results"])
    sleep(0.02)  # Just to be sure we don't request too many requests

df_movies = pd.DataFrame(all_results)
print(f"Total movies collected: {len(df_movies)}")

output_folder = os.path.join(os.getcwd(), "movie_data")
output_path = os.path.join(output_folder, "danish_movies_upto_2024.csv")
df_movies.to_csv(output_path, index=False)

Total movies collected: 931


OSError: Cannot save file into a non-existent directory: 'c:\Users\Tobi\Desktop\movie_data'

In [None]:
# Running through all the movies and getting actors id, name and gender
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

# Make sure df_movies is already loaded or created before this section
movie_ids = df_movies["id"].tolist()

all_credits = []

for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?language=en-US"
    resp = requests.get(url, headers=headers).json()

    for member in resp.get("cast", []):
        all_credits.append({
            "movie_id": movie_id,
            "actor_id": member["id"],
            "actor_name": member["name"],
            "gender": member.get("gender")
        })

    sleep(0.02)  # Just to be sure we don't request too many requests

df_cast = pd.DataFrame(all_credits)

print(f"Credits rows collected: {len(df_cast)}")


output_folder = os.path.join(os.getcwd(), "movie_data")
output_path = os.path.join(output_folder, "danish_movies_cast_credits.csv")
df_cast.to_csv(output_path, index=False)

Credits rows collected: 4767


In [None]:
# Running through all the movies and getting actors id, name and gender
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep
from collections import defaultdict

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

# Make sure df_movies is already loaded or created before this section
movie_ids = df_movies["id"].tolist()

actor_movies = defaultdict(lambda: {"actor_name": "", "movie_ids": set(), "gender": None})

for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?language=en-US"
    resp = requests.get(url, headers=headers).json()

    for member in resp.get("cast", []):
        actor_id = member["id"]
        actor_movies[actor_id]["actor_name"] = member["name"]
        actor_movies[actor_id]["gender"] = member.get("gender")
        actor_movies[actor_id]["movie_ids"].add(movie_id)

    sleep(0.02)  # Avoid too many rapid requests

# Convert defaultdict to list of dicts
rows = []
for actor_id, info in actor_movies.items():
    rows.append({
        "actor_id": actor_id,
        "actor_name": info["actor_name"],
        "gender": info["gender"],
        "movie_ids": list(info["movie_ids"])  # Convert set to list for CSV
    })

df_actors = pd.DataFrame(rows)

print(f"Total unique actors collected: {len(df_actors)}")
output_folder = os.path.join(os.getcwd(), "movie_data")
output_path = os.path.join(output_folder, "danish_actors_movies_2024.csv")
df_actors.to_csv(output_path, index=False)

Total unique actors collected: 3131


In [None]:
# Getting birthday and deathday for the actors
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

actor_ids = df_actors["actor_id"].tolist()
actors_bd_list = []

for actor_id in actor_ids:
    url = f"https://api.themoviedb.org/3/person/{actor_id}"
    resp = requests.get(url, headers=headers).json()

    birthday = resp.get("birthday", None)
    deathday = resp.get("deathday", None)

    actors_bd_list.append({
        "actor_id": actor_id,
        "birthday": birthday,
        "deathday": deathday
    })

    sleep(0.02)  # Avoid hitting the rate limit

df_actorsbd = pd.DataFrame(actors_bd_list)

# Merge with actor names for readability
df_actorsbd = df_actorsbd.merge(df_actors[["actor_id", "actor_name"]], on="actor_id")

print(f"Total actors with birthday/deathday info: {len(df_actorsbd)}")
output_folder = os.path.join(os.getcwd(), "movie_data")
output_path = os.path.join(movie_data, "danish_actors_bd_dd_2024.csv")
df_actorsbd.to_csv(output_path, index=False)