Get movie_ids for all movies with original_language = Danish (`da`) and a primary release date between 2020-01-01 and 2024-12-31

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

# Read the TMDB bearer token from the environment (load_dotenv also picks it up
# from a local .env file if one exists).
load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")
if not tmdb_api_token:
    # Fail early with a clear message instead of sending "Bearer None" and
    # getting an opaque authorization error back from the API.
    raise RuntimeError("TMDB_API_TOKEN is not set; add it to the environment or a .env file")

# Discover query: Danish-language movies with a primary release date in
# 2020-2024, oldest first. The page number is appended per request below.
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&sort_by=primary_release_date.asc"
    "&with_original_language=da"
    "&primary_release_date.gte=2020-01-01"
    "&primary_release_date.lte=2024-12-31"
)

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

# Upper bound on the number of pages requested.
# NOTE(review): 500 is believed to match the API's own page cap - confirm.
MAX_PAGES = 500
# Without a timeout a stalled connection would block the cell forever.
REQUEST_TIMEOUT_SECONDS = 10

movie_ids = []

for page in range(1, MAX_PAGES + 1):
    url = f"{base_url}&page={page}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    payload = response.json()
    movie_results = payload.get("results")

    # An empty (or missing) result list means we are past the last page.
    if not movie_results:
        break

    movie_ids.extend(movie["id"] for movie in movie_results)

    # When the response reports the page count, stop on the last page rather
    # than spending one more request just to receive an empty result list.
    total_pages = payload.get("total_pages")
    if isinstance(total_pages, int) and page >= total_pages:
        break

    sleep(0.02)  # Small pause between requests to stay clear of rate limits

# The same id can show up on two pages if results shift between requests;
# drop repeats (keeping first-seen order) so later steps do not fetch or
# store a movie twice.
movie_ids = list(dict.fromkeys(movie_ids))

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 931


Get movie data for all these movie_ids and store it in a DataFrame and a CSV file

In [2]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

# Read the TMDB bearer token again so this cell does not depend on the
# variables set up by the discover cell (only on `movie_ids`).
load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")
if not tmdb_api_token:
    # Fail early with a clear message instead of sending "Bearer None".
    raise RuntimeError("TMDB_API_TOKEN is not set; add it to the environment or a .env file")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

# Without a timeout a stalled connection would block the cell forever.
REQUEST_TIMEOUT_SECONDS = 10


def _simplify_movie(movie):
    """Flatten the nested parts of a movie-details payload into lists of ids.

    The dict is modified in place and also returned. Nested ``genres``,
    ``production_companies``, ``belongs_to_collection`` and ``credits``
    entries are removed and replaced by ``genre_ids``,
    ``production_company_ids``, ``collection_id`` and the four
    ``cast_*``/``crew_*`` id lists; ``spoken_languages`` and
    ``production_countries`` are replaced by lists of their codes.

    Raises:
        KeyError: if an expected key (other than ``belongs_to_collection``)
            is missing from the payload.
    """
    genres = movie.pop("genres")
    companies = movie.pop("production_companies")
    # Tolerate a missing key as well as an explicit null.
    collection = movie.pop("belongs_to_collection", None)
    movie_credits = movie.pop("credits")
    cast = movie_credits["cast"]
    crew = movie_credits["crew"]

    # New keys are added in a fixed order so the DataFrame columns stay stable.
    movie["genre_ids"] = [genre["id"] for genre in genres]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in companies]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    movie["collection_id"] = collection.get("id") if collection else None
    movie["cast_person_ids"] = [member["id"] for member in cast]
    movie["cast_credit_ids"] = [member["credit_id"] for member in cast]
    movie["crew_person_ids"] = [member["id"] for member in crew]
    movie["crew_credit_ids"] = [member["credit_id"] for member in crew]
    return movie


all_movies = []

# Request the details (including credits) of every discovered movie.
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    all_movies.append(_simplify_movie(response.json()))
    sleep(0.02)  # Small pause between requests to stay clear of rate limits

df_movies = pd.DataFrame(all_movies)
print(f"Total movies collected: {len(df_movies)}")

output_folder = os.path.join(os.getcwd(), "movie_data")
# Create the target folder on first run; to_csv does not create directories.
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "alt.csv")
df_movies.to_csv(output_path, index=False)

Total movies collected: 931


In [3]:
# Display the collected movie table as the cell's rich output.
df_movies

Unnamed: 0,adult,backdrop_path,budget,homepage,id,imdb_id,origin_country,original_language,original_title,overview,...,video,vote_average,vote_count,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,,0,,1389683,,[DK],da,Den gamle mand og skoven,,...,False,0.0,0,"[99, 10770]",[119],,[],[],"[1114991, 1114991, 1114991, 5066576, 5066578]","[673b21f4f741eb0428b62a1e, 673b2207f741eb0428b..."
1,False,,0,,1275299,,[DK],da,"Badabing og Bang - Hurra, årtiet er slut!",,...,False,0.0,0,[99],[119],,[91568],[661ea6efd18fb90131c9c55c],[],[]
2,False,,0,,1176704,tt26752729,[DK],da,Stime,"During the summer, the relationship between tw...",...,False,0.0,0,[18],[166089],,"[3018873, 2120409, 2974764, 4265498, 4265499, ...","[64ff095fffc9de0eded38e34, 64ff096cffc9de0edf6...","[2653084, 2653084, 2653084, 208175, 135949]","[64ff09aaffc9de0ee3c53cfc, 64ff09b02dffd800e3d..."
3,False,/3tdNtSbcMliHlWnxt5fVuiL6QmO.jpg,0,,660040,,[DK],da,Grin til gavn 2019,"In keeping with tradition, the popular comedy ...",...,False,6.0,1,[35],[20632],,"[88359, 88355, 1949847, 2040551, 2208065, 1556...","[5e16d23e11386c0015c92dd1, 5e16d2470cb33500170...",[],[]
4,False,/w2T6QLixu8X6C5FlfpKDglIFcJ4.jpg,0,,661167,,[DK],da,Et langsomt mord,,...,False,0.0,0,[99],[639],,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926,False,/MSP35R6y0ypmf3zU1kyUw1P4It.jpg,0,,1407803,,[DK],da,Verdensmænd - Bobos surprise,,...,False,10.0,1,[35],[758],,"[1828049, 234058, 1355004, 88356, 2208065]","[676d44614ca2fbbfca614283, 676d44784ca2fbbfca6...",[],[]
927,False,/2iVVzCEnX4QMEeyd3e52juYcsH8.jpg,0,,1410638,,[DK],da,Gud bevare Danmark,,...,False,0.0,0,[],[],,[],[],[],[]
928,False,/ej5uQVMJTzq8nZBCbf6pUMNF6jl.jpg,0,,1409581,,[DK],da,Frank Hvam - Nobody,Frank Hvam has lived in New Zealand with his f...,...,False,6.0,1,[35],[758],,[89973],[67724668d8dc22e6c6927586],"[89973, 5520795, 563909, 5227452, 4263643, 516...","[68650819af3c4b2566414b34, 686508769febd451ea8..."
929,False,,0,,1412034,,[DK],da,"Dan Andersen - Nedsat Hørelse, Nedsat Sædkvali...",,...,False,0.0,0,[35],[],,[149860],[677903662b097b15a274ab1e],[],[]


Run through the movie DataFrame and create a list of unique actor_ids

In [4]:
# One row per (movie, cast member); movies with an empty cast list become NaN.
cast_ids_flat = df_movies["cast_person_ids"].explode()
# Drop the NaN placeholders, then keep each person id once, in first-seen order.
actor_ids = cast_ids_flat.dropna().unique().tolist()

In [5]:
# Inspect the list of unique actor ids built in the previous cell.
actor_ids

[91568,
 3018873,
 2120409,
 2974764,
 4265498,
 4265499,
 4265500,
 88359,
 88355,
 1949847,
 2040551,
 2208065,
 1556815,
 2504733,
 1421812,
 1949848,
 2406545,
 1355004,
 1883181,
 1079967,
 2110194,
 83257,
 234058,
 2328186,
 1682477,
 2712971,
 2190163,
 2712972,
 2244,
 6120,
 1167832,
 1149906,
 1518800,
 2647874,
 1180028,
 1150549,
 19357,
 2036660,
 112733,
 225512,
 1815522,
 1259903,
 1016025,
 79260,
 231318,
 1268611,
 1444673,
 137444,
 1030328,
 1430283,
 1443045,
 4459,
 88145,
 88544,
 1445003,
 1944591,
 1111176,
 115214,
 297106,
 2642049,
 2647879,
 1074440,
 2065828,
 1025545,
 565155,
 1954416,
 1139985,
 1424989,
 2647881,
 1691420,
 2647882,
 2302835,
 1435848,
 2647883,
 2647884,
 582347,
 1122833,
 2647885,
 1232923,
 2647886,
 2647888,
 2507250,
 226190,
 1733092,
 2190857,
 2647889,
 2647890,
 2647892,
 93108,
 1034328,
 1123821,
 1150553,
 6122,
 2713066,
 2639347,
 2713067,
 2713068,
 2713069,
 2713070,
 2713071,
 42095,
 89973,
 42094,
 1444987,
 25951

In [10]:
import pandas as pd

# Build a long table with one row per (movie, cast member) pair:
# take the movie id and its cast list, give each cast member its own row,
# and rename the columns for clarity.
df_exploded = (
    df_movies.loc[:, ["id", "cast_person_ids"]]
    .explode("cast_person_ids")
    .rename(columns={"id": "movie_ids", "cast_person_ids": "actor_id"})
)

print(df_exploded)

# Collapse the long table to one row per actor, gathering the ids of every
# movie that actor appears in into a set.
movies_by_actor = df_exploded.groupby("actor_id")["movie_ids"]
actor_movies = movies_by_actor.agg(set).reset_index()

print(actor_movies.head())

     movie_ids actor_id
0      1389683      NaN
1      1275299    91568
2      1176704  3018873
2      1176704  2120409
2      1176704  2974764
..         ...      ...
926    1407803  2208065
927    1410638      NaN
928    1409581    89973
929    1412034   149860
930    1410006  2703160

[5029 rows x 2 columns]
   actor_id                          movie_ids
0        42                  {752908, 1054150}
1       935                          {1061064}
2      1011                           {752908}
3      1012                   {752908, 752910}
4      1017  {1029880, 655297, 990691, 752908}


In [7]:
# Display the per-actor table of movie-id sets.
# NOTE(review): the saved output of this cell has `movie_id` and `num_movies`
# columns sorted by count, while the grouping cell in this notebook only
# produces `actor_id` and `movie_ids`; the execution counts are also out of
# order (In [10] appears before In [7]). Restart the kernel and run all cells
# to confirm what this actually displays.
actor_movies

Unnamed: 0,actor_id,movie_id,num_movies
384,588175,"{995169, 1079201, 990691, 918628, 1125607, 580...",14
50,6120,"{1079202, 1119544, 1054150, 750223, 1124080, 1...",13
704,1442353,"{1299009, 991266, 733379, 1375365, 1035422, 91...",12
583,1335546,"{918627, 902407, 1178663, 1078249, 741105, 901...",12
537,1208770,"{655297, 980026, 1292039, 1053927, 711017, 116...",10
...,...,...,...
3109,5629763,{1530968},1
3108,5629762,{1530968},1
3107,5629760,{1530968},1
3106,5629758,{1530968},1
