Get movie_ids for all movies with original_language = danish

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&sort_by=primary_release_date.asc"
    "&with_original_language=da"
    "&primary_release_date.gte=1800-01-01"
    "&primary_release_date.lte=2024-12-31"
)

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

movie_ids = []

# Loop through all pages (adjust max page if needed)
for page in range(1, 501):
    url = f"{base_url}&page={page}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract movie_results from the response
    movie_results = response.json().get("results") 
    
    # Stop if we have reached the last page
    if not movie_results:
        break

    # Extract movie_ids from the movie_results
    movie_ids.extend([movie["id"] for movie in movie_results])
    
    sleep(0.02)  # Just to be sure we don't request too many requests

print(f"Number of movie ids found: {len(movie_ids)}")

Number of movie ids found: 5134


Get movie data for all these movie_ids and store in dataframe and CSV file

In [2]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

all_movies = []

# Loop through all movie_ids and request movie data
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract the movie data from the response
    movie = response.json()

    # Simplify dictionaries to list of ids
    movie["genre_ids"] = [genre["id"] for genre in movie["genres"]]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in movie["production_companies"]]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    movie["collection_id"] = movie.get("belongs_to_collection").get("id") if movie["belongs_to_collection"] else None
    movie["cast_person_ids"] = [cast_member["id"] for cast_member in movie["credits"]["cast"]]
    movie["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in movie["credits"]["cast"]]
    movie["crew_person_ids"] = [crew_member["id"] for crew_member in movie["credits"]["crew"]]
    movie["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in movie["credits"]["crew"]]
    del movie['genres']
    del movie['production_companies']
    del movie["belongs_to_collection"]   
    del movie["credits"]
    
    all_movies.append(movie)
    sleep(0.02)  # Just to be sure we don't request too many requests

df_movies = pd.DataFrame(all_movies)
print(f"Total movies collected: {len(df_movies)}")

output_folder = os.path.join(os.getcwd(), "movie_data")
output_path = os.path.join(output_folder, "danish_movies.csv")
df_movies.to_csv(output_path, index=False)

Total movies collected: 5134


In [3]:
df_movies

Unnamed: 0,adult,backdrop_path,budget,homepage,id,imdb_id,origin_country,original_language,original_title,overview,...,video,vote_average,vote_count,genre_ids,production_company_ids,collection_id,cast_person_ids,cast_credit_ids,crew_person_ids,crew_credit_ids
0,False,,0,,195139,tt0290709,[DK],da,Kørsel med Grønlandske hunde,"Johan Carl Joensen, a Danish colonial manager ...",...,False,5.3,15,[99],[41774],,[2452695],[5dc5d410470ead001391303b],"[1171313, 1171313]","[52fe4d069251416c91108121, 5dc5d3ef470ead00159..."
1,False,,0,,232636,tt0348157,[DK],da,Svanerne i Sortedamssøen,Swans getting handled as people watch.,...,False,3.0,2,[99],[41774],,[],[],[1171313],[52fe4e07c3a36847f827a04d]
2,False,,0,,195364,tt0346587,[DK],da,Bech Olsens og Poul Pons' brydekamp,Early wrestling footage.,...,False,4.0,5,[99],[41774],,"[4658231, 4658234]","[661fe77220af77017d3f88a8, 661fe78a6d9fe8017d6...",[1171313],[52fe4d0d9251416c91108ee5]
3,False,,0,,232637,tt1469330,[DK],da,Badescener fra Skovshoved,Fun at the beach.,...,False,4.5,4,[99],[41774],,[],[],[1171313],[52fe4e07c3a36847f827a057]
4,False,,0,,195305,tt0347357,[DK],da,De Kongelige skal fotograferes,"Filmed while the Christian IX, King of Denmark...",...,False,4.8,4,[99],[],,[],[],[1171313],[52fe4d0b9251416c91108c3d]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5129,False,/MSP35R6y0ypmf3zU1kyUw1P4It.jpg,0,,1407803,,[DK],da,Verdensmænd - Bobos surprise,,...,False,10.0,1,[35],[758],,"[1828049, 234058, 1355004, 88356, 2208065]","[676d44614ca2fbbfca614283, 676d44784ca2fbbfca6...",[],[]
5130,False,/2iVVzCEnX4QMEeyd3e52juYcsH8.jpg,0,,1410638,,[DK],da,Gud bevare Danmark,,...,False,0.0,0,[],[],,[],[],[],[]
5131,False,/ej5uQVMJTzq8nZBCbf6pUMNF6jl.jpg,0,,1409581,,[DK],da,Frank Hvam - Nobody,Frank Hvam has lived in New Zealand with his f...,...,False,6.0,1,[35],[758],,[89973],[67724668d8dc22e6c6927586],"[89973, 5520795, 563909, 5227452, 4263643, 516...","[68650819af3c4b2566414b34, 686508769febd451ea8..."
5132,False,,0,,1412034,,[DK],da,"Dan Andersen - Nedsat Hørelse, Nedsat Sædkvali...",,...,False,0.0,0,[35],[],,[149860],[677903662b097b15a274ab1e],[],[]


Run through the movie dataframe and create list of unique actor_ids and the movie_ids they appear in

In [4]:
# Step 1: Explode the cast_person_ids list so each actor appears in a separate row
df_exploded = df_movies[['id', 'cast_person_ids']].explode('cast_person_ids')
df_exploded

Unnamed: 0,id,cast_person_ids
0,195139,2452695
1,232636,
2,195364,4658231
2,195364,4658234
3,232637,
...,...,...
5129,1407803,2208065
5130,1410638,
5131,1409581,89973
5132,1412034,149860


In [5]:
# Step 2: Rename for clarity
df_exploded = df_exploded.rename(columns={'id': 'movie_ids', 'cast_person_ids': 'actor_id'})
df_exploded

Unnamed: 0,movie_ids,actor_id
0,195139,2452695
1,232636,
2,195364,4658231
2,195364,4658234
3,232637,
...,...,...
5129,1407803,2208065
5130,1410638,
5131,1409581,89973
5132,1412034,149860


In [6]:
# Step 3: Group by actor_id and aggregate the movie_ids into a set
danish_actors = df_exploded.groupby('actor_id')['movie_ids'].agg(set).reset_index()
danish_actors

Unnamed: 0,actor_id,movie_ids
0,42,"{658817, 456325, 29445, 1354636, 752908, 14460..."
1,53,{15843}
2,169,{8883}
3,378,{266285}
4,380,{214137}
...,...,...
15597,5706360,{1154066}
15598,5706385,{1154066}
15599,5706390,{1154066}
15600,5706467,{1019835}


In [7]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from time import sleep

load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

all_persons = []

for actor_id in danish_actors['actor_id']:
    print(actor_id)
    url = f"https://api.themoviedb.org/3/person/{actor_id}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")
    
    # Extract the person data from the response
    person = response.json()
    
    all_persons.append(person)
    sleep(0.02)  # Just to be sure we don't request too many requests
    
    
df_persons = pd.DataFrame(all_persons)


42
53
169
378
380
517
585
591
677
782
921
935
955
1004
1011
1012
1017
1018
1019
1021
1023
1024
1025
1026
1083
1118
1174
1175
1176
1177
1178
1181
1182
1183
1184
1185
1186
1204
1269
1294
1356
1558
1559
1562
1563
1564
1565
1566
1567
1568
1569
1570
1573
1574
1626
1634
1637
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1668
1671
1821
1833
1834
1835
1836
1837
1839
1840
1841
1842
1892
2047
2201
2227
2244
2268
2272
2310
2559
2729
2765
2838
3150
3395
3396
3397
3398
3399
3401
3402
3407
3408
3409
3410
3411
3412
3509
3776
3846
3847
3848
3853
3855
3857
3880
3881
3882
3883
3884
3885
3887
3888
3889
3890
3891
3913
3914
3915
3916
3917
3918
3919
3920
3921
4002
4109
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4467
4468
4469
4470
4471
4480
4481
4482
4637
4654
4812
5049
5274
5293
5985
6001
6002
6004
6120
6121
6122
6123
6124
6125
6126
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6140
6141
6142
6143
6144
6145
6146
6154
6155
6273
6283
6285
6288
6291
6333
6649
6657


In [8]:
danish_actors.shape

(15602, 2)

In [9]:
df_persons.shape

(15602, 14)

In [10]:
df_persons.rename(columns={'id': 'actor_id'}, inplace=True)
danish_actors = pd.merge(danish_actors, df_persons, on='actor_id')
danish_actors

Unnamed: 0,actor_id,movie_ids,adult,also_known_as,biography,birthday,deathday,gender,homepage,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,42,"{658817, 456325, 29445, 1354636, 752908, 14460...",False,"[Ларс фон Триер, 拉斯·馮·提爾, ラース・フォン・トリアー, لارس ف...",Lars von Trier (born Lars Trier; 30 April 1956...,1956-04-30,,2,,nm0001885,Directing,Lars von Trier,"Kongens Lyngby, Danmark",0.9635,/a1zVySI0Yqqf3ORegQ1icMfvhAa.jpg
1,53,{15843},False,"[Rolf Peter Ingvar Storm, پیتر استورماره, پِتِ...","Peter Stormare (August 27, 1953) was born in A...",1953-08-27,,2,,nm0001780,Acting,Peter Stormare,"Arbrå, Gävleborgs län, Sweden",5.0311,/1rtpuUqBV29jDc1huUhtjGDbEwn.jpg
2,169,{8883},False,[],Hanns Zischler (born 18 June 1947) is a German...,1947-06-18,,2,,nm0957193,Acting,Hanns Zischler,"Nuremberg, Bavaria, Germany",0.2330,/5O4Dum1OuKI4RMB24xjACkL22iA.jpg
3,378,{266285},False,"[Джонатан Прайс, 조너선 프라이스, 乔纳森·普雷斯, جاناتان پر...","Jonathan Pryce, CBE (born 1 June 1947) is a We...",1947-06-01,,2,,nm0000596,Acting,Jonathan Pryce,"Carmel, Flintshire, Wales, UK",2.1896,/zwSv5uXzPTtmitFe39UdqnVwmdL.jpg
4,380,{214137},False,"[Robert DeNiro, Robert Anthony De Niro Jr. , ...","Robert Anthony De Niro (born August 17, 1943) ...",1943-08-17,,2,,nm0000134,Acting,Robert De Niro,"Greenwich Village, New York City, New York, USA",3.2940,/cT8htcckIuyI1Lqwt1CvD02ynTh.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15597,5706360,{1154066},False,[],,,,0,,,Acting,Thea Esther Saniel Thomsen,,0.0000,
15598,5706385,{1154066},False,[],,,,0,,,Acting,Morten Jay Jakobsen,,0.0000,
15599,5706390,{1154066},False,[],,,,0,,,Acting,Frederik Rasted,,0.0000,
15600,5706467,{1019835},False,[],,,,0,,,Acting,Jytte Vikkelsøe,,0.0000,


In [11]:
output_folder = os.path.join(os.getcwd(), "movie_data")
output_path = os.path.join(output_folder, "danish_actors.csv")
danish_actors.to_csv(output_path, index=False)