# read_tmdb_data.ipynb

This notebook reads JSON data from TMDB's API and stores it as CSV files:
- **movies.csv**: Data on XXXX movies from the United States released in 2000–2023
- **credits.csv**: Data on credits for all persons in these movies (both cast and crew).
- **persons.csv**: Data on all actors that appear in these movies.

In [None]:
import os
import requests
from dotenv import load_dotenv
from time import sleep
import pandas as pd

# Load variables from a local .env file (if any) into the process environment,
# then read the TMDB bearer token from there so it never lives in the notebook.
load_dotenv()
tmdb_api_token = os.getenv("TMDB_API_TOKEN")

# Fail fast: os.getenv returns None for an unset variable, which would be
# formatted into the header below as the literal string "Bearer None" and only
# surface later as a failed request in one of the download cells.
if not tmdb_api_token:
    raise RuntimeError(
        "TMDB_API_TOKEN is not set. Define it in the environment or in a .env file."
    )

# Headers sent with every TMDB request made in this notebook.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_api_token}"
}

First we request the movie_ids of all movies that meet these criteria:
- From the United States
- From the years 2000-2023
- With original_language = English
- With TMDB vote count ≥ 1000

In [None]:
# Query string for TMDB's /discover/movie endpoint; the page number is appended
# per request in the loop below.
# NOTE(review): the notebook text lists "vote count ≥ 1000" as a requirement,
# but the filter actually sent here is vote_count.gte=30000 — confirm which
# threshold is intended before relying on the resulting dataset.
base_url = (
    "https://api.themoviedb.org/3/discover/movie"
    "?include_adult=false"
    "&include_video=false"
    "&language=en-US"
    "&with_original_language=en"
    "&with_origin_country=US"
    "&vote_count.gte=30000"
    "&primary_release_date.gte=2000-01-01"
    "&primary_release_date.lte=2023-12-31"
    "&sort_by=primary_release_date.asc"
)

movie_ids = []

# Walk the result pages 1..500 (adjust the upper bound if needed) and stop
# early as soon as a page comes back without results.
for page in range(1, 501):
    url = f"{base_url}&page={page}"
    # requests applies no timeout by default, so a stalled connection would
    # block this cell forever; give up after 30 seconds instead.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    # "results" holds the movies of this page; missing or empty means we are past the last page.
    movie_results = response.json().get("results")
    if not movie_results:
        break

    # Keep only the ids; the full movie records are requested in a later cell.
    movie_ids.extend(movie["id"] for movie in movie_results)

    sleep(0.02)  # Small pause between requests to stay gentle on the API

print(f"Number of movie ids found: {len(movie_ids)}")

Then we request movie data for all these movie_ids, and store that in a dataframe and a CSV file

In [None]:
# Make sure the output folder exists before the (long) download starts, so the
# final to_csv call cannot fail on a missing directory after all requests are done.
os.makedirs("../movie_data", exist_ok=True)

all_movies = []

# Request the detail record (with credits appended) for every collected movie id.
for movie_id in movie_ids:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US&append_to_response=credits"
    # requests applies no timeout by default; give up after 30 seconds rather than hang.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    movie = response.json()

    # Take the nested structures out of the record; they are replaced below by
    # flat lists of ids/codes so each movie fits in one table row.
    genres = movie.pop("genres")
    production_companies = movie.pop("production_companies")
    collection = movie.pop("belongs_to_collection")
    movie_credits = movie.pop("credits")

    movie["genre_ids"] = [genre["id"] for genre in genres]
    movie["spoken_languages"] = [language["iso_639_1"] for language in movie["spoken_languages"]]
    movie["production_company_ids"] = [company["id"] for company in production_companies]
    movie["production_countries"] = [country["iso_3166_1"] for country in movie["production_countries"]]
    # A movie that is not part of a collection has a falsy value here.
    movie["collection_id"] = collection["id"] if collection else pd.NA
    movie["cast_person_ids"] = [cast_member["id"] for cast_member in movie_credits["cast"]]
    movie["cast_credit_ids"] = [cast_member["credit_id"] for cast_member in movie_credits["cast"]]
    movie["crew_person_ids"] = [crew_member["id"] for crew_member in movie_credits["crew"]]
    movie["crew_credit_ids"] = [crew_member["credit_id"] for crew_member in movie_credits["crew"]]

    all_movies.append(movie)
    sleep(0.02)  # Small pause between requests to stay gentle on the API

# Build the frame once from the list of records.
df_movies = pd.DataFrame(all_movies)
print(f"Total movies collected: {len(df_movies)}")

df_movies.to_csv("../movie_data/movies.csv", index=False)

In [None]:
# Show the collected movies table as this cell's output.
df_movies

In [None]:
# Print a summary of the movies frame to check what was collected.
df_movies.info()

Then we run through the movie dataframe and find all unique actor_ids.

And then we request data for all these actor_ids, and store that in a dataframe and a CSV file

In [None]:
# Only cast members are collected here. An earlier variant that also included
# the crew is kept for reference:
# all_person_ids = df_movies['cast_person_ids'].explode().tolist() + df_movies['crew_person_ids'].explode().tolist()

# Flatten the per-movie cast lists into one id per row. explode() turns an
# empty list into NaN, so a movie without cast would put NaN into the id set
# and the person download would then request ".../person/nan". Drop those rows.
all_person_ids = df_movies['cast_person_ids'].explode().dropna().tolist()

# Get unique person IDs
unique_person_ids = set(all_person_ids)
len(unique_person_ids)

In [7]:
# Make sure the output folder exists before the (long) download starts.
os.makedirs("../movie_data", exist_ok=True)

all_persons = []

# Request the person record for every unique cast member found in the movies.
for person_id in unique_person_ids:
    url = f"https://api.themoviedb.org/3/person/{person_id}"
    # requests applies no timeout by default; give up after 30 seconds rather than hang.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code} Text: {response.text}")

    all_persons.append(response.json())
    sleep(0.02)  # Small pause between requests to stay gentle on the API

# Build the frame once from the list of records.
df_persons = pd.DataFrame(all_persons)

# The notebook introduction lists persons.csv as an output, so store the result
# next to movies.csv.
df_persons.to_csv("../movie_data/persons.csv", index=False)

df_persons


KeyboardInterrupt: 

In [None]:
# NOTE(review): disabled leftover code. `danish_actors` is not defined anywhere
# in the visible notebook, so these lines cannot run as written — confirm
# whether this cell is still needed.
# df_persons.rename(columns={'id': 'actor_id'}, inplace=True)
# danish_actors = pd.merge(danish_actors, df_persons, on='actor_id')
# danish_actors

In [None]:
# NOTE(review): disabled leftover code. `danish_actors` is not defined anywhere
# in the visible notebook — confirm whether this export is still needed.
# danish_actors.to_csv("../movie_data/danish_actors.csv", index=False)