# Movie Cast Data Fetcher

This notebook fetches cast information for movies from The Movie Database (TMDB) API and saves the movie-actor mappings to a CSV file.

In [1]:
# Import required libraries
import requests
import csv
import time
import os
import json
import pandas as pd
from datetime import datetime

In [2]:
# Configuration
API_BASE_URL = "https://api.themoviedb.org/3/movie/{}/credits"

# Directory holding the input JSON and the output CSV. Override it with the
# DGBMH_DATA_DIR environment variable to run the notebook on another machine;
# the default keeps the original location.
DATA_DIR = os.environ.get("DGBMH_DATA_DIR", "/Users/owen/src/Personal/dgbmh/data")
CSV_FILE = os.path.join(DATA_DIR, "movie_cast_mapping.csv")
MOVIE_DATA_FILE = os.path.join(DATA_DIR, "top_10000_popular_movies.json")

RATE_LIMIT_REQUESTS = 40  # 40 requests per 10 seconds
RATE_LIMIT_PERIOD = 10    # 10 seconds
SAVE_FREQUENCY = 10       # Save every 10 movies
REQUEST_TIMEOUT = 30      # 30 seconds timeout

# TMDB API key (get one from: https://www.themoviedb.org/settings/api).
# The key is read from the TMDB_API_KEY environment variable so the secret is
# never stored in the notebook or its saved outputs. When the variable is not
# set, the placeholder below is used and process_movies_for_cast() refuses to run.
# NOTE(review): a real key was previously committed in this cell; it should be
# revoked and regenerated in the TMDB account settings.
API_KEY = os.environ.get("TMDB_API_KEY", "YOUR_TMDB_API_KEY_HERE")

print("Configuration loaded successfully!")
print(f"Will process movies from: {MOVIE_DATA_FILE}")
print(f"Will save results to: {CSV_FILE}")
print(f"Rate limit: {RATE_LIMIT_REQUESTS} requests per {RATE_LIMIT_PERIOD} seconds")

Configuration loaded successfully!
Will process movies from: /Users/owen/src/Personal/dgbmh/data/top_10000_popular_movies.json
Will save results to: /Users/owen/src/Personal/dgbmh/data/movie_cast_mapping.csv
Rate limit: 40 requests per 10 seconds


In [3]:
# Load movie data
print("Loading movie data...")
# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# titles decode correctly regardless of the platform's default locale encoding.
with open(MOVIE_DATA_FILE, 'r', encoding='utf-8') as f:
    movies_data = json.load(f)

# One row per movie; the 'id' column is used later to request credits.
movies_df = pd.DataFrame(movies_data)
print(f"Loaded {len(movies_df)} movies")
print(f"Columns: {list(movies_df.columns)}")
print(f"Sample movie IDs: {movies_df['id'].head().tolist()}")

Loading movie data...
Loaded 10000 movies
Columns: ['adult', 'id', 'original_title', 'popularity', 'video']
Sample movie IDs: [755898, 911430, 1061474, 1151334, 575265]


In [4]:
def load_existing_mappings(csv_file):
    """Read previously saved (movie_id, person_id) pairs from a CSV file.

    The first line of the file is treated as a header and ignored. Rows with
    fewer than two columns are ignored; the first two columns of every other
    row are converted to ``int``.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A set of ``(movie_id, person_id)`` integer tuples; empty when the
        file does not exist.
    """
    if not os.path.exists(csv_file):
        return set()

    with open(csv_file, 'r', newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows, None)  # discard the header line, if any
        return {
            (int(fields[0]), int(fields[1]))
            for fields in rows
            if len(fields) >= 2
        }

def save_mappings_to_csv(mappings, csv_file, write_header=False):
    """Write mapping rows to a CSV file.

    Args:
        mappings: Iterable of rows (e.g. ``(movie_id, person_id)`` tuples).
        csv_file: Path of the CSV file to write.
        write_header: When true, the file is truncated and a
            ``movie_id,person_id`` header is written before the rows;
            otherwise the rows are appended to the existing file.
    """
    open_mode = 'a'
    if write_header:
        open_mode = 'w'

    with open(csv_file, open_mode, newline='', encoding='utf-8') as out:
        sink = csv.writer(out)
        if write_header:
            sink.writerow(['movie_id', 'person_id'])
        sink.writerows(mappings)

def fetch_movie_credits(movie_id, api_key):
    """Fetch the credits payload for one movie from the TMDB API.

    Args:
        movie_id: TMDB movie ID, substituted into ``API_BASE_URL``.
        api_key: TMDB API key, sent as the ``api_key`` query parameter.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        Failures are printed rather than raised so that a long batch run can
        continue with the next movie.
    """
    url = API_BASE_URL.format(movie_id)
    params = {'api_key': api_key}

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error does not derive from RequestException.
        # requests includes the full request URL (query string and therefore
        # the API key) in its error messages; redact the key so it never ends
        # up in saved notebook output.
        error_text = str(e)
        if api_key:
            error_text = error_text.replace(str(api_key), "***")
        print(f"Error fetching credits for movie {movie_id}: {error_text}")
        return None

print("Helper functions defined successfully!")

Helper functions defined successfully!


In [5]:
def process_movies_for_cast(start_index=0, max_movies=None):
    """Fetch cast credits for a slice of ``movies_df`` and append them to ``CSV_FILE``.

    Movies that already have at least one mapping in the CSV are skipped, so
    the function can be re-run to resume an interrupted job. New mappings are
    buffered in memory and appended to the CSV every ``SAVE_FREQUENCY``
    fetched movies, plus once more at the end.

    Args:
        start_index: Positional index into ``movies_df`` of the first movie
            to consider.
        max_movies: Maximum number of movies to consider starting at
            ``start_index``. ``None`` (the default) means all remaining
            movies; ``0`` means none.

    Returns:
        ``(processed_count, total_mappings)`` where ``processed_count`` is the
        number of movies a credits request was made for in this run (failed
        requests included) and ``total_mappings`` is the number of distinct
        mappings known afterwards (loaded from the CSV plus newly added).
        Returns ``None`` when ``API_KEY`` is still the placeholder value.
    """

    # Refuse to run while the API key is still the documentation placeholder
    if API_KEY == "YOUR_TMDB_API_KEY_HERE":
        print("ERROR: Please set your TMDB API key in the API_KEY variable!")
        print("Get your API key from: https://www.themoviedb.org/settings/api")
        return

    # Load existing mappings to avoid duplicates
    print("Loading existing mappings...")
    existing_mappings = load_existing_mappings(CSV_FILE)
    print(f"Found {len(existing_mappings)} existing movie-cast mappings")

    # Movie IDs that already have at least one saved mapping. Building this
    # set once makes the "already processed" check a constant-time lookup
    # instead of a scan over every saved mapping for every movie.
    processed_movie_ids = {saved_movie_id for saved_movie_id, _ in existing_mappings}

    # Initialize CSV file if it doesn't exist
    if not os.path.exists(CSV_FILE):
        save_mappings_to_csv([], CSV_FILE, write_header=True)
        print(f"Created new CSV file: {CSV_FILE}")

    # Determine movies to process. Compare against None so that an explicit
    # max_movies=0 selects no movies instead of silently selecting all of them.
    movies_to_process = movies_df.iloc[start_index:]
    if max_movies is not None:
        movies_to_process = movies_to_process.head(max_movies)

    print(f"Processing {len(movies_to_process)} movies (starting from index {start_index})...")
    print(f"Rate limit: {RATE_LIMIT_REQUESTS} requests per {RATE_LIMIT_PERIOD} seconds")
    print(f"Saving progress every {SAVE_FREQUENCY} movies")
    print(f"Starting at {datetime.now()}")

    new_mappings = []      # buffer of mappings not yet written to the CSV
    processed_count = 0    # movies a credits request was made for
    api_calls_made = 0     # requests made in the current rate-limit window
    start_time = time.time()

    for _, movie in movies_to_process.iterrows():
        movie_id = movie['id']

        # Check if we've already processed this movie
        if movie_id in processed_movie_ids:
            print(f"Skipping movie {movie_id} (already processed)")
            continue

        # Rate limiting: once the window's request budget is used up, wait
        # out the remainder of the window and then start a new one.
        if api_calls_made >= RATE_LIMIT_REQUESTS:
            elapsed = time.time() - start_time
            if elapsed < RATE_LIMIT_PERIOD:
                sleep_time = RATE_LIMIT_PERIOD - elapsed
                print(f"Rate limit reached, sleeping for {sleep_time:.1f} seconds...")
                time.sleep(sleep_time)

            # Reset counters
            api_calls_made = 0
            start_time = time.time()

        # Fetch movie credits
        print(f"Processing movie {movie_id} ({processed_count + 1}/{len(movies_to_process)})...")
        credits_data = fetch_movie_credits(movie_id, API_KEY)
        api_calls_made += 1

        if credits_data and 'cast' in credits_data:
            # Extract cast member IDs
            movie_mappings = []
            for cast_member in credits_data['cast']:
                person_id = cast_member.get('id')
                if person_id:
                    mapping = (movie_id, person_id)
                    # The same person can be credited more than once in a
                    # movie; keep only the first occurrence.
                    if mapping not in existing_mappings:
                        movie_mappings.append(mapping)
                        existing_mappings.add(mapping)

            if movie_mappings:
                # Skip this movie if its ID appears again later in this run
                processed_movie_ids.add(movie_id)

            new_mappings.extend(movie_mappings)
            print(f"  Added {len(movie_mappings)} new cast mappings for movie {movie_id}")
        else:
            print(f"  No cast data found for movie {movie_id}")

        processed_count += 1

        # Save progress periodically
        if processed_count % SAVE_FREQUENCY == 0 and new_mappings:
            print(f"Saving {len(new_mappings)} new mappings to CSV...")
            save_mappings_to_csv(new_mappings, CSV_FILE)
            new_mappings = []  # Clear the buffer
            print(f"Progress saved! Processed {processed_count} movies so far.")

        # Small delay between requests to be respectful
        time.sleep(0.1)

    # Save any remaining mappings
    if new_mappings:
        print(f"Saving final {len(new_mappings)} mappings to CSV...")
        save_mappings_to_csv(new_mappings, CSV_FILE)

    print(f"\nCompleted processing {processed_count} movies!")
    print(f"CSV file saved to: {CSV_FILE}")
    print(f"Total existing mappings: {len(existing_mappings)}")

    return processed_count, len(existing_mappings)

print("Main processing function defined successfully!")

Main processing function defined successfully!


## Instructions

Before running the notebook:

1. **Get a TMDB API Key**: 
   - Go to https://www.themoviedb.org/settings/api
   - Create an account if you don't have one
   - Request an API key
   - Replace `YOUR_TMDB_API_KEY_HERE` in the configuration cell above

2. **Run the cells in order** to load the functions and data

3. **Start the processing** with one of the examples below

In [6]:
# Smoke test: process only the first 5 movies to make sure everything works.
# The call below is active and runs when this cell is executed; comment it out to skip.
process_movies_for_cast(start_index=0, max_movies=5)

Loading existing mappings...
Found 0 existing movie-cast mappings
Created new CSV file: /Users/owen/src/Personal/dgbmh/data/movie_cast_mapping.csv
Processing 5 movies (starting from index 0)...
Rate limit: 40 requests per 10 seconds
Saving progress every 10 movies
Starting at 2025-09-02 17:39:58.860482
Processing movie 755898 (1/5)...
  Added 10 new cast mappings for movie 755898
Processing movie 911430 (2/5)...
  Added 10 new cast mappings for movie 755898
Processing movie 911430 (2/5)...
  Added 82 new cast mappings for movie 911430
Processing movie 1061474 (3/5)...
  Added 82 new cast mappings for movie 911430
Processing movie 1061474 (3/5)...
  Added 91 new cast mappings for movie 1061474
Processing movie 1151334 (4/5)...
  Added 91 new cast mappings for movie 1061474
Processing movie 1151334 (4/5)...
  Added 83 new cast mappings for movie 1151334
Processing movie 575265 (5/5)...
  Added 83 new cast mappings for movie 1151334
Processing movie 575265 (5/5)...
  Added 81 new cast map

(5, 347)

In [8]:
# Process the first 100 movies (movies already present in the CSV are skipped).
# The call below is active and runs when this cell is executed; comment it out to skip.
process_movies_for_cast(start_index=0, max_movies=100)

Loading existing mappings...
Found 347 existing movie-cast mappings
Processing 100 movies (starting from index 0)...
Rate limit: 40 requests per 10 seconds
Saving progress every 10 movies
Starting at 2025-09-02 17:41:43.189984
Skipping movie 755898 (already processed)
Skipping movie 911430 (already processed)
Skipping movie 1061474 (already processed)
Skipping movie 1151334 (already processed)
Skipping movie 575265 (already processed)
Processing movie 1242011 (1/100)...
  Added 20 new cast mappings for movie 1242011
Processing movie 1234821 (2/100)...
  Added 20 new cast mappings for movie 1242011
Processing movie 1234821 (2/100)...
  Added 17 new cast mappings for movie 1234821
Processing movie 1083433 (3/100)...
  Added 17 new cast mappings for movie 1234821
Processing movie 1083433 (3/100)...
  Added 28 new cast mappings for movie 1083433
Processing movie 1311031 (4/100)...
  Added 28 new cast mappings for movie 1083433
Processing movie 1311031 (4/100)...
  Added 15 new cast mapping

(95, 3211)

In [10]:
# Process ALL movies (this will take a very long time!)
# The call below is active and runs when this cell is executed; comment it out to skip.
# Movies already present in the CSV are skipped, so re-running resumes an earlier run.
process_movies_for_cast()

Loading existing mappings...
Found 229569 existing movie-cast mappings
Processing 10000 movies (starting from index 0)...
Rate limit: 40 requests per 10 seconds
Saving progress every 10 movies
Starting at 2025-09-02 20:22:08.647986
Skipping movie 755898 (already processed)
Skipping movie 911430 (already processed)
Skipping movie 1061474 (already processed)
Skipping movie 1151334 (already processed)
Skipping movie 575265 (already processed)
Skipping movie 1242011 (already processed)
Skipping movie 1234821 (already processed)
Skipping movie 1083433 (already processed)
Skipping movie 1311031 (already processed)
Skipping movie 13494 (already processed)
Skipping movie 1175942 (already processed)
Skipping movie 1367575 (already processed)
Skipping movie 1382406 (already processed)
Skipping movie 1087192 (already processed)
Skipping movie 803796 (already processed)
Skipping movie 1078605 (already processed)
Skipping movie 1022787 (already processed)
Skipping movie 1429739 (already processed)


(213, 229918)

In [7]:
# Summarize the saved movie-cast mappings
if not os.path.exists(CSV_FILE):
    print(f"No results file found at {CSV_FILE}")
else:
    # Read the mapping file back and derive per-actor / per-movie row counts
    df_results = pd.read_csv(CSV_FILE)
    movies_per_actor = df_results.groupby('person_id').size()
    actors_per_movie = df_results.groupby('movie_id').size()

    # Headline counts
    print(f"Total movie-cast mappings: {len(df_results):,}")
    print(f"Unique movies: {df_results['movie_id'].nunique():,}")
    print(f"Unique actors: {df_results['person_id'].nunique():,}")

    # Preview of the raw rows
    print("\nFirst 10 mappings:")
    print(df_results.head(10))

    # Distribution statistics
    print("\nStatistics:")
    print(f"Average actors per movie: {actors_per_movie.mean():.1f}")
    print(f"Average movies per actor: {movies_per_actor.mean():.1f}")
    print(f"Max actors in a single movie: {actors_per_movie.max()}")
    print(f"Max movies for a single actor: {movies_per_actor.max()}")

Total movie-cast mappings: 347
Unique movies: 5
Unique actors: 345

First 10 mappings:
   movie_id  person_id
0    755898       9778
1    755898      52605
2    755898       9048
3    755898    1632530
4    755898      60482
5    755898      90498
6    755898      71402
7    755898      98811
8    755898      21710
9    755898      30697

Statistics:
Average actors per movie: 69.4
Average movies per actor: 1.0
Max actors in a single movie: 91
Max movies for a single actor: 2
