In [13]:
import requests
import pandas as pd
from time import sleep
import os

# --- Configuration -----------------------------------------------------------
# The TMDB API key is read from the environment only. Credentials must never be
# written into the notebook: the file (and its outputs) get shared and committed.
# NOTE(review): an earlier revision of this cell embedded a literal key as a
# fallback; that key should be treated as leaked and revoked in the TMDB account.
api_key = os.getenv("TMDB_API_KEY")
if not api_key:
    raise RuntimeError(
        "TMDB_API_KEY is not set. Export it in the environment that starts "
        "the kernel before running this cell."
    )

# Base TMDB URL
base_url = "https://api.themoviedb.org/3"

FIRST_PAGE = 1
LAST_PAGE = 2          # inclusive; 2 pages for quick testing, raise to 200 for the full pull
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled connection blocks the cell forever
PAGE_PAUSE = 0.3       # seconds to wait between discover pages


def _get_json(session, url, params):
    """Perform a GET request and return the decoded JSON body, or None on failure.

    Parameters
    ----------
    session : requests.Session
        Session used to send the request (reuses the underlying connection).
    url : str
        Absolute endpoint URL.
    params : dict
        Query-string parameters, including the API key.

    Returns
    -------
    dict or None
        The parsed JSON document for a 200 response; None when the request
        raised, returned a non-200 status, or the body was not valid JSON.
        A short diagnostic is printed in each failure case.
    """
    try:
        response = session.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Print only the exception type: the exception text can contain the
        # full request URL, whose query string carries the API key.
        print(f"Request to {url} failed — {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Request to {url} failed — Status: {response.status_code}")
        # Use .text rather than .json(): an error body is not guaranteed to be JSON.
        print(response.text)
        return None

    try:
        return response.json()
    except ValueError:
        print(f"Request to {url} returned a body that is not valid JSON")
        return None


def _movie_record(details):
    """Flatten one movie-details payload (with embedded credits) into a row dict.

    Parameters
    ----------
    details : dict
        Response of ``/movie/{id}`` requested with ``append_to_response=credits``,
        so the cast/crew lists are expected under the ``"credits"`` key.

    Returns
    -------
    dict
        One record with the columns used by ``df_movies``. Missing fields
        become None (or an empty list for the list-valued columns).
    """
    credits = details.get("credits") or {}
    crew = credits.get("crew") or []
    cast = credits.get("cast") or []

    # First crew member whose job is exactly "Director"; .get avoids a KeyError
    # on crew entries that lack a "job" or "name" field.
    director = next(
        (member.get("name") for member in crew if member.get("job") == "Director"),
        None,
    )
    # The first cast entry is taken as the lead actor.
    lead_actor = cast[0].get("name") if cast else None

    return {
        "title": details.get("title"),
        "release_date": details.get("release_date"),
        "budget": details.get("budget"),
        "revenue": details.get("revenue"),
        "runtime": details.get("runtime"),
        "genres": [g["name"] for g in details.get("genres") or []],
        "popularity": details.get("popularity"),
        "vote_average": details.get("vote_average"),
        "vote_count": details.get("vote_count"),
        "original_language": details.get("original_language"),
        "production_companies": [
            pc["name"] for pc in details.get("production_companies") or []
        ],
        "belongs_to_collection": details.get("belongs_to_collection") is not None,
        "director": director,
        "lead_actor": lead_actor,
    }


# --- Collection --------------------------------------------------------------
# Storage for results
movie_data = []

# One Session for the whole run so the HTTPS connection is reused across the
# many per-movie requests.
with requests.Session() as session:
    # Loop through TMDB's /discover/movie endpoint
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        discover = _get_json(
            session,
            f"{base_url}/discover/movie",
            {
                "api_key": api_key,
                "language": "en-US",
                "sort_by": "popularity.desc",
                # Sent as lowercase strings: requests would serialize a Python
                # False as "False", while the API's documented form is "false".
                "include_adult": "false",
                "include_video": "false",
                "page": page,
                "primary_release_date.gte": "2000-01-01",
                "primary_release_date.lte": "2024-12-31",
            },
        )
        if discover is None:
            print(f"Failed on page {page} — stopping early")
            break

        for movie in discover.get("results", []):
            # Details and credits are fetched in a single request via
            # append_to_response, halving the number of API calls per movie.
            details = _get_json(
                session,
                f"{base_url}/movie/{movie['id']}",
                {"api_key": api_key, "append_to_response": "credits"},
            )
            if details is None:
                # Skip instead of appending a row of None values built from an
                # error payload.
                continue
            movie_data.append(_movie_record(details))

        print(f"Page {page} complete — movies collected: {len(movie_data)}")
        sleep(PAGE_PAUSE)

# Convert to DataFrame
df_movies = pd.DataFrame(movie_data)

# Preview first few rows (last expression of the cell, so it is rendered)
df_movies.head()



Page 1 complete — movies collected: 20
Page 2 complete — movies collected: 40


Unnamed: 0,title,release_date,budget,revenue,runtime,genres,popularity,vote_average,vote_count,original_language,production_companies,belongs_to_collection,director,lead_actor
0,The Great Escape,2023-05-26,200000,0,90,"[Action, Thriller]",384.1793,6.2,2,pt,[],False,,
1,Bambi: A Life in the Woods,2024-10-16,0,1419154,77,"[Adventure, Family, Documentary]",191.5043,5.905,21,fr,"[MC4, Gébéka Films, Kinology]",False,Michel Fessler,Mylène Farmer
2,The Haunting at Saint Joseph's,2023-02-26,0,0,100,"[Horror, Thriller]",166.1326,4.6,11,en,[],False,Jon Cohen,Tim Spriggs
3,Brave Citizen,2023-10-25,0,2116112,112,"[Action, Drama, Comedy]",154.2694,7.1,38,ko,"[Studio N, Oscar 10 Studio, Vol Media]",False,Park Jin-pyo,Shin Hye-sun
4,Conjuring the Cult,2024-10-01,0,0,93,"[Horror, Drama]",134.1085,5.5,23,en,[7th Street Productions],False,Calvin Morie McCarthy,Neil Green
