In [23]:
import os
import warnings

import pandas as pd
import requests
warnings.simplefilter(action='ignore', category=FutureWarning)

_TMDB_API_ROOT = "https://api.themoviedb.org/3"

# Column order of the returned frame. Passing it to the DataFrame constructor
# keeps the schema intact even when no movie was collected.
_TMDB_COLUMNS = [
    "Title", "Year", "Genres", "Rating", "Popularity", "Runtime", "Director",
    "Writer", "Actors", "Keywords", "Plot", "Ratings Amount", "Language", "Country",
]


def _tmdb_get_json(url, params, timeout):
    """GET a TMDb endpoint and return its decoded JSON body, raising on HTTP errors."""
    response = requests.get(url, params=params, timeout=timeout)
    # Without this check an error payload (bad key, rate limit, ...) would be
    # parsed as if it were data and silently produce empty rows.
    response.raise_for_status()
    return response.json()


def _first_crew_name(crew, job):
    """Return the name of the first crew member whose job equals `job`, or "" if none."""
    return next((member.get("name") for member in crew if member.get("job") == job), "")


def get_tmdb_movies_by_year(api_key, year, min_count=30, timeout=10):
    """
    Collect English-language movies released in `year` from TMDb's Discover endpoint.

    Discover pages are walked in descending popularity order. For every movie ID
    not seen before, three further requests are made (details, keywords, credits)
    and one row is built from them. Paging stops after the first page that brings
    the total to at least `min_count` rows, when the last page reported by TMDb has
    been processed, or when a page comes back with no results. Because whole pages
    are processed, the result may contain more than `min_count` rows.

    Args:
        api_key (str): Your TMDb API key.
        year (int): The release year to filter movies by (`primary_release_year`).
        min_count (int): The minimum number of movies to collect (default 30).
        timeout (float): Per-request timeout in seconds passed to `requests.get`
            (default 10).

    Returns:
        DataFrame: One row per movie with the columns Title, Year, Genres, Rating,
        Popularity, Runtime, Director, Writer, Actors, Keywords, Plot,
        Ratings Amount, Language and Country, sorted by Rating in descending order.
        If no movie was found the frame is empty but still has these columns.

    Raises:
        requests.HTTPError: If any TMDb request returns an error status code.
    """
    auth = {"api_key": api_key}
    movie_list = []
    processed_ids = set()  # movie IDs already turned into a row
    page = 1
    total_pages = 1

    while True:
        data = _tmdb_get_json(
            f"{_TMDB_API_ROOT}/discover/movie",
            {
                "api_key": api_key,
                "sort_by": "popularity.desc",
                "page": page,
                "with_original_language": "en",
                "primary_release_year": year,
            },
            timeout,
        )

        # The page count is only read once, from the first response
        if page == 1:
            total_pages = data.get("total_pages", 1)

        movies = data.get("results", [])
        if not movies:
            break

        for movie in movies:
            movie_id = movie.get("id")
            # Skip if this movie has already been processed
            if movie_id in processed_ids:
                continue
            processed_ids.add(movie_id)

            movie_url = f"{_TMDB_API_ROOT}/movie/{movie_id}"
            details = _tmdb_get_json(movie_url, auth, timeout)
            keywords = _tmdb_get_json(f"{movie_url}/keywords", auth, timeout)
            credits = _tmdb_get_json(f"{movie_url}/credits", auth, timeout)

            # Entries without a name are dropped so that join() cannot fail on None
            keyword_names = [
                word.get("name")
                for word in keywords.get("keywords", [])
                if word.get("name")
            ]

            crew = credits.get("crew", [])
            director = _first_crew_name(crew, "Director")
            writer = _first_crew_name(crew, "Writer")

            # Keep cast members whose billing order is 0, 1 or 2. A missing
            # "order" is skipped instead of being compared to an int (TypeError).
            actors = []
            for member in credits.get("cast", []):
                order = member.get("order")
                name = member.get("name")
                if order is not None and order < 3 and name:
                    actors.append(name)

            # Extract release year, genres, language, plot, and country
            release_date = details.get("release_date", "")
            release_year = release_date.split("-")[0] if release_date else ""
            genres = ", ".join(genre["name"] for genre in details.get("genres", []))
            country = details.get("origin_country", "")
            if isinstance(country, list):
                country = ", ".join(country)

            movie_list.append({
                "Title": details.get("title"),
                "Year": release_year,
                "Genres": genres,
                "Rating": details.get("vote_average"),
                "Popularity": details.get("popularity"),
                "Runtime": details.get("runtime"),
                "Director": director,
                "Writer": writer,
                "Actors": ", ".join(actors),
                "Keywords": ", ".join(keyword_names),
                "Plot": details.get("overview", ""),
                "Ratings Amount": details.get("vote_count"),
                "Language": details.get("original_language", ""),
                "Country": country,
            })

        if len(movie_list) >= min_count or page >= total_pages:
            break

        page += 1

    # Explicit columns: an empty movie_list would otherwise yield a frame
    # without a "Rating" column and make sort_values raise KeyError.
    df = pd.DataFrame(movie_list, columns=_TMDB_COLUMNS)
    return df.sort_values("Rating", ascending=False).reset_index(drop=True)

In [None]:
# Smoke test on a single year before fetching the full range.
# The API key is read from the TMDB_API_KEY environment variable so that it is
# never stored in the notebook; a key that was committed in plain text should be
# treated as leaked and rotated.
TMDB_API_KEY = os.environ["TMDB_API_KEY"]
df_2010 = get_tmdb_movies_by_year(TMDB_API_KEY, 2010, min_count=200)
df_2010.to_csv("tmdb_2010_data.csv", index=False)
df_2010.head()


Unnamed: 0,Title,Year,Genres,Rating,Popularity,Runtime,Director,Writer,Actors,Keywords,Plot,Ratings Amount,Language,Country
0,Inception,2010,"Action, Science Fiction, Adventure",8.369,46.201,148,Christopher Nolan,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...","rescue, mission, dreams, airplane, paris, fran...","Cobb, a skilled thief who commits corporate es...",37094,en,"US, GB"
1,Shutter Island,2010,"Drama, Thriller, Mystery",8.201,42.612,138,Martin Scorsese,,"Leonardo DiCaprio, Mark Ruffalo, Ben Kingsley","island, based on novel or book, hurricane, inv...",World War II soldier-turned-U.S. Marshal Teddy...,24270,en,US
2,Senna,2010,Documentary,8.113,7.371,106,Asif Kapadia,Manish Pandey,"Ayrton Senna, Alain Prost, Frank Williams","sports car, competition, champion, biography, ...",The remarkable story of Brazilian racing drive...,1067,en,GB
3,Flipped,2010,"Romance, Drama",8.0,16.818,89,Rob Reiner,,"Madeline Carroll, Callan McAuliffe, Rebecca De...","based on novel or book, shyness, family relati...","When Juli meets Bryce in the second grade, she...",3059,en,US
4,Scooby-Doo! Camp Scare,2010,"Animation, Comedy, Family, Mystery",7.9,7.515,72,Ethan Spaulding,Scott Thomas,"Frank Welker, Mindy Cohn, Matthew Lillard",summer camp,Scooby and the gang experience outdoor fun as ...,307,en,US


In [None]:
# Fetch every release year from 2010 through 2023 (the range end is exclusive)
# and write each year's movies to its own CSV file.
for year in range(2010, 2024):
    csv_filename = f"tmdb_{year}_data.csv"
    df_year = get_tmdb_movies_by_year(TMDB_API_KEY, year, min_count=200)
    df_year.to_csv(csv_filename, index=False)
    print(f"Data for {year} saved to {csv_filename}")


Data for 2010 saved to tmdb_2010_data.csv
Data for 2011 saved to tmdb_2011_data.csv
Data for 2012 saved to tmdb_2012_data.csv
Data for 2013 saved to tmdb_2013_data.csv
Data for 2014 saved to tmdb_2014_data.csv
Data for 2015 saved to tmdb_2015_data.csv
Data for 2016 saved to tmdb_2016_data.csv
Data for 2017 saved to tmdb_2017_data.csv
Data for 2018 saved to tmdb_2018_data.csv
Data for 2019 saved to tmdb_2019_data.csv
Data for 2020 saved to tmdb_2020_data.csv
Data for 2021 saved to tmdb_2021_data.csv
Data for 2022 saved to tmdb_2022_data.csv
Data for 2023 saved to tmdb_2023_data.csv
