**Python Movie Data Collection Script Overview**

This script gathers movie data and credits from TheMovieDB API and additional metadata from the OMDB API, combining them and saving in Parquet format.

1. **Importing Libraries**: I use `requests` for HTTP requests, `backoff` for retry logic, and `pandas` for data handling.
2. **Setting Up Headers**: I configure the HTTP headers with the TheMovieDB API token for authorization; the OMDB API key is sent as a query parameter instead. The free OMDB API key allows up to 1,000 calls per day.

   After acquiring the keys, create a .env file in the root folder and populate it with the following values:

   ```
   API_KEY=
   API_TOKEN=
   OMDB_KEY=
   ```


3. **Fetching Movie Data**:

* get_latest_movie(): Fetches the latest movie from TheMovieDB API; its `id` field gives the latest movie ID.
* get_movie_by_id(id): Retrieves movie data (including credits) by ID from TheMovieDB API, returning None for non-existent movie IDs.
* get_movie_from_omdb(imdb_id, api_key): Fetches movie data from the OMDB API using the IMDb ID.
* fetch_all_movies(start_id, last_id): Loops through a range of movie IDs, collecting and merging movie and credits data from both TheMovieDB and OMDB APIs.

In [None]:
import requests
import os
import backoff
import pandas as pd
from dotenv import load_dotenv
from typing import Optional, Union, Dict, List

load_dotenv('../.env')

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {os.getenv('API_TOKEN')}"
}


# Custom retry condition to handle HTTP 429 status code (Too Many Requests)
def giveup(exc: requests.exceptions.HTTPError) -> bool:
    """Decide whether the retry loop should stop after an HTTP error.

    Only HTTP 429 (Too Many Requests) is treated as retryable; every other
    failure is handed back to the caller immediately.

    Args:
        exc (requests.exceptions.HTTPError): The exception raised during the HTTP request.

    Returns:
        bool: False when the response status code is 429 (keep retrying);
            True for any other status code, or when the exception carries
            no response at all.
    """
    response = getattr(exc, "response", None)
    # Compare against None explicitly: a requests Response is falsy for
    # every 4xx/5xx status, so a plain truthiness test would be wrong here.
    if response is None:
        # No status code to inspect, so this cannot be identified as rate
        # limiting: stop retrying and let the original error surface.
        return True
    return response.status_code != 429


@backoff.on_exception(
    backoff.expo,  # Exponential backoff for transient network failures
    (requests.exceptions.ConnectionError, requests.exceptions.Timeout),
    max_tries=5,  # Bounded, so a dead network still surfaces as an error
)
@backoff.on_exception(
    backoff.expo,  # Exponential backoff strategy for Too Many Requests error
    requests.exceptions.HTTPError,  # Exception to look for
    max_tries=10,  # Maximum retry attempts
    giveup=giveup  # Function to determine if retry should be aborted
)
def call_get(url: str, timeout: float = 30.0) -> Union[Dict, None]:
    """Makes a GET request to a specified URL and returns the decoded JSON body.

    The request is sent with the module-level ``headers``. HTTP errors are
    retried only while ``giveup`` allows it; connection errors and timeouts
    are retried a few times before being re-raised.

    Args:
        url (str): The URL to send a GET request to.
        timeout (float): Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled connection would block
            the whole collection run indefinitely.

    Returns:
        dict: The JSON response from the GET request.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error
            and the retries (if any) are exhausted.
        requests.exceptions.ConnectionError: If the connection keeps failing.
        requests.exceptions.Timeout: If the server keeps failing to answer
            within ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()


def get_latest_movie() -> Dict:
    """Return the "latest movie" record from TheMovieDB API.

    Returns:
        dict: The decoded JSON payload of the ``/3/movie/latest`` endpoint.
    """
    return call_get("https://api.themoviedb.org/3/movie/latest")


def get_movie_by_id(id: int) -> Optional[Dict]:
    """Retrieves movie data (with credits appended) by ID from TheMovieDB API.

    Args:
        id (int): The ID of the movie.

    Returns:
        dict: The movie data, or None when the API answers with HTTP 404.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error other than 404.
    """
    url = f"https://api.themoviedb.org/3/movie/{id}?language=en-US&append_to_response=credits"
    try:
        return call_get(url)
    except requests.exceptions.HTTPError as e:
        # Compare against None explicitly: a requests Response is falsy for
        # error statuses, so a truthiness test would misfire here.
        if e.response is None or e.response.status_code != 404:
            print(f"An error occurred: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        # Every 404 is reported as "not found" (None); the branches below
        # differ only in the message, so no path returns None silently.
        try:
            error_response = e.response.json()
        except ValueError:
            print(f"Received unexpected response: {e.response.text}")
            return None

        if isinstance(error_response, dict) and error_response.get('status_code') == 34:
            print(f"No movie found for ID: {id}")
        else:
            print(f"Received unexpected 404 response for ID {id}: {e.response.text}")
        return None

def get_movie_from_omdb(imdb_id: str, api_key: Optional[str] = None, timeout: float = 30.0) -> Dict:
    """Fetches movie data from the OMDB API using the IMDb ID.

    Args:
        imdb_id (str): The IMDb ID of the movie.
        api_key (str): The API key for OMDB API. When omitted, the
            ``OMDB_KEY`` environment variable is read at call time, so a key
            loaded after this function was defined is still picked up.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: The movie data from OMDB API.

    Raises:
        requests.exceptions.HTTPError: If OMDB answers with an error status.
    """
    if api_key is None:
        api_key = os.getenv('OMDB_KEY')

    # Passing the query as ``params`` lets requests URL-encode the values
    # instead of interpolating them into the URL by hand.
    response = requests.get(
        "https://www.omdbapi.com/",
        params={"i": imdb_id, "apikey": api_key},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def fetch_all_movies(start_id: int, last_id: int) -> List[Dict]:
    """Loops through a range of movie IDs, collecting and merging movie data from both APIs.

    A movie is kept only when TheMovieDB reports an IMDb ID, a non-zero
    revenue, a non-zero budget and the status ``Released``. Movies whose
    OMDB lookup fails with an HTTP error are skipped, so every returned
    record contains data from both APIs.

    Args:
        start_id (int): The starting movie ID.
        last_id (int): The ending movie ID (inclusive).

    Returns:
        list: A list of merged movie data dictionaries. On key clashes the
            OMDB value overrides the TheMovieDB one.
    """
    all_movies = []
    for movie_id in range(start_id, last_id + 1):
        movie = get_movie_by_id(movie_id)
        if movie is None:
            continue

        # .get() keeps a record with a missing field from aborting the run.
        is_wanted = (
            movie.get("imdb_id")
            and movie.get("revenue", 0) != 0
            and movie.get("status") == 'Released'
            and movie.get("budget", 0) != 0
        )
        if not is_wanted:
            continue

        # check this movie in omdb
        try:
            omdb_json = get_movie_from_omdb(movie["imdb_id"])
        except requests.exceptions.HTTPError as e:
            detail = e.response.text if e.response is not None else e
            print(f"OMDB Received response: {detail}")
            # Skip this movie: appending here would reuse the previous
            # iteration's merged record, or fail when there is none yet.
            continue

        all_movies.append({**movie, **omdb_json})
    return all_movies


Get the latest available movie ID from the TMDB API

In [None]:
# Ask TheMovieDB for its latest movie and keep only the "id" field.
last_movie = get_latest_movie()['id']
# Bare expression: in a notebook the value is shown as the cell output.
last_movie

# current last movie 80001

In [None]:
# Inclusive range of TheMovieDB IDs to crawl in this run:
# first ID to fetch ...
start_movie = 171710
# ... and last ID to fetch
last_movie = 180000

# Collect the merged metadata of every qualifying movie in that range
all_movies = fetch_all_movies(start_id=start_movie, last_id=last_movie)

# Flatten the nested records into columns and store them as one
# gzip-compressed Parquet file named after the last ID of the range
df = pd.json_normalize(all_movies)
df.to_parquet(path=f"../tmdb/movies_{last_movie}.parquet", compression='gzip')


In [None]:
import pandas as pd
import pyarrow.parquet as pq
import os

def read_parquet_files_to_dataframe(directory_path: str) -> pd.DataFrame:
    """Reads all parquet files from a specified directory and merges them into a single DataFrame.

    Only files directly inside the directory whose names end in ``.parquet``
    are read, in alphabetical order.

    Args:
        directory_path (str): The path to the directory containing the parquet files.

    Returns:
        pd.DataFrame: A DataFrame containing the merged data from all parquet
            files, with a fresh index. Empty when the directory holds no
            parquet files.
    """
    # Sorted so the row order does not depend on the order in which the
    # operating system lists the directory.
    files = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))

    if not files:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    # Read each file into its own DataFrame
    dataframes = [
        pq.ParquetFile(os.path.join(directory_path, file)).read().to_pandas()
        for file in files
    ]

    # Concatenate all the DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)

In [None]:
# MOVIES
# Load every crawl result from ../tmdb/, keep one row per movie 'id', and give
# the flattened credits columns their short names, all in one pipeline
movies_dataframe = (
    read_parquet_files_to_dataframe('../tmdb/')
    .drop_duplicates(subset=['id'])
    .rename(columns={"credits.cast": "cast", "credits.crew": "crew"})
)
movies_dataframe.columns
# Write the cleaned-up pack as a gzip-compressed Parquet file
movies_dataframe.to_parquet('../data/new_pack.parquet', compression='gzip')

In [None]:
# Merge every pack stored in ../data/ into a single file.
movies_dataframe = read_parquet_files_to_dataframe('../data/')
# full_pack.parquet is written into the directory that was just read, so on a
# re-run the previous full pack is part of the input. Keep one row per movie
# 'id' so repeated runs do not multiply the data.
# NOTE(review): assumes every pack in ../data/ has an 'id' column - confirm.
movies_dataframe = movies_dataframe.drop_duplicates(subset=['id'])
movies_dataframe.to_parquet('../data/full_pack.parquet', compression='gzip')
