## Python Movie Data Collection Script

This script fetches movie data from TheMovieDB API, uses each movie's `imdb_id` to look up additional metadata in the OMDB API, merges the two datasets, and stores the result in Parquet format. It also builds a separate dataset from TheMovieDB API credits. Here's a breakdown of its functionality:

### 1. **Importing Required Libraries**:
   - Essential libraries such as `requests`, `os`, `backoff`, and `pandas` are imported for handling HTTP requests, environment variables, retry logic, and data manipulation respectively.

### 2. **Header Configuration**:
HTTP headers are configured with authorization using an API token retrieved from environment variables.
You need an API access token for TheMovieDB API (see https://developer.themoviedb.org/reference/intro/getting-started) and an OMDB API key (see https://www.omdbapi.com/apikey.aspx). The free OMDB API key is limited to 1000 calls per day.
After you get those keys, create a `.env` file in the root folder and fill in the following values:

        API_KEY=
        API_TOKEN=
        OMDB_KEY=

### 3. **Retry Logic**:
   - A custom retry logic is defined using the `backoff` library to manage HTTP errors, particularly focusing on status code 429 (Too Many Requests).

### 4. **API Call Function**:
   - A function `call_get(url)` is defined to make GET requests to the specified URL and raise exceptions for unsuccessful responses.

### 5. **Movie Data Retrieval Functions**:
   - `get_latest_movie()`: Fetches the latest movie data from TheMovieDB API.
   - `get_movie_credits_by_id(id)`: Obtains movie credits based on the movie ID from TheMovieDB API, with error handling for non-existent movie IDs.
   - `get_movie_by_id(id)`: Retrieves movie data by ID from TheMovieDB API, also with error handling for non-existent movie IDs.
   - `get_movie_from_omdb(imdb_id, api_key)`: Fetches movie data from the OMDB API using the IMDb ID.

### 6. **Bulk Data Collection Functions**:
   - `fetch_all_movies(start_id, last_id)` and `fetch_all_credits(start_id, last_id)` are designed to loop through a range of movie IDs, collecting and merging movie data from both TheMovieDB and OMDB APIs, and movie credits from TheMovieDB API respectively.

### 7. **Data Normalization and Storage**:
   - The script utilizes `pandas` to normalize the collected JSON data into a tabular format, and subsequently stores the data in Parquet files with gzip compression, for both movies and credits data.

This script represents a systematic approach to collecting, normalizing, and storing movie-related data from different online sources through API interactions, with robust error handling and retry logic to ensure the reliability of the data collection process.

In [None]:
import requests
import os
import backoff
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (e.g. API_TOKEN and OMDB_KEY, read below via os.getenv)
# from the .env file located one directory above the working directory.
load_dotenv('../.env')

# Headers sent with every TheMovieDB request: ask for JSON and authenticate
# with the bearer token taken from the API_TOKEN environment variable.
headers = {"accept": "application/json"}
headers["Authorization"] = f"Bearer {os.getenv('API_TOKEN')}"

# Custom retry condition
def giveup(exc):
    """Decide whether ``backoff`` should stop retrying after *exc*.

    Only HTTP 429 (Too Many Requests) is considered worth retrying.

    Args:
        exc: The exception caught by ``backoff``; expected to expose a
            ``response`` attribute with a ``status_code``.

    Returns:
        ``True`` to abort retrying, ``False`` to keep retrying.
    """
    response = getattr(exc, "response", None)
    # Compare against None explicitly rather than relying on truthiness:
    # an exception without a response has no status code to inspect, so
    # retrying blindly is pointless -- give up instead of raising
    # AttributeError from inside the retry machinery.
    if response is None:
        return True
    # Don't retry if the exception is not a 429 status
    return response.status_code != 429

@backoff.on_exception(
    backoff.expo,  # Exponential backoff strategy
    requests.exceptions.HTTPError,  # Exception to look for
    max_tries=10,  # Maximum retry attempts
    giveup=giveup,  # Function to determine if retry should be aborted
)
def call_get(url, timeout=30):
    """GET *url* with the shared ``headers`` and return the decoded JSON body.

    HTTP errors are retried with exponential backoff (up to 10 attempts)
    unless ``giveup`` says otherwise.

    Args:
        url: Full URL to request.
        timeout: Seconds to wait for the server before aborting. Without a
            timeout ``requests`` can block forever on a stalled connection,
            which would hang a long bulk-collection loop.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.HTTPError: On an unsuccessful status code once
            retries are exhausted or abandoned.
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds (not retried by the decorator).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def get_latest_movie():
    """Return the JSON payload of TheMovieDB ``/movie/latest`` endpoint."""
    return call_get("https://api.themoviedb.org/3/movie/latest")


def get_movie_credits_by_id(id):
    """Fetch the credits of movie *id* from TheMovieDB.

    Returns the decoded JSON on success. A 404 answer yields ``None``
    (with a message when the body reports API status code 34 or is not
    valid JSON). Any other HTTP error is printed and re-raised.
    """
    url = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    try:
        return call_get(url)
    except requests.exceptions.HTTPError as err:
        # Anything other than "not found" is a real failure: report and propagate.
        if err.response.status_code != 404:
            print(f"An error occurred: {err}")
            raise err

        try:
            payload = err.response.json()
        except ValueError:
            # The 404 body was not JSON.
            print(f"Received unexpected response: {err.response.text}")
            return None

        if payload.get('status_code') == 34:
            print(f"No credits found for ID: {id}")
        return None


def get_movie_by_id(id):
    """Fetch the details of movie *id* from TheMovieDB.

    Returns the decoded JSON on success. A 404 answer yields ``None``
    (with a message when the body reports API status code 34 or is not
    valid JSON). Any other HTTP error is printed and re-raised.
    """
    url = f"https://api.themoviedb.org/3/movie/{id}?language=en-US"
    try:
        return call_get(url)
    except requests.exceptions.HTTPError as err:
        # Anything other than "not found" is a real failure: report and propagate.
        if err.response.status_code != 404:
            print(f"An error occurred: {err}")
            raise err

        try:
            payload = err.response.json()
        except ValueError:
            # The 404 body was not JSON.
            print(f"Received unexpected response: {err.response.text}")
            return None

        if payload.get('status_code') == 34:
            print(f"No movie found for ID: {id}")
        return None

def get_movie_from_omdb(imdb_id, api_key=None, timeout=30):
    """Fetch a movie record from the OMDB API by IMDb ID.

    Args:
        imdb_id: IMDb identifier of the movie.
        api_key: OMDB API key. When omitted, the ``OMDB_KEY`` environment
            variable is read at call time (not at function-definition
            time), so a key loaded or changed later is still picked up.
        timeout: Seconds to wait for the server before aborting, so a
            stalled connection cannot hang the collection loop.

    Returns:
        The decoded JSON response.

    Raises:
        requests.exceptions.HTTPError: On an unsuccessful status code.
        requests.exceptions.Timeout: If no answer arrives within *timeout*.
    """
    if api_key is None:
        api_key = os.getenv('OMDB_KEY')
    # Let requests build the query string so both values are URL-encoded.
    response = requests.get(
        "https://www.omdbapi.com/",
        params={"i": imdb_id, "apikey": api_key},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def fetch_all_movies(start_id, last_id):
    """Collect merged TheMovieDB + OMDB records for a range of movie IDs.

    A movie is kept only when TheMovieDB returns it, it has an IMDb ID, its
    status is ``'Released'``, and both revenue and budget are non-zero.
    Each kept movie is then looked up in OMDB and the two payloads are
    merged; on key collisions the OMDB value wins.

    Args:
        start_id: First TheMovieDB movie ID to fetch (inclusive).
        last_id: Last TheMovieDB movie ID to fetch (inclusive).

    Returns:
        A list of merged movie dictionaries.
    """
    all_movies = []
    for movie_id in range(start_id, last_id + 1):
        movie = get_movie_by_id(movie_id)
        if movie is None:
            continue

        usable = (
            movie["imdb_id"]
            and movie['revenue'] != 0
            and movie['status'] == 'Released'
            and movie['budget'] != 0
        )
        if not usable:
            continue

        # check this movie in omdb
        try:
            omdb_json = get_movie_from_omdb(movie["imdb_id"])
        except requests.exceptions.HTTPError as e:
            print(f"OMDB Received response: {e.response.text}")
            # No merged record exists for this movie, so skip it. Appending
            # here would either raise NameError or re-append the previous
            # movie's record.
            continue

        all_movies.append({**movie, **omdb_json})
    return all_movies

def fetch_all_credits(start_id, last_id):
    """Collect the credits payloads for every movie ID in ``[start_id, last_id]``.

    IDs for which ``get_movie_credits_by_id`` returns ``None`` are left out.
    """
    fetched = (
        get_movie_credits_by_id(movie_id)
        for movie_id in range(start_id, last_id + 1)
    )
    return [credit for credit in fetched if credit is not None]


In [None]:
# Take the ID of the movie reported by the latest-movie endpoint and
# display it as the cell output.
latest_payload = get_latest_movie()
last_movie = latest_payload['id']
last_movie

In [None]:
# Fetch one window of movie IDs (14801 through last_movie, inclusive),
# flatten the nested JSON into columns and save the window as a
# gzip-compressed Parquet file named after its last ID.
last_movie = 14900
all_movies = fetch_all_movies(start_id=14801, last_id=last_movie)

# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)
df = pd.json_normalize(data=all_movies)
df.to_parquet(path=f"../tmdb/movies_{last_movie}.parquet", compression='gzip')


In [None]:
# Stand-alone save cell: writes whatever `df` currently holds in the notebook
# session to the movies file named after the current value of `last_movie`.
# Neither name is defined in this cell, so the result depends on which cells
# were executed before it.
df.to_parquet(f"../tmdb/movies_{last_movie}.parquet", compression='gzip')

In [None]:
# Fetch the credits for movie IDs 7800 through 14801 (inclusive), flatten
# them and save them as a gzip-compressed Parquet file.
#
# The credits get their own variable names on purpose: reusing `last_movie`,
# `all_movies` and `df` would overwrite the movie data held in the notebook
# session, and re-running a movies save cell afterwards would then write
# credits into a movies file.
credits_last_id = 14801
all_credits = fetch_all_credits(7800, credits_last_id)

credits_df = pd.json_normalize(all_credits)
credits_df.to_parquet(f"../credits/credits_{credits_last_id}.parquet", compression='gzip')


Merge the individual Parquet files into a single file for the movies data (and likewise for the credits data)

In [54]:
import pandas as pd
import pyarrow.parquet as pq
import os

def read_parquet_files_to_dataframe(directory_path):
    """Read every ``.parquet`` file in a directory into one DataFrame.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index, or an empty DataFrame when the directory contains no
        ``.parquet`` files.
    """
    # os.listdir returns names in arbitrary order. Sorting (lexicographically)
    # makes the row order reproducible, and with it the outcome of any later
    # keep-first de-duplication.
    files = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))

    # pd.concat raises ValueError on an empty list, so handle "nothing to
    # merge" explicitly.
    if not files:
        return pd.DataFrame()

    dataframes = [
        pq.ParquetFile(os.path.join(directory_path, file)).read().to_pandas()
        for file in files
    ]

    # Concatenate all the DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)



# Movies: merge every per-range file, keep one row per movie id, and write
# the combined dataset.
movies_dataframe = read_parquet_files_to_dataframe('../tmdb/').drop_duplicates(subset=['id'])
movies_dataframe.to_parquet('../data/movies.parquet', compression='gzip')

# Credits: same procedure for the credits files.
credits_dataframe = read_parquet_files_to_dataframe('../credits//').drop_duplicates(subset=['id'])
credits_dataframe.to_parquet('../data/credits.parquet', compression='gzip')