In [None]:
import os
import pandas as pd
import requests
import json
from datetime import datetime
from dotenv import load_dotenv

In [None]:
# ========== CONFIG =========

load_dotenv()  # loads from .env file

# HTTPS endpoint: the API key travels as a query parameter, so it must not be
# sent over plain HTTP.
OMDB_BASE_URL = "https://www.omdbapi.com/"
OMDB_API_KEY = os.getenv("OMDB_API_KEY")
if not OMDB_API_KEY:
    # Surface the misconfiguration early instead of discovering it only after
    # a full batch of requests has come back empty.
    print("Warning: OMDB_API_KEY is not set; OMDB requests will be sent without a valid key.")

SOURCE_SYSTEM_IMDB = 'imdb'  # Assuming 'imdb' is used for the initial movie ID list
SOURCE_SYSTEM_OMDB = 'omdb'
RAW_DATA_DIR_IMDB = os.path.join('..', '..', 'data', 'raw', 'imdb')
RAW_DATA_DIR = os.path.join('..', '..', 'data', 'raw', 'omdb')

# Ensure raw data directory exists
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# API Request Limits
DAILY_API_REQUEST_LIMIT = 1000  # OMDB API usually has lower limits than TMDB

In [4]:
# ========== HELPER FUNCTIONS =========

def get_already_extracted_ids(directory, prefix, extension='.json'):
    """Collect the IDs embedded in the names of already extracted raw files.

    Walks ``directory`` recursively and looks at every file whose name starts
    with ``prefix`` and ends with ``extension``. The name, without the
    extension, is split on underscores and the fourth token is taken as the
    ID; for example ``omdb_movie_stats_tt0035423_20240102_101500.json``
    yields ``tt0035423``.

    Args:
        directory: Root directory to search (subdirectories included).
        prefix: Required start of the file name.
        extension: Required end of the file name. Defaults to ``'.json'``.

    Returns:
        set[str]: The distinct ID tokens found. Empty when nothing matches or
        ``directory`` does not exist.
    """
    extracted_ids = set()
    for _root, _dirs, files in os.walk(directory):
        for f in files:
            if not (f.startswith(prefix) and f.endswith(extension)):
                continue
            # Drop the extension before splitting so it can never end up
            # glued to the ID token.
            stem = f[:len(f) - len(extension)]
            parts = stem.split('_')
            if len(parts) >= 4:
                extracted_ids.add(parts[3])
            else:
                print(f"Warning: Could not parse ID from filename {f}")
    return extracted_ids

def make_omdb_api_call(imdb_id, api_key=OMDB_API_KEY, timeout=10):
    """Fetch the OMDB record for one IMDb ID.

    Args:
        imdb_id: IMDb identifier, sent as the ``i`` query parameter.
        api_key: OMDB API key, sent as the ``apikey`` query parameter.
            Defaults to the module-level ``OMDB_API_KEY``.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Defaults to 10.

    Returns:
        dict | None: The decoded JSON body when its ``Response`` field equals
        ``'True'``; ``None`` when the request fails, the body is not valid
        JSON, or OMDB reports an unsuccessful lookup.
    """
    query = {'i': imdb_id, 'apikey': api_key}
    try:
        # `params` lets requests do the URL encoding; `timeout` keeps a single
        # stalled connection from blocking the whole extraction loop.
        response = requests.get(OMDB_BASE_URL, params=query, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Report only the exception type: requests error messages can include
        # the full request URL, which carries the API key.
        print(f"Warning: OMDB request failed for {imdb_id}: {type(exc).__name__}")
        return None

    # OMDB specific success indicator
    if isinstance(data, dict) and data.get('Response') == 'True':
        return data
    return None

def save_json_response(data, id_value, base_dir, filename_prefix=""):
    """Write ``data`` as indented JSON into a per-day folder under ``base_dir``.

    The target folder is ``<base_dir>/<YYYY>/<MM>/<DD>/<SOURCE_SYSTEM_OMDB>``
    (created if needed) and the file is named
    ``<filename_prefix>_<id_value>_<YYYYmmdd_HHMMSS>.json``, all derived from
    the current local time. Errors while writing are printed, not raised.
    """
    now = datetime.now()

    # Year / month / day become three nested directory levels.
    date_levels = [now.strftime(fmt) for fmt in ('%Y', '%m', '%d')]
    target_dir = os.path.join(base_dir, *date_levels, SOURCE_SYSTEM_OMDB)
    os.makedirs(target_dir, exist_ok=True)

    stamp = now.strftime("%Y%m%d_%H%M%S")
    target_path = os.path.join(target_dir, f"{filename_prefix}_{id_value}_{stamp}.json")

    try:
        with open(target_path, 'w', encoding='utf-8') as out_file:
            json.dump(data, out_file, indent=4)
    except Exception as err:
        print(f"Error saving JSON for ID {id_value}: {err}")

def load_imdb_ids(file_path):
    """Read the CSV at ``file_path`` and return its ``imdb_movie_id`` column as a list."""
    return pd.read_csv(file_path)['imdb_movie_id'].tolist()

def collect_all_raw_json_data(base_dir, prefix, extension='.json'):
    """
    Gather the parsed content of every raw JSON file below ``base_dir``.

    A file is included when its name starts with ``prefix`` and ends with
    ``extension``. Files that cannot be read or decoded are reported on stdout
    and skipped. Returns the list of parsed objects in traversal order.
    """
    collected = []

    for dirpath, _dirnames, filenames in os.walk(base_dir):
        matching_names = (
            name for name in filenames
            if name.startswith(prefix) and name.endswith(extension)
        )
        for name in matching_names:
            full_path = os.path.join(dirpath, name)
            try:
                with open(full_path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
            except json.JSONDecodeError as err:
                print(f"Error decoding JSON from {name}: {err}")
            except Exception as err:
                print(f"Error reading {name}: {err}")
            else:
                collected.append(payload)

    print(f"Collected {len(collected)} raw JSON responses.")
    return collected

In [5]:
# =========  MAIN EXECUTION FLOW  =========

if __name__ == "__main__":

    # --------- STEP 1: Load initial IMDb movie IDs ---------

    source_file_imdb_ids = os.path.join(RAW_DATA_DIR_IMDB, f'{SOURCE_SYSTEM_IMDB}_movie_ids.csv')
    if not os.path.exists(source_file_imdb_ids):
        print(f"Error: IMDb movie IDs file not found at {source_file_imdb_ids}.")
        # Stop with a non-zero status. The bare `exit()` helper is provided for
        # interactive sessions and is not meant for program flow.
        raise SystemExit(1)

    initial_imdb_ids = load_imdb_ids(source_file_imdb_ids)

    # --------- STEP 2: Identify IMDb IDs not yet extracted for OMDB ---------

    # Raw files are named "<prefix>_<imdb id>_<timestamp>.json" by
    # save_json_response, so this prefix matches every stored OMDB response.
    prefix_for_check = f'{SOURCE_SYSTEM_OMDB}_movie_stats_tt'
    extracted_omdb_ids = get_already_extracted_ids(RAW_DATA_DIR, prefix_for_check)

    imdb_ids_not_yet_extracted_for_omdb = sorted([
        imdb_id for imdb_id in initial_imdb_ids
        if str(imdb_id) not in extracted_omdb_ids
    ])

    if len(initial_imdb_ids) > 0:  # Avoid division by zero
        print(f"=> {(len(imdb_ids_not_yet_extracted_for_omdb) / len(initial_imdb_ids) * 100):.2f} % remaining to be extracted")
    else:
        print("No IMDb IDs to process.")

    # Cap the batch so one run stays within the daily request budget.
    ids_to_process = imdb_ids_not_yet_extracted_for_omdb[:DAILY_API_REQUEST_LIMIT]

    # --------- STEP 3: Perform OMDB API calls and save raw JSON responses ---------

    for i, imdb_id in enumerate(ids_to_process):
        json_data = make_omdb_api_call(imdb_id)
        if json_data:
            save_json_response(json_data, imdb_id, RAW_DATA_DIR, f'{SOURCE_SYSTEM_OMDB}_movie_stats')
        print(f"[{i+1}/{len(ids_to_process)}] Processed IMDb ID: {imdb_id}")

    # --------- STEP 4: Collect all raw OMDB JSON responses into a single structure ---------

    all_omdb_raw_responses = collect_all_raw_json_data(RAW_DATA_DIR, prefix_for_check)

    # The combined file name starts with "__all_", so it never matches
    # prefix_for_check and is not re-collected on the next run.
    overall_collected_json_filename = f'__all_{SOURCE_SYSTEM_OMDB}_api_output_collected.json'
    overall_collected_json_path = os.path.join(RAW_DATA_DIR, overall_collected_json_filename)
    with open(overall_collected_json_path, 'w', encoding='utf-8') as f:
        json.dump(all_omdb_raw_responses, f, indent=4)

    # --------- STEP 5: Transform combined OMDB API responses into a structured DataFrame ---------

    if not all_omdb_raw_responses:
        print("No OMDB API responses to transform.")
    else:
        # Create DataFrame from the collected list of dictionaries
        df_omdb_raw = pd.DataFrame(all_omdb_raw_responses)

        # A field can be absent from every stored response; add it as an
        # all-NA column so the selection below yields an empty result instead
        # of raising KeyError.
        for required_column in ('imdbID', 'BoxOffice'):
            if required_column not in df_omdb_raw.columns:
                df_omdb_raw[required_column] = pd.NA

        # Filter for valid BoxOffice values (starting with '$' and not NaN)
        has_box_office = (
            df_omdb_raw['BoxOffice'].notna()
            & df_omdb_raw['BoxOffice'].astype(str).str.startswith('$')
        )
        df_movie_box_office = df_omdb_raw[has_box_office].copy()  # .copy() avoids SettingWithCopyWarning

        # Strip the currency symbol and thousands separators, then parse the
        # digit strings before casting to the pandas nullable integer type.
        box_office_digits = (
            df_movie_box_office['BoxOffice']
            .astype(str)  # Ensure string operations
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
        )
        df_movie_box_office['BoxOffice'] = pd.to_numeric(box_office_digits).astype('Int64')

        # Select and rename relevant columns
        selected_column_names = ['imdbID', 'BoxOffice']
        new_column_names = [f'{SOURCE_SYSTEM_OMDB}_imdb_id', f'{SOURCE_SYSTEM_OMDB}_box_office']
        columns_dict = dict(zip(selected_column_names, new_column_names))
        df_movie_box_office = df_movie_box_office[selected_column_names].rename(columns=columns_dict)

        # Drop duplicates based on IMDb ID, keeping the first occurrence if multiple exist
        df_movie_box_office = df_movie_box_office.drop_duplicates(subset=[f'{SOURCE_SYSTEM_OMDB}_imdb_id']).reset_index(drop=True)

        # Save
        output_csv_path = os.path.join(RAW_DATA_DIR, f'{SOURCE_SYSTEM_OMDB}_movie_box_office.csv')
        df_movie_box_office.to_csv(output_csv_path, index=False)

    print("\n--- OMDB Data Transformation Complete ---")

=> 100.00 % remaining to be extracted
[1/1000] Processed IMDb ID: tt0035423
[2/1000] Processed IMDb ID: tt0069049
[3/1000] Processed IMDb ID: tt0082328
[4/1000] Processed IMDb ID: tt0088751
[5/1000] Processed IMDb ID: tt0096056
[6/1000] Processed IMDb ID: tt0104988
[7/1000] Processed IMDb ID: tt0108116
[8/1000] Processed IMDb ID: tt0108549
[9/1000] Processed IMDb ID: tt0109173
[10/1000] Processed IMDb ID: tt0110476
[11/1000] Processed IMDb ID: tt0112912
[12/1000] Processed IMDb ID: tt0113092
[13/1000] Processed IMDb ID: tt0114447
[14/1000] Processed IMDb ID: tt0114722
[15/1000] Processed IMDb ID: tt0115686
[16/1000] Processed IMDb ID: tt0115937
[17/1000] Processed IMDb ID: tt0116391
[18/1000] Processed IMDb ID: tt0116748
[19/1000] Processed IMDb ID: tt0116991
[20/1000] Processed IMDb ID: tt0117825
[21/1000] Processed IMDb ID: tt0118154
[22/1000] Processed IMDb ID: tt0118578
[23/1000] Processed IMDb ID: tt0118589
[24/1000] Processed IMDb ID: tt0118652
[25/1000] Processed IMDb ID: tt0118