In [None]:
import os
import pandas as pd
import requests
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
from dotenv import load_dotenv

In [None]:
# =========  CONFIG ========= 

load_dotenv()  # loads from .env file

# TMDB credentials read from the environment; each is None when the variable is unset.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
TMDB_READ_TOKEN = os.environ.get("TMDB_READ_TOKEN")
TMDB_BASE_URL = "https://api.themoviedb.org/3"

# Source-system tags used when building file names and directory names.
SOURCE_SYSTEM_IMDB = "imdb"
SOURCE_SYSTEM_TMDB = "tmdb"

# Data locations, all relative to the notebook's working directory.
RAW_DATA_DIR_IMDB = os.path.join(os.pardir, os.pardir, "data", "raw", "imdb")
RAW_DATA_DIR = os.path.join(os.pardir, os.pardir, "data", "raw", "tmdb")
LOAD_DIR = os.path.join(os.pardir, os.pardir, "data", "raw", "box_office_mapping")

# Only the output directories are created here; the IMDb directory is an input.
os.makedirs(name=RAW_DATA_DIR, exist_ok=True)
os.makedirs(name=LOAD_DIR, exist_ok=True)

DAILY_API_REQUEST_LIMIT = 25000  # cap on the number of IDs handled per collection run
MAX_THREADS = 10                 # worker count for the thread pool


In [4]:
# ========= HELP FUNCTIONS ========= 

def get_already_extracted_ids(directory, prefix, extension='.json'):
    """
    Collect the IDs embedded in the names of files already saved under `directory`.

    The directory tree is walked recursively. A file is considered when its name
    starts with `prefix` and ends with `extension`. The ID is the
    underscore-separated token in which `prefix` ends, e.g. for the prefix
    'tmdb_imdb_mapping_tt' and the file
    'tmdb_imdb_mapping_tt1234567_20240101_120000.json' the ID is 'tt1234567'.

    Args:
        directory: Root directory to scan.
        prefix: Leading part of the file name, up to and including the first
            characters of the ID (such as 'tt').
        extension: Required file-name suffix. Defaults to '.json'.

    Returns:
        set[str]: The distinct IDs found (empty when nothing matches or the
        directory does not exist).
    """
    # The position of the ID token follows from the prefix itself, so prefixes
    # with any number of underscores work (not only those with exactly three).
    id_index = prefix.count('_')

    extracted_ids = set()
    for _, _, files in os.walk(directory):
        for f in files:
            if not (f.startswith(prefix) and f.endswith(extension)):
                continue
            # Drop the extension first so it can never end up inside the ID.
            stem = f[:len(f) - len(extension)]
            parts = stem.split('_')
            if id_index < len(parts) and parts[id_index]:
                extracted_ids.add(parts[id_index])
    return extracted_ids


def make_tmdb_api_call(endpoint, item_id="", params=None, timeout=30):
    """
    Issue a GET request to `{TMDB_BASE_URL}/movie/{item_id}{endpoint}`.

    Args:
        endpoint: Path suffix appended after the item ID ("" for the bare resource).
        item_id: Identifier placed in the URL path.
        params: Optional mapping of extra query parameters. It is copied, never
            modified.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The decoded JSON body on success, or None when the request fails, the
        server answers with an error status, or the body is not valid JSON.
    """
    url = f"{TMDB_BASE_URL}/movie/{item_id}{endpoint}"

    headers = {"Content-Type": "application/json;charset=utf-8"}
    # Avoid sending the literal string "Bearer None" when no token is configured.
    if TMDB_READ_TOKEN:
        headers["Authorization"] = f"Bearer {TMDB_READ_TOKEN}"

    # Work on a copy so the caller's dictionary is left untouched.
    query = dict(params) if params else {}
    if TMDB_API_KEY:
        query["api_key"] = TMDB_API_KEY

    try:
        response = requests.get(url, headers=headers, params=query, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # The exception text contains the full request URL, including the
        # api_key query parameter; mask it so the key never reaches the output.
        message = str(e)
        if TMDB_API_KEY:
            message = message.replace(TMDB_API_KEY, "***")
        print(f"TMDB API call failed for ID {item_id}: {message}")
        return None


def save_json_response(data, id_value, base_dir, filename_prefix=""):
    """
    Write `data` as indented JSON into a dated sub-directory of `base_dir`.

    The target is
    `<base_dir>/<YYYY>/<MM>/<DD>/<SOURCE_SYSTEM_TMDB>/<filename_prefix>_<id_value>_<YYYYmmdd_HHMMSS>.json`,
    using the current local time. Write errors are reported with a printed
    message instead of being raised.

    Args:
        data: JSON-serialisable object to store.
        id_value: Identifier embedded in the file name.
        base_dir: Root directory below which the dated folders are created.
        filename_prefix: Leading part of the file name.

    Returns:
        None
    """
    current_date_time = datetime.now()
    daily_dir = os.path.join(
        base_dir,
        current_date_time.strftime('%Y'),
        current_date_time.strftime('%m'),
        current_date_time.strftime('%d'),
        SOURCE_SYSTEM_TMDB
    )
    os.makedirs(daily_dir, exist_ok=True)

    timestamp_str = current_date_time.strftime("%Y%m%d_%H%M%S")
    filename = f"{filename_prefix}_{id_value}_{timestamp_str}.json"
    file_path = os.path.join(daily_dir, filename)
    # Write to a side file first: a failure must never leave a truncated
    # "<...>.json" behind, because files with that suffix are treated as
    # successfully extracted IDs on later runs.
    tmp_path = f"{file_path}.tmp"

    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_path, file_path)
    except Exception as e:
        print(f"Error saving JSON for ID {id_value}: {e}")
        # Best-effort cleanup of the partial side file; the original error has
        # already been reported above.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError as cleanup_error:
            print(f"Could not remove temporary file {tmp_path}: {cleanup_error}")


def load_imdb_ids(file_path):
    """
    Read the CSV at `file_path` and return its 'imdb_movie_id' column as a list.
    """
    id_column = pd.read_csv(file_path).loc[:, 'imdb_movie_id']
    return id_column.tolist()


def load_json_file(filepath):
    """
    Read the UTF-8 text file at `filepath` and return the object decoded from its JSON content.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


In [5]:
# =========  HELP FUNCTIONS 2 ========= 

def get_tmdb_id_for_imdb_id(imdb_id, mapping_df):
    """
    Look up `imdb_id` in the 'imdb_movie_id' column of `mapping_df`.

    Returns the 'tmdb_movie_id' value of the first matching row, or None when
    no row matches.
    """
    matches = mapping_df.loc[mapping_df['imdb_movie_id'] == imdb_id]
    if matches.empty:
        return None
    return matches['tmdb_movie_id'].values[0]


def get_imdb_id_for_tmdb_id(tmdb_id, mapping_df):
    """
    Look up `tmdb_id` in the 'tmdb_movie_id' column of `mapping_df`.

    Returns the 'imdb_movie_id' value of the first matching row, or None when
    no row matches.
    """
    matches = mapping_df.loc[mapping_df['tmdb_movie_id'] == tmdb_id]
    if matches.empty:
        return None
    return matches['imdb_movie_id'].values[0]


def get_tmdb_data_for_imdb_id(imdb_id, stats_df):
    """
    Look up `imdb_id` in the 'tmdb_imdb_id' column of `stats_df`.

    Returns the first matching row as a {column: value} dictionary, or None
    when no row matches.
    """
    matches = stats_df.loc[stats_df['tmdb_imdb_id'] == imdb_id]
    if matches.empty:
        return None
    first_row = matches.iloc[0]
    return first_row.to_dict()

In [6]:
# =========  API DATA COLLECTION ========= 

def collect_api_data(imdb_ids_list, data_type="mapping"):
    """
    Fetch TMDB movie resources concurrently and store each response as a JSON file.

    IDs that already have a saved file under RAW_DATA_DIR (as reported by
    get_already_extracted_ids) are skipped, and at most DAILY_API_REQUEST_LIMIT
    requests are issued per call.

    Args:
        imdb_ids_list: IMDb IDs to process.
        data_type: "mapping" requests /movie/<imdb_id>; "stats" translates each
            IMDb ID to its TMDB ID through the mapping CSV and requests
            /movie/<tmdb_id>.

    Returns:
        None. Progress and problems are reported with printed messages.

    Raises:
        ValueError: If `data_type` is neither "mapping" nor "stats".
    """
    # Both modes request the bare /movie/<id> resource. This must be defined for
    # every mode: it used to be assigned only for "mapping", which made "stats"
    # fail with an unbound local variable.
    endpoint = ""
    imdb_tmdb_map = {}

    if data_type == "mapping":
        file_prefix = f'{SOURCE_SYSTEM_TMDB}_imdb_mapping'
    elif data_type == "stats":
        file_prefix = f'{SOURCE_SYSTEM_TMDB}_movie_stats'
        mapping_name = f'{SOURCE_SYSTEM_TMDB}_imdb_2_tmdb_id_mapping.csv'
        # The mapping step may have written the CSV to LOAD_DIR (as the main
        # flow does) or to RAW_DATA_DIR; use whichever copy is newest.
        candidates = [
            path
            for path in (os.path.join(LOAD_DIR, mapping_name), os.path.join(RAW_DATA_DIR, mapping_name))
            if os.path.exists(path)
        ]
        if not candidates:
            print("Mapping file not found. Please run the mapping step first.")
            return
        mapping_file = max(candidates, key=os.path.getmtime)
        df_mapping = pd.read_csv(mapping_file)
        imdb_tmdb_map = dict(zip(df_mapping['imdb_movie_id'], df_mapping['tmdb_movie_id']))
    else:
        raise ValueError("Invalid data_type. Must be 'mapping' or 'stats'.")

    prefix_pattern = f'{file_prefix}_tt'
    already_extracted = get_already_extracted_ids(RAW_DATA_DIR, prefix_pattern)
    ids_to_process = [imdb_id for imdb_id in imdb_ids_list if str(imdb_id) not in already_extracted]

    if not ids_to_process:
        print("No new IDs to process for this data type.")
        return

    # Resolve the ID to request BEFORE applying the daily limit, so IDs without
    # a TMDB mapping do not use up the request budget on every run.
    jobs = []
    for imdb_id in ids_to_process:
        if len(jobs) >= DAILY_API_REQUEST_LIMIT:
            break
        if data_type == "mapping":
            jobs.append((imdb_id, imdb_id))
            continue
        tmdb_id = imdb_tmdb_map.get(imdb_id)
        # pd.isna also covers None and NaN cells read from the CSV.
        if pd.isna(tmdb_id) or not tmdb_id:
            print(f"Skipping {imdb_id}: No TMDB ID found.")
            continue
        # A column containing gaps is read as float; "603.0" is not a valid path segment.
        if isinstance(tmdb_id, float):
            tmdb_id = int(tmdb_id)
        jobs.append((imdb_id, tmdb_id))

    request_count = 0
    saved_count = 0
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = {
            executor.submit(make_tmdb_api_call, endpoint, request_id): imdb_id
            for imdb_id, request_id in jobs
        }
        # Every submitted job results in one API request, whether or not it succeeds.
        request_count = len(futures)

        for i, future in enumerate(as_completed(futures), 1):
            imdb_id = futures[future]
            try:
                json_data = future.result()
                if json_data:
                    save_json_response(json_data, imdb_id, RAW_DATA_DIR, file_prefix)
                    saved_count += 1
                else:
                    print(f"[{i}] No data for IMDb ID: {imdb_id}")
            except Exception as e:
                print(f"[{i}] Error processing IMDb ID {imdb_id}: {e}")

    print(
        f"Finished TMDB API Data Collection ({data_type}). "
        f"Total requests made: {request_count} (responses saved: {saved_count})"
    )


In [7]:
# =========  DATA TRANSFORMATION ========= 

def create_imdb_tmdb_id_mapping(raw_data_directory, output_dir):
    """
    Build the IMDb-to-TMDB ID table from the saved mapping responses.

    Every '<SOURCE_SYSTEM_TMDB>_imdb_mapping_tt*.json' file below
    `raw_data_directory` is read. A pair is kept only when the response has
    both "id" and "imdb_id" and its "imdb_id" equals the ID in the file name.
    The de-duplicated table, sorted by IMDb ID, is written to
    '<output_dir>/<SOURCE_SYSTEM_TMDB>_imdb_2_tmdb_id_mapping.csv'.

    Returns:
        pandas.DataFrame with columns 'imdb_movie_id' and 'tmdb_movie_id'.
    """
    wanted_prefix = f'{SOURCE_SYSTEM_TMDB}_imdb_mapping_tt'
    id_pairs = []

    for folder, _, names in os.walk(raw_data_directory):
        for name in names:
            if not (name.startswith(wanted_prefix) and name.endswith(".json")):
                continue
            try:
                payload = load_json_file(os.path.join(folder, name))
                # Fourth underscore-separated token of the file name is the IMDb ID.
                expected_imdb_id = name.split("_")[3]
                tmdb_id = payload.get("id")
                imdb_id = payload.get("imdb_id")
                ids_agree = bool(tmdb_id and imdb_id) and str(imdb_id) == expected_imdb_id
                if not ids_agree:
                    print(f"Mismatch or missing IDs in file {name}")
                    continue
                id_pairs.append((imdb_id, tmdb_id))
            except Exception as e:
                print(f"Error processing JSON file {name}: {e}")

    df_mapping = (
        pd.DataFrame(id_pairs, columns=['imdb_movie_id', 'tmdb_movie_id'])
        .drop_duplicates()
        .sort_values('imdb_movie_id')
        .reset_index(drop=True)
    )

    csv_name = f'{SOURCE_SYSTEM_TMDB}_imdb_2_tmdb_id_mapping.csv'
    df_mapping.to_csv(os.path.join(output_dir, csv_name), index=False)
    return df_mapping


def transform_tmdb_api_responses(raw_data_directory, output_dir):
    """
    Flatten the saved movie-stats responses into one table and write it as CSV.

    Every '<SOURCE_SYSTEM_TMDB>_movie_stats_tt*.json' file below
    `raw_data_directory` contributes one row holding a fixed set of fields.
    Rows are de-duplicated on 'tmdb_imdb_id' (first occurrence kept) and the
    result is written to '<output_dir>/<SOURCE_SYSTEM_TMDB>_movie_box_office.csv'.

    Returns:
        pandas.DataFrame with the transformed rows, or None when no file
        yielded any data.
    """
    # (output column, key in the JSON response), in output column order.
    field_sources = (
        ("tmdb_budget", "budget"),
        ("tmdb_id", "id"),
        ("tmdb_imdb_id", "imdb_id"),
        ("tmdb_release_date", "release_date"),
        ("tmdb_revenue", "revenue"),
        ("tmdb_vote_average", "vote_average"),
        ("tmdb_vote_count", "vote_count"),
    )
    wanted_prefix = f'{SOURCE_SYSTEM_TMDB}_movie_stats_tt'
    records = []

    for folder, _, names in os.walk(raw_data_directory):
        for name in names:
            if not (name.startswith(wanted_prefix) and name.endswith('.json')):
                continue
            try:
                payload = load_json_file(os.path.join(folder, name))
                records.append({column: payload.get(key) for column, key in field_sources})
            except Exception as e:
                print(f"Error processing JSON file {name}: {e}")

    if len(records) == 0:
        print("No data found to transform.")
        return None

    df_transformed = (
        pd.DataFrame(records)
        .drop_duplicates(subset=['tmdb_imdb_id'])
        .reset_index(drop=True)
    )

    csv_name = f'{SOURCE_SYSTEM_TMDB}_movie_box_office.csv'
    df_transformed.to_csv(os.path.join(output_dir, csv_name), index=False)
    return df_transformed


In [8]:
# ========= MAIN EXECUTION FLOW ========= 

if __name__ == "__main__":
    # Pipeline: load IMDb IDs -> collect mapping responses -> build mapping table
    # -> collect stats responses -> build stats table -> show sample look-ups.
    source_file_imdb_ids = os.path.join(RAW_DATA_DIR_IMDB, f'{SOURCE_SYSTEM_IMDB}_movie_ids.csv')

    if os.path.exists(source_file_imdb_ids):
        imdb_ids = load_imdb_ids(source_file_imdb_ids)

        print("Starting TMDB Mapping Collection...")
        collect_api_data(imdb_ids, data_type="mapping")

        print("Creating IMDb-TMDB ID mapping...")
        df_mapping = create_imdb_tmdb_id_mapping(RAW_DATA_DIR, LOAD_DIR)

        print("Starting TMDB Stats Collection...")
        collect_api_data(imdb_ids, data_type="stats")

        print("Transforming TMDB JSON responses...")
        df_stats = transform_tmdb_api_responses(RAW_DATA_DIR, LOAD_DIR)

        # --- Example usage of helper functions ---
        both_tables_ready = all(frame is not None for frame in (df_mapping, df_stats))
        if both_tables_ready and imdb_ids:
            # Round-trip the first IMDb ID through the mapping, then show its stats row.
            sample_imdb_id = imdb_ids[0]
            tmdb_id = get_tmdb_id_for_imdb_id(sample_imdb_id, df_mapping)
            print(f"Sample IMDb ID: {sample_imdb_id} maps to TMDB ID: {tmdb_id}")

            if tmdb_id:
                imdb_id_back = get_imdb_id_for_tmdb_id(tmdb_id, df_mapping)
                print(f"TMDB ID: {tmdb_id} maps back to IMDb ID: {imdb_id_back}")

            tmdb_data = get_tmdb_data_for_imdb_id(sample_imdb_id, df_stats)
            print(f"TMDB data for IMDb ID {sample_imdb_id}:\n{tmdb_data}")
    else:
        print(f"Error: IMDb movie IDs file not found at {source_file_imdb_ids}.")

Starting TMDB Mapping Collection...
TMDB API call failed for ID tt0118656: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0118656?api_key=<REDACTED>
TMDB API call failed for ID tt0108549: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0108549?api_key=<REDACTED>
TMDB API call failed for ID tt0112912: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0112912?api_key=<REDACTED>
TMDB API call failed for ID tt0154187: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0154187?api_key=<REDACTED>
TMDB API call failed for ID tt0159503: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0159503?api_key=<REDACTED>
TMDB API call failed for ID tt0131597: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0131597?api_key=<REDACTED>