In [1]:
import pandas as pd
import requests
import os
from IPython.display import clear_output, display

# RapidAPI credentials. The key is read from the RAPIDAPI_KEY environment
# variable so it never has to be pasted into (and later scrubbed from) the
# source; when the variable is unset it falls back to an empty string.
RAPIDAPI_KEY = os.environ.get('RAPIDAPI_KEY', '')
RAPIDAPI_HOST = "moviesdatabase.p.rapidapi.com"

# Load existing data
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that comes from a trusted source.
pickle_file_path = 'movie_ids_with_nan_rapidapi.pkl'
if not os.path.exists(pickle_file_path):
    raise FileNotFoundError(f"The file {pickle_file_path} does not exist.")

data = pd.read_pickle(pickle_file_path)
print(f"Loaded data type: {type(data)}")  # Debug statement to show the type of the loaded data
display(data)

if isinstance(data, pd.Series):
    print("Data is a Series, converting to DataFrame.")  # Debug statement
    # to_frame(name=...) keeps the values whatever the Series is called.
    # pd.DataFrame(series, columns=['movie_id']) would instead produce an
    # all-NaN column whenever the Series carries a different name.
    movies_df = data.to_frame(name='movie_id').reset_index(drop=True)
elif isinstance(data, pd.DataFrame):
    print("Data is a DataFrame.")  # Debug statement
    movies_df = data.reset_index(drop=True)
else:
    raise ValueError(f"The loaded object is neither a Series nor a DataFrame. Type: {type(data)}")

# Make sure every revenue/budget column exists; columns that are already
# present are left untouched, new ones start out filled with None.
required_columns = [
    'worldwideGross', 'worldwideGrossCurrency',
    'lifetimeGross', 'lifetimeGrossCurrency',
    'productionBudget', 'productionBudgetCurrency',
    'openingWeekendGross', 'openingWeekendGrossCurrency',
]
for column in required_columns:
    if column in movies_df.columns:
        continue
    movies_df[column] = pd.Series([None] * len(movies_df))

# Helper function to safely get values from nested dictionaries/lists
def get_nested_value(data, keys, default=None):
    """Follow ``keys`` through nested containers and return the value found.

    Args:
        data: The outermost container (typically a dict, possibly holding
            further dicts or lists).
        keys: Iterable of keys/indices applied one after another.
        default: Value returned when the path cannot be followed.

    Returns:
        The value at the end of the path. ``default`` is returned instead when
        a key is missing, a list index is out of range, an intermediate object
        is not subscriptable, or any value reached along the path is ``None``.
        With an empty ``keys`` the input ``data`` is returned unchanged.
    """
    current = data
    for key in keys:
        try:
            current = current[key]
        except (TypeError, KeyError, IndexError):
            # IndexError covers out-of-range positions in nested lists, which
            # are treated the same way as a missing dictionary key.
            return default
        if current is None:
            return default
    return current

# Function to get movie details from RapidAPI
def get_movie_details(imdb_id, api_key, api_host, timeout=30):
    """Fetch budget and gross figures for one title from the RapidAPI host.

    Sends ``GET https://{api_host}/titles/{imdb_id}`` with
    ``info=revenue_budget`` and reads the amounts and currencies out of the
    ``results`` object of the JSON response.

    Args:
        imdb_id: Title identifier placed in the request URL.
        api_key: Value sent in the ``X-RapidAPI-Key`` header.
        api_host: Host name used for the URL and the ``X-RapidAPI-Host`` header.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller forever.

    Returns:
        An 8-tuple ``(production_budget, production_budget_currency,
        lifetime_gross, lifetime_gross_currency, opening_weekend_gross,
        opening_weekend_gross_currency, worldwide_gross,
        worldwide_gross_currency)``. Any value that is absent from the
        response is ``None``; if the request fails or the body is not valid
        JSON the error is printed and all eight values are ``None``.
    """
    # Path from the ``results`` object to the dict that holds 'amount' and
    # 'currency', listed in the order of the returned tuple.
    money_paths = (
        ('productionBudget', 'budget'),
        ('lifetimeGross', 'total'),
        ('openingWeekendGross', 'gross', 'total'),
        ('worldwideGross', 'total'),
    )

    url = f"https://{api_host}/titles/{imdb_id}"
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": api_host
    }
    params = {"info": "revenue_budget"}

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"API error: {e}")
        return (None,) * (2 * len(money_paths))

    # A payload that is not a JSON object has no 'results'; treat it as empty
    # instead of letting an AttributeError escape to the caller.
    movie_data = payload.get('results') if isinstance(payload, dict) else None

    details = []
    for path in money_paths:
        details.append(get_nested_value(movie_data, [*path, 'amount']))
        details.append(get_nested_value(movie_data, [*path, 'currency']))
    return tuple(details)

# Resume right after the last row that already holds a worldwide gross value,
# or from the first row when that column has no valid entry at all.
last_filled_index = movies_df['worldwideGross'].last_valid_index()
if last_filled_index is None:
    start_index = 0
else:
    start_index = last_filled_index + 1

# Running tallies for the progress display
counter = total_counter = 0

# Most recent non-empty value (and its currency) seen for each figure
last_good_production_budget = last_good_production_budget_currency = None
last_good_lifetime_gross = last_good_lifetime_gross_currency = None
last_good_opening_weekend_gross = last_good_opening_weekend_gross_currency = None
last_good_worldwide_gross = last_good_worldwide_gross_currency = None

# Number of rows in the DataFrame (upper bound of the processing loop)
total_rows = len(movies_df)

# Loop over IMDb IDs and get the revenue and budget for each movie.
# Progress is checkpointed after every row. The checkpoint is written to a
# temporary file and then swapped in with os.replace, so interrupting the cell
# in the middle of a save cannot leave a truncated progress pickle behind.
checkpoint_tmp_path = f"{pickle_file_path}.tmp"
try:
    for index in range(start_index, total_rows):
        imdb_id = movies_df.at[index, 'movie_id']
        print(f"Processing index {index}, IMDb ID: {imdb_id}")  # Debug statement

        (production_budget, production_budget_currency, lifetime_gross, lifetime_gross_currency,
         opening_weekend_gross, opening_weekend_gross_currency, worldwide_gross, worldwide_gross_currency) = get_movie_details(imdb_id, RAPIDAPI_KEY, RAPIDAPI_HOST)

        print(f"Retrieved production budget: {production_budget} {production_budget_currency}, lifetime gross: {lifetime_gross} {lifetime_gross_currency}, opening weekend gross: {opening_weekend_gross} {opening_weekend_gross_currency}, worldwide gross: {worldwide_gross} {worldwide_gross_currency}")  # Debug statement

        # Update the DataFrame with the new values
        fetched_values = {
            'productionBudget': production_budget,
            'productionBudgetCurrency': production_budget_currency,
            'lifetimeGross': lifetime_gross,
            'lifetimeGrossCurrency': lifetime_gross_currency,
            'openingWeekendGross': opening_weekend_gross,
            'openingWeekendGrossCurrency': opening_weekend_gross_currency,
            'worldwideGross': worldwide_gross,
            'worldwideGrossCurrency': worldwide_gross_currency,
        }
        for column_name, fetched_value in fetched_values.items():
            movies_df.at[index, column_name] = fetched_value

        # Check for "good" values (neither None nor 0) and update the last_good variables
        if production_budget not in [None, 0]:
            last_good_production_budget = production_budget
            last_good_production_budget_currency = production_budget_currency
        if lifetime_gross not in [None, 0]:
            last_good_lifetime_gross = lifetime_gross
            last_good_lifetime_gross_currency = lifetime_gross_currency
        if opening_weekend_gross not in [None, 0]:
            last_good_opening_weekend_gross = opening_weekend_gross
            last_good_opening_weekend_gross_currency = opening_weekend_gross_currency
        if worldwide_gross not in [None, 0]:
            last_good_worldwide_gross = worldwide_gross
            last_good_worldwide_gross_currency = worldwide_gross_currency

        # A query counts as successful when it produced a positive worldwide
        # gross or a positive production budget
        if (worldwide_gross is not None and worldwide_gross > 0) or (production_budget is not None and production_budget > 0):
            counter += 1
        total_counter += 1

        # Display the progress and last good values
        clear_output(wait=True)
        display(f"Progress: {counter/total_counter:.2%} valuable info ({total_counter}/{total_rows})")
        display(f"Last good production budget: {last_good_production_budget} {last_good_production_budget_currency}")
        display(f"Last good lifetime gross: {last_good_lifetime_gross} {last_good_lifetime_gross_currency}")
        display(f"Last good opening weekend gross: {last_good_opening_weekend_gross} {last_good_opening_weekend_gross_currency}")
        display(f"Last good worldwide gross: {last_good_worldwide_gross} {last_good_worldwide_gross_currency}")

        # Save progress to pickle file as DataFrame: write the temp file
        # first, then atomically replace the previous checkpoint with it
        movies_df.to_pickle(checkpoint_tmp_path)
        os.replace(checkpoint_tmp_path, pickle_file_path)
        print(f"Progress saved for index {index}.")
except Exception as e:
    # Best effort: report the failure and fall through so the code after the
    # loop still runs with whatever has been collected so far.
    print(f"Error occurred: {e}")

# Write the DataFrame, including the revenue/budget columns, to its final pickle
final_pickle_path = 'movies_with_revenue_and_budget_rapidapi.pkl'
movies_df.to_pickle(final_pickle_path)
print(f"Final DataFrame saved to '{final_pickle_path}'")

# Debug output: print the first few rows of the final DataFrame
first_rows = movies_df.head()
print(first_rows)


'Progress: 32.81% valuable info (47279/193334)'

'Last good production budget: 1000000 INR'

'Last good lifetime gross: 163745 USD'

'Last good opening weekend gross: 12930 USD'

'Last good worldwide gross: 163745 USD'

Progress saved for index 81677.
Processing index 81678, IMDb ID: tt26741067


KeyboardInterrupt: 