In [2]:
import pandas as pd
import requests
import os
from IPython.display import clear_output, display

# Replace with your TMDb API key
#replace with your api key, removed before placing on github
API_KEY = ""

# Load existing data
pickle_file_path = 'movie_ids_with_nan.pkl'
if os.path.exists(pickle_file_path):
    data = pd.read_pickle(pickle_file_path)
    print(f"Loaded data type: {type(data)}")  # Debug statement to show the type of the loaded data
    display(data)
    if isinstance(data, pd.Series):
        print("Data is a Series, converting to DataFrame.")  # Debug statement
        movies_df = pd.DataFrame(data, columns=['movie_id']).reset_index(drop=True)
        movies_df['worldwide_gross'] = pd.Series([None] * len(movies_df), dtype='float64')
        movies_df['budget'] = pd.Series([None] * len(movies_df), dtype='float64')
    elif isinstance(data, pd.DataFrame):
        print("Data is a DataFrame.")  # Debug statement
        movies_df = data.reset_index(drop=True)
    else:
        raise ValueError(f"The loaded object is neither a Series nor a DataFrame. Type: {type(data)}")
else:
    raise FileNotFoundError(f"The file {pickle_file_path} does not exist.")

#these ids causes a 404 with the api!
bad_movie_ids = [888161]
# Function to get movie details from TMDb
def get_movie_details(imdb_id, api_key):
    try:
        url = f"https://api.themoviedb.org/3/find/{imdb_id}?api_key={api_key}&external_source=imdb_id"
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        if data['movie_results']:
            movie_id = data['movie_results'][0]['id']
            if movie_id in bad_movie_ids:
                return None,None
            movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
            movie_response = requests.get(movie_url)
            movie_response.raise_for_status()
            movie_data = movie_response.json()
            
            revenue = movie_data.get('revenue', None)
            budget = movie_data.get('budget', None)
            return revenue, budget
        else:
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"API error: {e}")
        raise

# Initialize columns if they don't exist
if 'worldwide_gross' not in movies_df.columns:
    movies_df['worldwide_gross'] = pd.Series([None] * len(movies_df), dtype='float64')
if 'budget' not in movies_df.columns:
    movies_df['budget'] = pd.Series([None] * len(movies_df), dtype='float64')

# Keep track of the index and counter
start_index = movies_df['worldwide_gross'].last_valid_index() + 1 if movies_df['worldwide_gross'].last_valid_index() is not None else 0
counter = 0
total_counter = 0

# Total number of rows to process
total_rows = len(movies_df) - start_index

# Initialize variables to store last valid values
last_valid_revenue = None
last_valid_budget = None

# Loop over IMDb IDs and get the revenue and budget for each movie
for index in range(start_index, len(movies_df)):
    imdb_id = movies_df.at[index, 'movie_id']
    print(f"Processing index {index}, IMDb ID: {imdb_id}")  # Debug statement
    if not imdb_id.startswith('tt'):
        print(f"imdb_id {imdb_id} seems invalid")
        continue
    
    revenue, budget = get_movie_details(imdb_id, API_KEY)
    print(f"Retrieved revenue: {revenue}, budget: {budget}")  # Debug statement
    
    movies_df.at[index, 'worldwide_gross'] = revenue
    movies_df.at[index, 'budget'] = budget
    
    # Update the counter for successful queries
    if (revenue is not None and revenue > 0):
        last_valid_revenue = revenue
    if (budget is not None and budget > 0):
        last_valid_budget = budget
    if (revenue is not None and revenue > 0) or (budget is not None and budget > 0):
        counter += 1
    total_counter += 1

    # Display the progress
    clear_output(wait=True)
    display(f"Progress: {counter/total_counter:.2%} valuable info ({total_counter}/{total_rows})")
    display(f"Last valid revenue: {last_valid_revenue}")
    display(f"Last valid budget: {last_valid_budget}")

    # Save progress to pickle file as DataFrame
    movies_df.to_pickle(pickle_file_path)
    print(f"Progress saved for index {index}.")

# Save the final DataFrame with additional columns
final_pickle_path = 'movies_with_revenue_and_budget.pkl'
movies_df.to_pickle(final_pickle_path)
print(f"Final DataFrame saved to '{final_pickle_path}'")

# Display the final DataFrame
print(movies_df.head())  # Debug statement to display the first few rows of the final DataFrame



'Progress: 7.12% valuable info (34866/34866)'

'Last valid revenue: 124375'

'Last valid budget: 3500'

Progress saved for index 193333.
Final DataFrame saved to 'movies_with_revenue_and_budget.pkl'
    movie_id  worldwide_gross      budget
0  tt0011801              0.0         0.0
1  tt0013274              0.0         0.0
2  tt0062336              0.0         0.0
3  tt0069049              0.0  12000000.0
4  tt0070596              0.0         0.0
