In [20]:
import requests
import json
import pandas as pd
import os

# Load the DataFrame from the pickle file.
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
merged_df = pd.read_pickle('last_three_years_movies_with_ratings.pkl')

# Display the first few rows to verify the DataFrame
print(merged_df.head())

# Read the API key from the file
with open('rapidapi.key', 'r') as file:
    api_key = file.read().strip()

# Verify the key was read WITHOUT echoing it: anything printed here is stored
# in the notebook's saved output and would leak the secret when shared.
if not api_key:
    raise ValueError("rapidapi.key is empty; expected a RapidAPI key")
print("API key loaded")

base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers sent with every request to the API
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Utility function to get nested dictionary values
def get_nested(data, keys, default=None):
    """Follow ``keys`` one level at a time into nested containers.

    Args:
        data: The outermost container (typically a dict or list).
        keys: Iterable of keys/indices applied in order, e.g. ``['a', 0, 'b']``.
        default: Value returned when any step of the path cannot be resolved.

    Returns:
        The value found at the end of the path, or ``default`` if a key is
        missing, a list index is out of range, or an intermediate value does
        not support subscripting (for example ``None``).
    """
    current = data
    for key in keys:
        try:
            current = current[key]
        # LookupError covers both KeyError (dicts) and IndexError (lists), so
        # an out-of-range list index falls back to the default as well.
        except (LookupError, TypeError):
            return default
    return current

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids, info, timeout=30):
    """Request one ``info`` category for a batch of IMDb IDs.

    Args:
        imdb_ids: List of IMDb ID strings; they are comma-joined into the
            ``idsList`` query parameter.
        info: Value sent as the ``info`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A ``(payload, imdb_ids)`` tuple. ``payload`` is the decoded JSON body,
        or ``None`` when the request failed or the body was not valid JSON.
        ``imdb_ids`` is passed back unchanged.
    """
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": info
    }
    # Stays None when the request never produced a response (DNS failure,
    # refused connection, timeout); the error handler must not touch it then.
    response = None
    try:
        response = requests.get(url, headers=headers, params=payload, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.json(), imdb_ids  # Return response JSON and the IMDb IDs
    # ValueError is included because older requests releases raise a plain
    # ValueError subclass from .json() when the body is not valid JSON.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching data for IMDb IDs {imdb_ids} with info {info}: {e}")
        if response is not None:
            print(f"Response status code: {response.status_code}")
            print(f"Response text: {response.text}")
        return None, imdb_ids

# Function to merge data from multiple info parameters
def merge_movie_details(imdb_ids, infos):
    """Fetch every ``info`` category for a batch of IDs and merge per movie.

    Args:
        imdb_ids: List of IMDb ID strings to look up.
        infos: Iterable of ``info`` values; one request is made per value.

    Returns:
        Dict mapping each requested IMDb ID to the union of all result fields
        received for it. IDs with no results map to an empty dict.
    """
    all_details = {imdb_id: {} for imdb_id in imdb_ids}

    for info in infos:
        details, ids = fetch_bulk_movie_details(imdb_ids, info)
        if not (details and 'results' in details):
            print(f"No valid results for info: {info}")
            print(details)  # Display details only if it's not something we can iterate
            continue

        results = details['results']
        if not isinstance(results, list):
            print(f"No valid results for info: {info}")
            continue

        for idx, result in enumerate(results):
            if not isinstance(result, dict):
                continue  # nothing mergeable in a non-dict entry
            # Prefer the ID carried by the result itself: matching by list
            # position attaches data to the wrong movie as soon as the API
            # omits or reorders a title.
            imdb_id = result.get('id')
            if imdb_id is None and idx < len(ids):
                # No ID in the payload; fall back to the request order.
                imdb_id = ids[idx]
            if imdb_id in all_details:
                all_details[imdb_id].update(result)

    return all_details

# Function to flatten nested JSON
def flatten_json(json_data):
    """Collapse nested dicts/lists into a single-level dict.

    Each leaf value is stored under a key built from the path that leads to
    it: dict keys and list positions joined with underscores (for example
    ``{'a': {'b': [7]}}`` becomes ``{'a_b_0': 7}``). Empty dicts and lists
    contribute no entries, and a bare scalar input is stored under ``''``.
    """

    def walk(node, prefix):
        # Yield (flattened_key, leaf_value) pairs in depth-first order.
        if isinstance(node, dict):
            for key, child in node.items():
                yield from walk(child, prefix + key + '_')
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from walk(child, prefix + str(position) + '_')
        else:
            # Drop the trailing underscore added by the parent level.
            yield prefix[:-1], node

    return dict(walk(json_data, ''))

# Function to split a list into chunks
def chunk_list(data, chunk_size):
    """Lazily yield consecutive slices of ``data``, each at most ``chunk_size`` long.

    The final slice is shorter when ``len(data)`` is not a multiple of
    ``chunk_size``; an empty ``data`` yields nothing.
    """
    starts = range(0, len(data), chunk_size)
    yield from (data[start:start + chunk_size] for start in starts)

# Function to fetch and process data in bulk
def fetch_and_process_data(imdb_ids, chunk_size, num_chunks=1, progress_pickle='in_progress.pkl'):
    """Fetch details for IMDb IDs in chunks, checkpointing after each chunk.

    Args:
        imdb_ids: List of IMDb ID strings to look up.
        chunk_size: Number of IDs sent per request batch.
        num_chunks: Only chunks with index below this value are considered.
        progress_pickle: Path of the checkpoint file. It holds the flat list
            of flattened records gathered so far and is rewritten after every
            fetched chunk.

    Returns:
        A DataFrame with one row per fetched movie. Every row has an
        ``imdb_id`` column holding the ID the row was requested for.
    """
    # Define the valid info parameters we want to fetch
    infos = ["base_info", "genres", "revenue_budget", "releaseDate", "rating", "awards"]

    # List to hold detailed movie data (one flattened dict per movie)
    movie_details_list = []

    # Load progress if the pickle file exists.
    # NOTE(security): unpickling can execute arbitrary code; only point this
    # at checkpoint files written by this function.
    if os.path.exists(progress_pickle):
        movie_details_list.extend(pd.read_pickle(progress_pickle))
        print(f"Resuming with {len(movie_details_list)} previously fetched records.")

    # The checkpoint stores records, not chunks, so resume by ID. Comparing
    # the record count against the chunk index would skip chunks that were
    # never fetched.
    processed_ids = {
        record['imdb_id'] for record in movie_details_list if 'imdb_id' in record
    }

    # Fetch data for each chunk of IMDb IDs
    for chunk_count, chunk in enumerate(chunk_list(imdb_ids, chunk_size)):
        if chunk_count >= num_chunks:
            break
        # Only request IDs that are not already in the checkpoint; this also
        # stays correct if chunk_size differs from the interrupted run.
        pending = [imdb_id for imdb_id in chunk if imdb_id not in processed_ids]
        if not pending:
            continue  # Skip already processed chunks
        details = merge_movie_details(pending, infos)
        if details:
            for imdb_id, detail in details.items():
                detail['imdb_id'] = imdb_id  # Add the IMDb ID back to the detail
            movie_details_list.extend(flatten_json(detail) for detail in details.values())
            processed_ids.update(details)

            # Save progress to the pickle file
            pd.to_pickle(movie_details_list, progress_pickle)
        print(f"Processed chunk {chunk_count + 1}/{num_chunks}")

    # Convert to DataFrame
    return pd.DataFrame(movie_details_list)

# Get IMDb IDs from the DataFrame
imdb_ids = merged_df['tconst'].tolist()

# Batch settings. chunk_size is the number of IDs per request (25 based on API
# limits); max_ids caps how many IDs are looked up in one run.
chunk_size = 25
max_ids = 70000
progress_pickle = 'in_progress.pkl'

# Whole number of chunks: enough to cover every ID (rounded up), but never
# more than the cap allows. Integer arithmetic keeps num_chunks an int.
chunks_needed = (len(imdb_ids) + chunk_size - 1) // chunk_size
num_chunks = min(chunks_needed, max_ids // chunk_size)

# Fetch and process data in bulk
movie_details_df = fetch_and_process_data(imdb_ids, chunk_size, num_chunks, progress_pickle)

# Ensure 'imdb_id' exists in the movie_details_df before merging
if 'imdb_id' in movie_details_df.columns:
    # Merge the new movie details DataFrame with the original DataFrame
    final_df = merged_df.merge(movie_details_df, left_on='tconst', right_on='imdb_id', how='left')
else:
    # Nothing was fetched, so there is nothing to join on
    final_df = merged_df

# Display the merged DataFrame
print(final_df.head())

# Save the final DataFrame to a new pickle file
final_pickle = 'last_three_years_movies_with_detailed_data.pkl'
final_df.to_pickle(final_pickle)

# Remove the in-progress pickle file upon successful completion
if os.path.exists(progress_pickle):
    os.remove(progress_pickle)

print(f"Data saved to {final_pickle}")


      tconst titleType                       primaryTitle  \
0  tt0013274     movie        Istoriya grazhdanskoy voyny   
1  tt0070596     movie                  Socialist Realism   
2  tt0077684     movie  Histórias de Combóios em Portugal   
3  tt0096235     movie                        Taxi Killer   
4  tt0097767     movie                     Loading Ludwig   

                       originalTitle isAdult  startYear endYear  \
0        Istoriya grazhdanskoy voyny       0       2021      \N   
1             El realismo socialista       0       2023      \N   
2  Histórias de Combóios em Portugal       0       2022      \N   
3                        Taxi Killer       0       2022      \N   
4                     Loading Ludwig       0       2022      \N   

  runtimeMinutes              genres  averageRating  numVotes  
0             94         Documentary            6.6      71.0  
1             78               Drama            7.5      59.0  
2             46         Documentary  

In [19]:
# Show the first row of final_df as a column-name -> value dict, which lists
# every column in the merged frame without the truncation of a table view.
dict(final_df.iloc[0])

{'tconst': 'tt0013274',
 'titleType': 'movie',
 'primaryTitle': 'Istoriya grazhdanskoy voyny',
 'originalTitle': 'Istoriya grazhdanskoy voyny',
 'isAdult': 0,
 'startYear': 2021,
 'endYear': '\\N',
 'runtimeMinutes': '94',
 'genres_x': 'Documentary',
 'averageRating': 6.6,
 'numVotes': 71.0,
 '_id': '61e58022d735dff3f9412891',
 'id': 'tt0013274',
 'ratingsSummary_aggregateRating': 6.6,
 'ratingsSummary_voteCount': 71.0,
 'ratingsSummary___typename': 'RatingsSummary',
 'episodes': None,
 'primaryImage_id': 'rm3867208705',
 'primaryImage_width': 2868.0,
 'primaryImage_height': 2154.0,
 'primaryImage_url': 'https://m.media-amazon.com/images/M/MV5BMjBhZjQ2NGMtZjUwNC00OTJmLWI0ODctYmI5OTc5ZGM5ZTY5XkEyXkFqcGdeQXVyMTQxMDQ2MDEz._V1_.jpg',
 'primaryImage_caption_plainText': 'Istoriya grazhdanskoy voyny (1922)',
 'primaryImage_caption___typename': 'Markdown',
 'primaryImage___typename': 'Image',
 'titleType_text': 'Movie',
 'titleType_id': 'movie',
 'titleType_isSeries': False,
 'titleType_isEpis