In [20]:
import requests
import json
import pandas as pd
import os

# Load the DataFrame from the pickle file
merged_df = pd.read_pickle('last_three_years_movies_with_ratings.pkl')

# Display the first few rows to verify the DataFrame
print(merged_df.head())

# Define the API key and base URL
#replace with your api key, removed before placing on github
api_key = ''
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Utility function to get nested dictionary values
def get_nested(data, keys, default=None):
    for key in keys:
        try:
            data = data[key]
        except (KeyError, TypeError):
            return default
    return data

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids, info):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": info
    }
    try:
        response = requests.get(url, headers=headers, params=payload)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.json(), imdb_ids  # Return response JSON and the IMDb IDs
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for IMDb IDs {imdb_ids} with info {info}: {e}")
        print(f"Response status code: {response.status_code}")
        print(f"Response text: {response.text}")
        return None, imdb_ids

# Function to merge data from multiple info parameters
def merge_movie_details(imdb_ids, infos):
    all_details = {imdb_id: {} for imdb_id in imdb_ids}
    
    for info in infos:
        details, ids = fetch_bulk_movie_details(imdb_ids, info)
        if details and 'results' in details:
            if isinstance(details['results'], list):
                for idx, result in enumerate(details['results']):
                    imdb_id = ids[idx]
                    if imdb_id in all_details:
                        all_details[imdb_id].update(result)
            else:
                print(f"No valid results for info: {info}")
        else:
            print(f"No valid results for info: {info}")
            print(details)  # Display details only if it's not something we can iterate

    return all_details

# Function to flatten nested JSON
def flatten_json(json_data):
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], name + a + '_')
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            out[name[:-1]] = x

    flatten(json_data)
    return out

# Function to split a list into chunks
def chunk_list(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]

# Function to fetch and process data in bulk
def fetch_and_process_data(imdb_ids, chunk_size, num_chunks=1, progress_pickle='in_progress.pkl'):
    # Define the valid info parameters we want to fetch
    infos = ["base_info", "genres", "revenue_budget", "releaseDate", "rating", "awards"]
    
    # List to hold detailed movie data
    movie_details_list = []

    # Load progress if the pickle file exists
    if os.path.exists(progress_pickle):
        with open(progress_pickle, 'rb') as f:
            processed_chunks = pd.read_pickle(f)
            movie_details_list.extend(processed_chunks)
        print(f"Resuming from {len(processed_chunks)} processed chunks.")
    else:
        processed_chunks = []

    # Fetch data for each chunk of IMDb IDs
    for chunk_count, chunk in enumerate(chunk_list(imdb_ids, chunk_size)):
        if chunk_count < len(processed_chunks):
            continue  # Skip already processed chunks
        if chunk_count >= num_chunks:
            break
        details = merge_movie_details(chunk, infos)
        if details:
            for imdb_id, detail in details.items():
                detail['imdb_id'] = imdb_id  # Add the IMDb ID back to the detail
            flattened_data = [flatten_json(value) for value in details.values()]
            movie_details_list.extend(flattened_data)

            # Save progress to the pickle file
            with open(progress_pickle, 'wb') as f:
                pd.to_pickle(movie_details_list, f)
        print(f"Processed chunk {chunk_count + 1}/{num_chunks}")

    # Convert to DataFrame
    movie_details_df = pd.DataFrame(movie_details_list)

    return movie_details_df

# Get IMDb IDs from the DataFrame
imdb_ids = merged_df['tconst'].tolist()

# Set the chunk size and number of chunks (e.g., 25 IDs per request, and 10 chunks)
chunk_size = 25  # Adjusted to 25 based on API limits
num_chunks = 70000/25 

# Fetch and process data in bulk
movie_details_df = fetch_and_process_data(imdb_ids, chunk_size, num_chunks)

# Ensure 'imdb_id' exists in the movie_details_df before merging
if 'imdb_id' in movie_details_df.columns:
    # Merge the new movie details DataFrame with the original DataFrame
    final_df = merged_df.merge(movie_details_df, left_on='tconst', right_on='imdb_id', how='left')
else:
    final_df = merged_df

# Display the merged DataFrame
print(final_df.head())

# Save the final DataFrame to a new pickle file
final_pickle = 'last_three_years_movies_with_detailed_data.pkl'
final_df.to_pickle(final_pickle)

# Remove the in-progress pickle file upon successful completion
if os.path.exists('in_progress.pkl'):
    os.remove('in_progress.pkl')

print(f"Data saved to {final_pickle}")


      tconst titleType                       primaryTitle  \
0  tt0013274     movie        Istoriya grazhdanskoy voyny   
1  tt0070596     movie                  Socialist Realism   
2  tt0077684     movie  Histórias de Combóios em Portugal   
3  tt0096235     movie                        Taxi Killer   
4  tt0097767     movie                     Loading Ludwig   

                       originalTitle isAdult  startYear endYear  \
0        Istoriya grazhdanskoy voyny       0       2021      \N   
1             El realismo socialista       0       2023      \N   
2  Histórias de Combóios em Portugal       0       2022      \N   
3                        Taxi Killer       0       2022      \N   
4                     Loading Ludwig       0       2022      \N   

  runtimeMinutes              genres  averageRating  numVotes  
0             94         Documentary            6.6      71.0  
1             78               Drama            7.5      59.0  
2             46         Documentary  

In [19]:
dict(final_df.iloc[0])

{'tconst': 'tt0013274',
 'titleType': 'movie',
 'primaryTitle': 'Istoriya grazhdanskoy voyny',
 'originalTitle': 'Istoriya grazhdanskoy voyny',
 'isAdult': 0,
 'startYear': 2021,
 'endYear': '\\N',
 'runtimeMinutes': '94',
 'genres_x': 'Documentary',
 'averageRating': 6.6,
 'numVotes': 71.0,
 '_id': '61e58022d735dff3f9412891',
 'id': 'tt0013274',
 'ratingsSummary_aggregateRating': 6.6,
 'ratingsSummary_voteCount': 71.0,
 'ratingsSummary___typename': 'RatingsSummary',
 'episodes': None,
 'primaryImage_id': 'rm3867208705',
 'primaryImage_width': 2868.0,
 'primaryImage_height': 2154.0,
 'primaryImage_url': 'https://m.media-amazon.com/images/M/MV5BMjBhZjQ2NGMtZjUwNC00OTJmLWI0ODctYmI5OTc5ZGM5ZTY5XkEyXkFqcGdeQXVyMTQxMDQ2MDEz._V1_.jpg',
 'primaryImage_caption_plainText': 'Istoriya grazhdanskoy voyny (1922)',
 'primaryImage_caption___typename': 'Markdown',
 'primaryImage___typename': 'Image',
 'titleType_text': 'Movie',
 'titleType_id': 'movie',
 'titleType_isSeries': False,
 'titleType_isEpis

In [52]:
full_df = pd.read_pickle(final_pickle)
display(full_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68006 entries, 0 to 68005
Columns: 179 entries, tconst to ratingsSummary
dtypes: float64(53), int64(1), object(125)
memory usage: 93.4+ MB


None

In [23]:
display(full_df['_id'].value_counts())

61e58022d735dff3f9412891    1
6442da6dbc472b4044241ed1    1
6442da4c5256c30ca7241cb1    1
6442da4bbc472b4044241ca7    1
6442da505256c30ca7241cf9    1
                           ..
61e5cc4f971a4a15d653698b    1
61e5cff7e1a8ddbd96544b55    1
61e5cabcef99b1c543531039    1
61e5d5c3b597b91eb955b637    1
61e5d758d8f3c0931e561d4b    1
Name: _id, Length: 67873, dtype: int64

In [24]:
display(full_df['id'].value_counts())

tt0013274     1
tt27553457    1
tt27550692    1
tt27550705    1
tt27550984    1
             ..
tt16757716    1
tt16757854    1
tt16758012    1
tt16758212    1
tt9914972     1
Name: id, Length: 67873, dtype: int64

In [25]:
display(full_df['primaryImage_id'].value_counts())

rm2814853121    3
rm1043327489    2
rm3664921857    2
rm3715706369    1
rm3377669889    1
               ..
rm3092442881    1
rm3446466817    1
rm5893889       1
rm102426881     1
rm1307756289    1
Name: primaryImage_id, Length: 47162, dtype: int64

In [28]:
display(full_df['primaryImage___typename'].value_counts())
display(full_df['ratingsSummary___typename'].value_counts())
display(full_df['ratingsSummary___typename'].value_counts())
display(full_df['titleType_text'].value_counts())

Image    47166
Name: primaryImage___typename, dtype: int64

RatingsSummary    67871
Name: ratingsSummary___typename, dtype: int64

RatingsSummary    67871
Name: ratingsSummary___typename, dtype: int64

Movie             67228
Short               405
TV Movie            106
Video                57
TV Series            30
TV Episode           15
Video Game           12
TV Special           11
TV Mini Series        8
Music Video           1
Name: titleType_text, dtype: int64

In [30]:
display(full_df['titleType_id'].value_counts())

movie           67228
short             405
tvMovie           106
video              57
tvSeries           30
tvEpisode          15
videoGame          12
tvSpecial          11
tvMiniSeries        8
musicVideo          1
Name: titleType_id, dtype: int64

In [49]:
display(full_df['runtime_displayableProperty_value_plainText'].value_counts())

1h 30m     727
1h         398
1h 20m     342
1h 15m     293
1h 40m     275
          ... 
4h 45m       1
5h 10m       1
4h 30m       1
11h 20m      1
6h 57m       1
Name: runtime_displayableProperty_value_plainText, Length: 263, dtype: int64

In [54]:
to_keep_cols = {
    'tconst': 'movie_id',
    'titleType': 'title_type',
    'primaryTitle': 'primary_title',
    'originalTitle': 'original_title',
    'isAdult': 'is_adult',
    'startYear': 'start_year', 
    'endYear': 'end_year',
    'runtimeMinutes': 'runtime_minutes',
    'genres_x': 'genres',
    'averageRating': 'average_rating',
    'numVotes': 'num_votes',
    'ratingsSummary_aggregateRating': 'md_average_rating',
    'ratingsSummary_voteCount': 'md_num_votes',
    'primaryImage_url': 'image_url',
    'primaryImage_caption_plainText': 'image_caption',
    'titleType_text': 'md_title_type',
    'genres_genres_0_text': 'md_genres_0',
    'releaseDate_day': 'start_day',
    'releaseDate_month': 'start_month',
    'runtime_seconds': 'runtime_seconds',
    'wins_total': 'wins_total',
    'nominations_total': 'nominations_total',
    'genres_genres_1_text': 'md_genres_1',
    'productionBudget_budget_amount': 'budget_amount',
    'productionBudget_budget_currency': 'budget_currency',
    'lifetimeGross_total_amount': 'lifetime_gross',
    'openingWeekendGross_gross_total_amount': 'opening_weekend_gross',
    'worldwideGross_total_amount': 'worldwide_gross'
}

filtered_df = full_df[to_keep_cols.keys()]
filtered_df.rename(columns=to_keep_cols, inplace=True)
filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.rename(columns=to_keep_cols, inplace=True)


Unnamed: 0,movie_id,title_type,primary_title,original_title,is_adult,start_year,end_year,runtime_minutes,genres,average_rating,...,start_month,runtime_seconds,wins_total,nominations_total,md_genres_1,budget_amount,budget_currency,lifetime_gross,opening_weekend_gross,worldwide_gross
0,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,\N,94,Documentary,6.6,...,11.0,7980.0,0.0,0.0,,,,,,
1,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,\N,78,Drama,7.5,...,4.0,16200.0,0.0,0.0,,,,,,
2,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,\N,46,Documentary,,...,,1740.0,0.0,0.0,Short,,,,,
3,tt0096235,movie,Taxi Killer,Taxi Killer,0,2022,\N,106,"Action,Crime,Drama",5.6,...,,,0.0,0.0,Crime,,,,,
4,tt0097767,movie,Loading Ludwig,Loading Ludwig,0,2022,\N,65,\N,7.0,...,,3900.0,0.0,0.0,,,,,,


In [55]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68006 entries, 0 to 68005
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   movie_id               68006 non-null  object 
 1   title_type             68006 non-null  object 
 2   primary_title          68006 non-null  object 
 3   original_title         68006 non-null  object 
 4   is_adult               68006 non-null  object 
 5   start_year             68006 non-null  int64  
 6   end_year               68006 non-null  object 
 7   runtime_minutes        68006 non-null  object 
 8   genres                 68006 non-null  object 
 9   average_rating         32777 non-null  float64
 10  num_votes              32777 non-null  float64
 11  md_average_rating      32854 non-null  float64
 12  md_num_votes           67871 non-null  float64
 13  image_url              47166 non-null  object 
 14  image_caption          47166 non-null  object 
 15  md

In [56]:
filtered_df.to_pickle('final_filtered_last_three_years_data.pkl')