In [19]:
import requests
import json
import pandas as pd

# Load the DataFrame from the pickle file
merged_df = pd.read_pickle('last_three_years_movies_with_ratings.pkl')

# Display the first few rows to verify the DataFrame
print(merged_df.head())

# Define the API key and base URL
#replace with your api key, removed before adding to github
api_key = ''
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Utility function to get nested dictionary values
def get_nested(data, keys, default=None):
    for key in keys:
        try:
            data = data[key]
        except (KeyError, TypeError):
            return default
    return data

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": "revenue_budget,genre,runtime,awards,releaseDate,rating"
    }
    response = requests.get(url, headers=headers, params=payload)
    if response.status_code == 200:
        return response.json()  # Parse the JSON response into a Python object
    else:
        print(f"Error fetching data for IMDb IDs {imdb_ids}: {response.status_code}")
        return None

# Function to extract relevant fields from the JSON response
def extract_movie_data(movie_json):
    if not movie_json:
        return None
    movie_data = movie_json.get('results', {})
    budget = get_nested(movie_data, ['productionBudget', 'budget', 'amount'])
    lifetime_gross = get_nested(movie_data, ['lifetimeGross', 'total', 'amount'])
    opening_weekend_gross = get_nested(movie_data, ['openingWeekendGross', 'gross', 'total', 'amount'])
    worldwide_gross = get_nested(movie_data, ['worldwideGross', 'total', 'amount'])
    genres = get_nested(movie_data, ['genres', 'genre'], [])
    runtime = get_nested(movie_data, ['runtime', 'runtimeMinutes'])
    awards = movie_data.get('awards', {}) if 'awards' in movie_data else {}
    release_date = get_nested(movie_data, ['releaseDate'], {})
    mdb_rating = get_nested(movie_data, ['ratingsSummary', 'aggregateRating'])
    mdb_votecount = get_nested(movie_data, ['ratingsSummary', 'voteCount'])
    
    return {
        'imdb_id': movie_data.get('id'),
        'budget': budget,
        'lifetime_gross': lifetime_gross,
        'opening_weekend_gross': opening_weekend_gross,
        'worldwide_gross': worldwide_gross,
        'genres': genres,
        'runtime': runtime,
        'awards': awards,
        'release_date': release_date,
        'mdb_rating': mdb_rating,
        'mdb_votecount': mdb_votecount
    }

# Define a function to split a list into chunks
def chunk_list(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]

# Function to fetch and process data in bulk
def fetch_and_process_data(imdb_ids, chunk_size, num_chunks):
    # List to hold detailed movie data
    movie_details_list = []

    # Fetch data for each chunk of IMDb IDs
    for chunk_count, chunk in enumerate(chunk_list(imdb_ids, chunk_size)):
        if chunk_count >= num_chunks:
            break
        details = fetch_bulk_movie_details(chunk)
        if details and isinstance(details.get('results'), list):
            for movie in details['results']:
                if movie:  # Ensure movie is not None
                    movie_details_list.append(movie)
        else:
            print(f"Skipping chunk {chunk_count} due to errors or no results.")

    # Extract data for all fetched movies
    extracted_data = [extract_movie_data(movie) for movie in movie_details_list if movie]

    # Convert to DataFrame
    movie_details_df = pd.DataFrame(extracted_data)

    return movie_details_df

# Get IMDb IDs from the DataFrame
imdb_ids = merged_df['tconst'].tolist()

# Set the chunk size and number of chunks (e.g., 50 IDs per request, and 10 chunks)
chunk_size = 50
num_chunks = 10

# Fetch and process data in bulk
movie_details_df = fetch_and_process_data(imdb_ids, chunk_size, num_chunks)

# Display the first few rows to check the data
print(movie_details_df.head())

# Check if 'awards' column exists before printing
if 'awards' in movie_details_df.columns:
    print(movie_details_df['awards'].head())
else:
    print("No 'awards' column found.")

# Merge the new movie details DataFrame with the original DataFrame
final_df = merged_df.merge(movie_details_df, left_on='tconst', right_on='imdb_id', how='left')

# Display the merged DataFrame
print(final_df.head())

# Save the final DataFrame to a new pickle file
final_df.to_pickle('last_three_years_movies_with_detailed_data.pkl')


      tconst titleType                       primaryTitle  \
0  tt0013274     movie        Istoriya grazhdanskoy voyny   
1  tt0070596     movie                  Socialist Realism   
2  tt0077684     movie  Histórias de Combóios em Portugal   
3  tt0096235     movie                        Taxi Killer   
4  tt0097767     movie                     Loading Ludwig   

                       originalTitle isAdult  startYear endYear  \
0        Istoriya grazhdanskoy voyny       0       2021      \N   
1             El realismo socialista       0       2023      \N   
2  Histórias de Combóios em Portugal       0       2022      \N   
3                        Taxi Killer       0       2022      \N   
4                     Loading Ludwig       0       2022      \N   

  runtimeMinutes              genres  averageRating  numVotes  
0             94         Documentary            6.6      71.0  
1             78               Drama            7.5      59.0  
2             46         Documentary  

KeyError: 'imdb_id'

In [20]:
import requests

# Define the API key and base URL
api_key = '41e53a088amsh7d462d76ae468fep15f66djsnfbddd2d7475a'
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Sample IMDb IDs
imdb_ids = ['tt0013274', 'tt0070596']  # Small sample for debugging

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": "revenue_budget,genre,runtime,awards,releaseDate,rating"
    }
    response = requests.get(url, headers=headers, params=payload)
    if response.status_code == 200:
        return response.json()  # Parse the JSON response into a Python object
    else:
        print(f"Error fetching data for IMDb IDs {imdb_ids}: {response.status_code}")
        return None

# Fetch data for the sample IMDb IDs
details = fetch_bulk_movie_details(imdb_ids)

# Display the response for inspection
print(json.dumps(details, indent=4))


{
    "error": "Wrong info query parameter: revenue_budget,genre,runtime,awards,releaseDate,rating",
    "stack": "\ud83e\udd5e",
    "results": null
}


In [22]:
import requests
import json

# Define the API key and base URL
api_key = '41e53a088amsh7d462d76ae468fep15f66djsnfbddd2d7475a'
base_url = 'https://moviesdatabase.p.rapidapi.com/title/utils/titleType'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Fetch valid info parameters
response = requests.get(base_url, headers=headers)
valid_info_params = response.json()

# Display valid info parameters for inspection
print(json.dumps(valid_info_params, indent=4))


{
    "message": "Endpoint '/title/utils/titleType' does not exist"
}


In [38]:
import requests
import json

# Define the API key and base URL
api_key = '41e53a088amsh7d462d76ae468fep15f66djsnfbddd2d7475a'
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Sample IMDb IDs
imdb_ids = ['tt0013274', 'tt0070596']  # Small sample for debugging

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": "awards"
    }
    response = requests.get(url, headers=headers, params=payload)
    if response.status_code == 200:
        return response.json()  # Parse the JSON response into a Python object
    else:
        print(f"Error fetching data for IMDb IDs {imdb_ids}: {response.status_code}")
        return None

# Fetch data for the sample IMDb IDs
details = fetch_bulk_movie_details(imdb_ids)

# Display the response for inspection
print(json.dumps(details, indent=4))


{
    "entries": 2,
    "results": [
        {
            "_id": "61e58022d735dff3f9412891",
            "id": "tt0013274",
            "wins": {
                "total": 0,
                "__typename": "AwardNominationConnection"
            },
            "nominations": {
                "total": 0,
                "__typename": "AwardNominationConnection"
            },
            "prestigiousAwardSummary": null
        },
        {
            "_id": "61e580eda66e596423414ce1",
            "id": "tt0070596",
            "wins": {
                "total": 0,
                "__typename": "AwardNominationConnection"
            },
            "nominations": {
                "total": 0,
                "__typename": "AwardNominationConnection"
            },
            "prestigiousAwardSummary": null
        }
    ]
}


In [51]:
import requests
import json
import pandas as pd

# Define the API key and base URL
api_key = '41e53a088amsh7d462d76ae468fep15f66djsnfbddd2d7475a'
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids, info):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": info
    }
    response = requests.get(url, headers=headers, params=payload)
    if response.status_code == 200:
        return response.json()  # Parse the JSON response into a Python object
    else:
        print(f"Error fetching data for IMDb IDs {imdb_ids} with info {info}: {response.status_code}")
        return None

# Function to merge data from multiple info parameters
def merge_movie_details(imdb_ids, infos):
    all_details = {imdb_id: {} for imdb_id in imdb_ids}
    
    for info in infos:
        details = fetch_bulk_movie_details(imdb_ids, info)
        if details and 'results' in details:
            for result in details['results']:
                imdb_id = result['id']
                if imdb_id in all_details:
                    all_details[imdb_id].update(result)
    
    return all_details

# Sample IMDb IDs
imdb_ids = ['tt0013274', 'tt0070596']  # Small sample for debugging

# Define the valid info parameters we want to fetch
infos = ["base_info", "genres", "revenue_budget", "releaseDate", "rating", "awards"]

# Fetch and merge data for the sample IMDb IDs
merged_details = merge_movie_details(imdb_ids, infos)

# Convert the merged results to a DataFrame
merged_data = [value for value in merged_details.values()]
movie_details_df = pd.DataFrame(merged_data)

# Display the merged DataFrame
print(movie_details_df)

# Save the final DataFrame to a new pickle file
movie_details_df.to_pickle('sample_movies_with_detailed_data.pkl')

# Display the first few rows to verify the DataFrame
print(movie_details_df.head())

                        _id         id  \
0  61e58022d735dff3f9412891  tt0013274   
1  61e580eda66e596423414ce1  tt0070596   

                                      ratingsSummary episodes  \
0  {'aggregateRating': 6.6, 'voteCount': 71, '__t...     None   
1  {'aggregateRating': 7.5, 'voteCount': 59, '__t...     None   

                                        primaryImage  \
0  {'id': 'rm3867208705', 'width': 2868, 'height'...   
1  {'id': 'rm2513434113', 'width': 1233, 'height'...   

                                           titleType  \
0  {'text': 'Movie', 'id': 'movie', 'isSeries': F...   
1  {'text': 'Movie', 'id': 'movie', 'isSeries': F...   

                                              genres  \
0  {'genres': [{'text': 'Documentary', 'id': 'Doc...   
1  {'genres': [{'text': 'Drama', 'id': 'Drama', '...   

                                           titleText  \
0  {'text': 'Istoriya grazhdanskoy voyny', '__typ...   
1  {'text': 'El realismo socialista', '__typename...   

 

In [54]:
dict(movie_details_df.iloc[0])

{'_id': '61e58022d735dff3f9412891',
 'id': 'tt0013274',
 'ratingsSummary': {'aggregateRating': 6.6,
  'voteCount': 71,
  '__typename': 'RatingsSummary'},
 'episodes': None,
 'primaryImage': {'id': 'rm3867208705',
  'width': 2868,
  'height': 2154,
  'url': 'https://m.media-amazon.com/images/M/MV5BMjBhZjQ2NGMtZjUwNC00OTJmLWI0ODctYmI5OTc5ZGM5ZTY5XkEyXkFqcGdeQXVyMTQxMDQ2MDEz._V1_.jpg',
  'caption': {'plainText': 'Istoriya grazhdanskoy voyny (1922)',
   '__typename': 'Markdown'},
  '__typename': 'Image'},
 'titleType': {'text': 'Movie',
  'id': 'movie',
  'isSeries': False,
  'isEpisode': False,
  '__typename': 'TitleType'},
 'genres': {'genres': [{'text': 'Documentary',
    'id': 'Documentary',
    '__typename': 'Genre'}],
  '__typename': 'Genres'},
 'titleText': {'text': 'Istoriya grazhdanskoy voyny',
  '__typename': 'TitleText'},
 'originalTitleText': {'text': 'Istoriya grazhdanskoy voyny',
  '__typename': 'TitleText'},
 'releaseYear': {'year': 1922, 'endYear': None, '__typename': 'Year

In [55]:
import requests
import json
import pandas as pd

# Define the API key and base URL
api_key = '41e53a088amsh7d462d76ae468fep15f66djsnfbddd2d7475a'
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids, info):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": info
    }
    response = requests.get(url, headers=headers, params=payload)
    if response.status_code == 200:
        return response.json()  # Parse the JSON response into a Python object
    else:
        print(f"Error fetching data for IMDb IDs {imdb_ids} with info {info}: {response.status_code}")
        return None

# Function to merge data from multiple info parameters
def merge_movie_details(imdb_ids, infos):
    all_details = {imdb_id: {} for imdb_id in imdb_ids}
    
    for info in infos:
        details = fetch_bulk_movie_details(imdb_ids, info)
        if details and 'results' in details:
            for result in details['results']:
                imdb_id = result['id']
                if imdb_id in all_details:
                    all_details[imdb_id].update(result)
    
    return all_details

# Function to flatten nested JSON
def flatten_json(json_data):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            out[name[:-1]] = x

    flatten(json_data)
    return out

# Sample IMDb IDs
imdb_ids = ['tt0013274', 'tt0070596']  # Small sample for debugging

# Define the valid info parameters we want to fetch
infos = ["base_info", "genres", "revenue_budget", "releaseDate", "rating", "awards"]

# Fetch and merge data for the sample IMDb IDs
merged_details = merge_movie_details(imdb_ids, infos)

# Flatten the merged details
flattened_data = [flatten_json(value) for value in merged_details.values()]
movie_details_df = pd.DataFrame(flattened_data)

# Display the merged DataFrame
print(movie_details_df)

# Save the final DataFrame to a new pickle file
movie_details_df.to_pickle('sample_movies_with_detailed_data.pkl')

# Display the first few rows to verify the DataFrame
print(movie_details_df.head())


                        _id         id  ratingsSummary_aggregateRating  \
0  61e58022d735dff3f9412891  tt0013274                             6.6   
1  61e580eda66e596423414ce1  tt0070596                             7.5   

   ratingsSummary_voteCount ratingsSummary___typename episodes  \
0                        71            RatingsSummary     None   
1                        59            RatingsSummary     None   

  primaryImage_id  primaryImage_width  primaryImage_height  \
0    rm3867208705                2868                 2154   
1    rm2513434113                1233                  871   

                                    primaryImage_url  ... wins_total  \
0  https://m.media-amazon.com/images/M/MV5BMjBhZj...  ...          0   
1  https://m.media-amazon.com/images/M/MV5BYTQ4Mz...  ...          0   

             wins___typename nominations_total     nominations___typename  \
0  AwardNominationConnection                 0  AwardNominationConnection   
1  AwardNominationCo

In [57]:
import requests
import json
import pandas as pd

# Define the API key and base URL
api_key = '41e53a088amsh7d462d76ae468fep15f66djsnfbddd2d7475a'
base_url = 'https://moviesdatabase.p.rapidapi.com/titles/x/titles-by-ids'

# Define headers
headers = {
    'x-rapidapi-key': api_key,
    'x-rapidapi-host': 'moviesdatabase.p.rapidapi.com'
}

# Function to fetch detailed movie data for multiple IMDb IDs
def fetch_bulk_movie_details(imdb_ids, info):
    url = base_url
    payload = {
        "idsList": ",".join(imdb_ids),
        "info": info
    }
    try:
        response = requests.get(url, headers=headers, params=payload)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.json()  # Parse the JSON response into a Python object
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for IMDb IDs {imdb_ids} with info {info}: {e}")
        return None

# Function to merge data from multiple info parameters
def merge_movie_details(imdb_ids, infos):
    all_details = {imdb_id: {} for imdb_id in imdb_ids}
    
    for info in infos:
        details = fetch_bulk_movie_details(imdb_ids, info)
        if details and isinstance(details.get('results'), list):
            for result in details['results']:
                imdb_id = result.get('id')
                if imdb_id and imdb_id in all_details:
                    all_details[imdb_id].update(result)
        else:
            print(f"No valid results for info: {info}")

    return all_details

# Function to flatten nested JSON
def flatten_json(json_data):
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], name + a + '_')
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            out[name[:-1]] = x

    flatten(json_data)
    return out

# Sample IMDb IDs
imdb_ids = ['tt0013274', 'tt0070596']  # Small sample for debugging

# Define the valid info parameters we want to fetch
infos = ["base_info", "genres", "revenue_budget", "releaseDate", "rating", "awards"]

# Fetch and merge data for the sample IMDb IDs
merged_details = merge_movie_details(imdb_ids, infos)

# Flatten the merged details
flattened_data = [flatten_json(value) for value in merged_details.values()]
movie_details_df = pd.DataFrame(flattened_data)

# Display the merged DataFrame
print(movie_details_df)

# Save the final DataFrame to a new pickle file
movie_details_df.to_pickle('sample_movies_with_detailed_data.pkl')

# Display the first few rows to verify the DataFrame
print(movie_details_df.head())


                        _id         id  ratingsSummary_aggregateRating  \
0  61e58022d735dff3f9412891  tt0013274                             6.6   
1  61e580eda66e596423414ce1  tt0070596                             7.5   

   ratingsSummary_voteCount ratingsSummary___typename episodes  \
0                        71            RatingsSummary     None   
1                        59            RatingsSummary     None   

  primaryImage_id  primaryImage_width  primaryImage_height  \
0    rm3867208705                2868                 2154   
1    rm2513434113                1233                  871   

                                    primaryImage_url  ... wins_total  \
0  https://m.media-amazon.com/images/M/MV5BMjBhZj...  ...          0   
1  https://m.media-amazon.com/images/M/MV5BYTQ4Mz...  ...          0   

             wins___typename nominations_total     nominations___typename  \
0  AwardNominationConnection                 0  AwardNominationConnection   
1  AwardNominationCo

In [58]:
dict(movie_details_df.iloc[0])

{'_id': '61e58022d735dff3f9412891',
 'id': 'tt0013274',
 'ratingsSummary_aggregateRating': 6.6,
 'ratingsSummary_voteCount': 71,
 'ratingsSummary___typename': 'RatingsSummary',
 'episodes': None,
 'primaryImage_id': 'rm3867208705',
 'primaryImage_width': 2868,
 'primaryImage_height': 2154,
 'primaryImage_url': 'https://m.media-amazon.com/images/M/MV5BMjBhZjQ2NGMtZjUwNC00OTJmLWI0ODctYmI5OTc5ZGM5ZTY5XkEyXkFqcGdeQXVyMTQxMDQ2MDEz._V1_.jpg',
 'primaryImage_caption_plainText': 'Istoriya grazhdanskoy voyny (1922)',
 'primaryImage_caption___typename': 'Markdown',
 'primaryImage___typename': 'Image',
 'titleType_text': 'Movie',
 'titleType_id': 'movie',
 'titleType_isSeries': False,
 'titleType_isEpisode': False,
 'titleType___typename': 'TitleType',
 'genres_genres_0_text': 'Documentary',
 'genres_genres_0_id': 'Documentary',
 'genres_genres_0___typename': 'Genre',
 'genres___typename': 'Genres',
 'titleText_text': 'Istoriya grazhdanskoy voyny',
 'titleText___typename': 'TitleText',
 'original