# Initial testing of the TMDB API to fetch movie details based on Id from the MovieLens dataset

Loading the data from movielens: merging the movies and links datasets to get the tmdbId for each movie.

In [1]:
import os
import pandas as pd
import numpy as np

data_dir = "Data/ml-latest-small"
movies_path = os.path.join(data_dir, "movies.csv")
ratings_path = os.path.join(data_dir, "ratings.csv")
tags_path = os.path.join(data_dir, "tags.csv")
links_path = os.path.join(data_dir, "links.csv")

In [2]:
# Load the datasets into pandas DataFrames:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path, usecols=['userId', 'movieId', 'rating']) # Discard timestamp column
tags = pd.read_csv(tags_path)
links = pd.read_csv(links_path)

print("Movies DataFrame:")
movies.head()

Movies DataFrame:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Creating the API fetching function, and using this for Fight Club (tmdbId 550) as a test.

In [33]:
api_key = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJlZDY3ZTIwOWZlOTAxMjczNTdmMmYyMThiZTBiMjU4MyIsIm5iZiI6MTc2MjAwNjA0OS45MTgwMDAyLCJzdWIiOiI2OTA2MTQyMWQ0NjY3NDNhMWNlMTI1ZTYiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0._PD37ccGbyN3_HrrlOZiizVlcqaVm_LYIxeyfWYqmIc"

# Use the links DataFrame to merge with an external dataset tmbid to pull additional movie information (e.g., genres, release dates, etc.).
merged_movies = movies.merge(links, on='movieId', how='left')
# print("Merged Movies DataFrame with TMDB IDs:")
# print(merged_movies.head())

# This is a placeholder for actual API calls to fetch additional data.
# You would typically use the 'requests' library to make HTTP requests to the TMDB API.
import requests
def fetch_movie_details(tmbid, api_key):
    url = f"https://api.themoviedb.org/3/movie/{tmbid}"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        return None
    
# Example usage:
movie_details = fetch_movie_details(552, api_key)  # Fetch details for the movie with TMDB ID 550
# Nicely print the fetched movie details
import pprint
pprint.pprint(movie_details)

{'adult': False,
 'backdrop_path': '/k4JIHyAXaGHwAwT7y5Skd17f0Wl.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}],
 'homepage': '',
 'id': 552,
 'imdb_id': 'tt0237539',
 'origin_country': ['IT'],
 'original_language': 'it',
 'original_title': 'Pane e tulipani',
 'overview': 'An endearing light comedy about a woman who spontaneously '
             'becomes a resident of Venice after her family left her behind. '
             'While enjoying the wonderful people she meets she achieves a new '
             'life and the first time independent of her family.',
 'popularity': 2.4392,
 'poster_path': '/ijCBLF08KQsam3cOBAnit05iiV3.jpg',
 'production_companies': [{'id': 317,
                           'logo_path': '/HJj0J9Tk6AGcLpL4O7mI3bsccM.png',
                           'name': 'Istituto Luce Cinecittà',
                           'origin_country': 'IT'},
                          {'id': 169018,
               

Deciding on the relevant fields to extract from the API response.

In [22]:
# Parse JSON response and integrate relevant fields into the movies DataFrame as needed.
if movie_details:
    # Example: Extract genres and release date
    release_date = movie_details.get('release_date', 'N/A')
    overview = movie_details.get('overview', 'N/A')
    original_language = movie_details.get('original_language', 'N/A')
    origin_country = movie_details.get('origin_country', [])
    runtime = movie_details.get('runtime', 'N/A')
    print(f"Original Language: {original_language}")
    print(f"Origin Country: {origin_country}")
    print(f"Runtime: {runtime} minutes")
    print(f"Overview: {overview}")
    print(f"Release Date: {release_date}")

Original Language: it
Origin Country: ['IT']
Runtime: 114 minutes
Overview: An endearing light comedy about a woman who spontaneously becomes a resident of Venice after her family left her behind. While enjoying the wonderful people she meets she achieves a new life and the first time independent of her family.
Release Date: 2000-03-03


Creating a structured approach to fetch and store movie details from TMDB by looping through the list of tmdbIds.

In [None]:
import pandas as pd
import os
import requests
import time
from tqdm import tqdm # tqdm for progress bar

# 1. Setup API and load data

api_key = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJlZDY3ZTIwOWZlOTAxMjczNTdmMmYyMThiZTBiMjU4MyIsIm5iZiI6MTc2MjAwNjA0OS45MTgwMDAyLCJzdWIiOiI2OTA2MTQyMWQ0NjY3NDNhMWNlMTI1ZTYiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0._PD37ccGbyN3_HrrlOZiizVlcqaVm_LYIxeyfWYqmIc"
folder_path = r"Data/ml-latest-small/"

# Load your data
movies = pd.read_csv(folder_path + 'movies.csv')
links = pd.read_csv(folder_path + 'links.csv')

# Merge to get tmdbId
merged_movies = movies.merge(links, on='movieId', how='left')

# Define the output filename
OUTPUT_CSV = 'tmdb_movie_details.csv'

# 2. API Fetching Function

def fetch_movie_details(tmbid, api_key):
    """
    Fetches and parses movie details from TMDB.
    Returns a dictionary of the fields, or None on failure.
    """
    url = f"https://api.themoviedb.org/3/movie/{tmbid}"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }
    
    try:
        # Add a timeout to prevent hanging forever on a bad request
        response = requests.get(url, headers=headers, timeout=5)

        if response.status_code == 200:
            movie_details = response.json()
            
            # Extract genres as a comma-separated string
            genres_list = [g['name'] for g in movie_details.get('genres', [])]
            genres_str = ','.join(genres_list)
            
            # Return a flat dictionary of the relevant fields
            return {
                'release_date': movie_details.get('release_date', None),
                'overview': movie_details.get('overview', None),
                'original_language': movie_details.get('original_language', None),
                'runtime': movie_details.get('runtime', None),
                'popularity': movie_details.get('popularity', None),
                'genres_tmdb': genres_str
            }

        else:
            return None
            
    except requests.RequestException as e:
        return None

# 3. Looping and Saving

# This list will store all our results as dictionaries
all_movie_data = []

# Use tqdm to wrap your loop for a beautiful progress bar
print(f"Fetching {len(merged_movies)} movie details from TMDB...")

for index, row in tqdm(merged_movies.iterrows(), total=len(merged_movies)):
    
    tmbid = row['tmdbId']
    
    # Check if tmdbId is valid
    if pd.notna(tmbid):
        # Convert to integer
        tmbid = int(tmbid)
        
        # Call our API function
        details = fetch_movie_details(tmbid, api_key)
        
        # If the call was successful and returned data
        if details:
            # Add the 'movieId' from the original DataFrame as the key
            details['movieId'] = row['movieId']
            
            # Add the full dictionary to our list
            all_movie_data.append(details)
            
        # Add a small delay to avoid rate limits
        time.sleep(0.03)

print("API fetching complete.")

# 4. Convert list of dicts to DataFrame

print("Converting list to DataFrame...")
details_df = pd.DataFrame(all_movie_data)

# Reorder columns to put the 'movieId' first
if not details_df.empty:
    cols = ['movieId'] + [col for col in details_df.columns if col != 'movieId']
    details_df = details_df[cols]

# Save the final CSV
details_df.to_csv(OUTPUT_CSV, index=False)

print(f"Successfully fetched and saved {len(details_df)} movie details to {OUTPUT_CSV}")

# Display the first few rows of your new file
print(details_df.head())

Fetching 9742 movie details from TMDB...


100%|██████████| 9742/9742 [1:31:24<00:00,  1.78it/s]  

API fetching complete.
Converting list to DataFrame...
Successfully fetched and saved 9621 movie details to tmdb_movie_details.csv
   movieId release_date                                           overview  \
0        1   1995-11-22  Led by Woody, Andy's toys live happily in his ...   
1        2   1995-12-15  When siblings Judy and Peter discover an encha...   
2        3   1995-12-22  A family wedding reignites the ancient feud be...   
3        4   1995-12-22  Cheated on, mistreated and stepped on, the wom...   
4        5   1995-12-08  Just when George Banks has recovered from his ...   

  original_language  runtime  popularity                        genres_tmdb  
0                en       81     17.7869  Family,Comedy,Animation,Adventure  
1                en      104      2.6307           Adventure,Fantasy,Family  
2                en      101      3.5340                     Romance,Comedy  
3                en      127      3.0182               Comedy,Drama,Romance  
4         


