In [1]:
import pandas as pd
import requests
import os
import json
from time import sleep

# Load links.csv; its tmdbId column drives the fetch loop below.
links_path = 'data/ml-latest-small/links.csv'
links = pd.read_csv(links_path)

# TMDb API setup
# The API key is a credential, so it is read from the environment rather than
# being stored in the notebook. Set it before starting the kernel, e.g.
#   export TMDB_API_KEY=<your key>
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
if not TMDB_API_KEY:
    # Fail here with a clear message instead of later with a 401 per request.
    raise RuntimeError(
        "TMDB_API_KEY is not set. Export your TMDb API key as the "
        "TMDB_API_KEY environment variable before running this cell."
    )
BASE_URL = "https://api.themoviedb.org/3/movie/"

# Directory to save fetched data (one JSON file per movie)
output_dir = "data/enriched"
os.makedirs(output_dir, exist_ok=True)

# Function to fetch movie data from TMDb API
def fetch_movie_data(tmdb_id, timeout=10):
    """Fetch the details of one movie from the TMDb API.

    Args:
        tmdb_id: TMDb movie identifier; appended to BASE_URL to build the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection, which would hang the whole fetch loop.

    Returns:
        The decoded JSON response, or None if the request failed (network
        error, timeout, non-2xx status) or the body could not be decoded.
    """
    url = f"{BASE_URL}{tmdb_id}"
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    # ValueError covers older requests versions, where response.json() raises
    # a plain ValueError that is not a RequestException.
    except (requests.exceptions.RequestException, ValueError) as e:
        # requests error messages embed the request URL, whose query string
        # carries the API key; redact it so it never reaches notebook output.
        message = str(e)
        if TMDB_API_KEY:
            message = message.replace(TMDB_API_KEY, "<redacted>")
        print(f"Error fetching data for TMDb ID {tmdb_id}: {message}")
        return None

# Fetch and save data
# Only the tmdbId column is needed; rows without a TMDb id are skipped.
for raw_tmdb_id in links['tmdbId'].dropna():
    # Convert once so the file name, the log messages and the request all use
    # the same integer id (the column holds floats such as 862.0).
    tmdb_id = int(raw_tmdb_id)

    output_file = os.path.join(output_dir, f"{tmdb_id}.json")
    if os.path.exists(output_file):
        print(f"Data for TMDb ID {tmdb_id} already exists. Skipping.")
        continue

    print(f"Fetching data for TMDb ID {tmdb_id}...")
    movie_data = fetch_movie_data(tmdb_id)
    if movie_data:
        # Write to a temporary file and rename it into place. If the run is
        # interrupted mid-write, no truncated <id>.json is left behind that
        # the existence check above would treat as already fetched.
        tmp_file = output_file + ".tmp"
        with open(tmp_file, 'w') as f:
            json.dump(movie_data, f)
        os.replace(tmp_file, output_file)
    sleep(0.25)  # Avoid hitting rate limits


Fetching data for TMDb ID 862.0...
Fetching data for TMDb ID 8844.0...
Fetching data for TMDb ID 15602.0...
Fetching data for TMDb ID 31357.0...
Fetching data for TMDb ID 11862.0...
Fetching data for TMDb ID 949.0...
Fetching data for TMDb ID 11860.0...
Fetching data for TMDb ID 45325.0...
Fetching data for TMDb ID 9091.0...
Fetching data for TMDb ID 710.0...
Fetching data for TMDb ID 9087.0...
Fetching data for TMDb ID 12110.0...
Fetching data for TMDb ID 21032.0...
Fetching data for TMDb ID 10858.0...
Fetching data for TMDb ID 1408.0...
Fetching data for TMDb ID 524.0...
Fetching data for TMDb ID 4584.0...
Fetching data for TMDb ID 5.0...
Fetching data for TMDb ID 9273.0...
Fetching data for TMDb ID 11517.0...
Fetching data for TMDb ID 8012.0...
Fetching data for TMDb ID 1710.0...
Fetching data for TMDb ID 9691.0...
Fetching data for TMDb ID 12665.0...
Fetching data for TMDb ID 451.0...
Fetching data for TMDb ID 16420.0...
Fetching data for TMDb ID 9263.0...
Fetching data for TMDb ID

Parse the saved data and enrich the content features:

In [2]:
import os
import json
import pandas as pd

# Step 1: Read links.csv
# (not used further in this cell; the name is kept defined as before)
links_df = pd.read_csv('data/ml-latest-small/links.csv')

# Directory containing the enriched JSON files
enriched_dir = 'data/enriched'

# Column order of movies_enriched.csv. Passing it to the DataFrame constructor
# keeps the schema stable even when no JSON file could be read.
ENRICHED_COLUMNS = [
    "tmdbId",
    "title",
    "budget",
    "imdb_id",
    "origin_country",
    "original_language",
    "original_title",
    "overview",
    "popularity",
    "release_date",
    "runtime",
    "tagline",
    "vote_average",
    "vote_count",
]


def _extract_movie_fields(tmdb_id, movie_data):
    """Flatten one saved TMDb movie payload into a row for movies_enriched.csv.

    Args:
        tmdb_id: Identifier taken from the JSON file name.
        movie_data: Dict loaded from the saved JSON file.

    Returns:
        A dict with one entry per name in ENRICHED_COLUMNS. Missing text
        fields default to "" and missing numeric fields to None.
    """
    # "production_countries" is a list of dicts; `or []` also covers the key
    # being present with a null value.
    production_countries = movie_data.get("production_countries") or []
    origin_country = ", ".join(c.get("name", "") for c in production_countries)

    return {
        "tmdbId": tmdb_id,
        "title": movie_data.get("title", ""),
        "budget": movie_data.get("budget", None),
        "imdb_id": movie_data.get("imdb_id", ""),
        "origin_country": origin_country,
        "original_language": movie_data.get("original_language", ""),
        "original_title": movie_data.get("original_title", ""),
        "overview": movie_data.get("overview", ""),
        "popularity": movie_data.get("popularity", None),
        "release_date": movie_data.get("release_date", ""),
        "runtime": movie_data.get("runtime", None),
        "tagline": movie_data.get("tagline", ""),
        "vote_average": movie_data.get("vote_average", None),
        "vote_count": movie_data.get("vote_count", None),
    }


# Prepare a list to hold enriched movie data
enriched_data = []

# Step 2: Read all JSON files and extract the required fields.
# sorted() makes the row order reproducible; os.listdir returns entries in
# arbitrary order.
for file_name in sorted(os.listdir(enriched_dir)):
    if not file_name.endswith('.json'):
        continue

    # The file name (without extension) is the TMDb id.
    tmdb_id = file_name.split('.')[0]

    try:
        with open(os.path.join(enriched_dir, file_name), 'r') as f:
            movie_data = json.load(f)
    except json.JSONDecodeError as e:
        # One unreadable file (e.g. truncated by an interrupted download)
        # should not abort the whole export; report it and move on.
        print(f"Skipping {file_name}: invalid JSON ({e})")
        continue

    enriched_data.append(_extract_movie_fields(tmdb_id, movie_data))

# Create a DataFrame from the enriched data
enriched_df = pd.DataFrame(enriched_data, columns=ENRICHED_COLUMNS)

# Convert tmdbId to numeric; ids that cannot be parsed become NaN
enriched_df['tmdbId'] = pd.to_numeric(enriched_df['tmdbId'], errors='coerce')

# Step 3: Save the enriched data to movies_enriched.csv
enriched_df.to_csv("data/movies_enriched.csv", index=False)

Now join movies_enriched.csv with movies.csv, using links.csv to map each movieId to its tmdbId.

In [3]:
import os
import json
import pandas as pd

# Inputs for the join: the id mapping (links.csv), the TMDb fields exported
# to movies_enriched.csv, and the movie list (movies.csv).
links_df = pd.read_csv('data/ml-latest-small/links.csv')

# Location of the per-movie JSON files (not read in this cell).
enriched_dir = 'data/enriched'

movies_enriched_df = pd.read_csv("data/movies_enriched.csv")
movies_df = pd.read_csv("data/ml-latest-small/movies.csv")

# Coerce both join keys in links_df to numeric; values that cannot be parsed
# become NaN.
for key_column in ('movieId', 'tmdbId'):
    links_df[key_column] = pd.to_numeric(links_df[key_column], errors='coerce')

# First hop: attach the TMDb fields to every links row via tmdbId.
# Left join, so links rows without enriched data are kept.
links_enriched_merged = links_df.merge(movies_enriched_df, how='left', on='tmdbId')

# Second hop: attach the result to the movie list via movieId.
# NOTE(review): movies_enriched.csv is written with a `title` column; if
# movies.csv also has one, pandas will suffix the pair as title_x / title_y.
# Confirm this is the intended output schema.
final_merged_df = movies_df.merge(links_enriched_merged, how='left', on='movieId')

# Persist the combined table next to the source data.
merged_output_path = "data/ml-latest-small/movies_with_enriched_data.csv"
final_merged_df.to_csv(merged_output_path, index=False)

print("Merging complete! The final file is: movies_with_enriched_data.csv")


Merging complete! The final file is: movies_with_enriched_data.csv
