In [1]:
import pandas as pd
from pathlib import Path

# Define data path
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where that directory exists; consider a configurable, project-relative
# data directory instead.
TMDB_DIR = Path(r"D:\Capstone_Staging\data\tmdb")
tmdb_file = TMDB_DIR / "enriched_top_1000.csv"

# --- 1️⃣ Load and validate ---
# Reads the whole CSV into memory and reports its shape.
df = pd.read_csv(tmdb_file)
print(f"✅ TMDb dataset loaded: {len(df):,} rows, {len(df.columns)} columns")

# --- 2️⃣ Schema check ---
# List every column name so schema drift is visible at a glance.
print("\n📋 Columns:")
for col in df.columns:
    print("  •", col)

# --- 3️⃣ Preview top records ---
print("\n🪄 Sample:")
display(df.head(10))

# --- 4️⃣ Basic data health check ---
# Guard each column lookup the same way the genre section below does, so a
# schema change is reported explicitly instead of aborting with a KeyError.
missing_by_column = {}
print()  # blank separator line before the health-check report
for required_col in ('release_year', 'title'):
    if required_col in df.columns:
        missing_by_column[required_col] = df[required_col].isna().sum()
        print(f"🔍 Missing {required_col}: {missing_by_column[required_col]:,} rows")
    else:
        print(f"⚠️ Column '{required_col}' not found; skipping missing-value check")

# Keep the original variable names available; each is None when its column
# is absent from the frame.
missing_years = missing_by_column.get('release_year')
missing_titles = missing_by_column.get('title')

# --- 5️⃣ Quick genre diversity snapshot ---
# Each 'genres' cell is split on '|', exploded to one genre per row, and the
# 15 most frequent genres are shown. Rows with a missing value are dropped first.
if 'genres' in df.columns:
    genre_counts = (
        df['genres']
        .dropna()
        .str.split('|')
        .explode()
        .value_counts()
        .head(15)
    )
    print("\n🎬 Top 15 Genres:")
    display(genre_counts)


✅ TMDb dataset loaded: 9,999 rows, 4 columns

📋 Columns:
  • tmdb_id
  • title
  • release_year
  • genres

🪄 Sample:


Unnamed: 0,tmdb_id,title,release_year,genres
0,941109,Play Dirty,2025.0,Crime
1,1311031,Demon Slayer: Kimetsu no Yaiba Infinity Castle,2025.0,Animation|Action|Fantasy|Thriller
2,1257009,Primitive War,2025.0,Action|Horror|War
3,755898,War of the Worlds,2025.0,Science Fiction|Thriller
4,617126,The Fantastic 4: First Steps,2025.0,Science Fiction|Adventure
5,1186350,Marco,2024.0,Action|Crime|Thriller
6,1267319,Mantis,2025.0,Action|Crime|Thriller
7,793387,Holy Night: Demon Hunters,2025.0,Action|Fantasy|Horror|Thriller
8,1328803,Prisoner of War,2025.0,Action|War|Thriller|History
9,1357886,Django Undisputed,2024.0,Western



🔍 Missing release_year: 40 rows
🔍 Missing title: 0 rows

🎬 Top 15 Genres:


genres
Drama              3915
Comedy             2616
Action             2151
Thriller           1995
Romance            1464
Adventure          1443
Horror             1297
Crime              1206
Fantasy             960
Science Fiction     949
Family              942
Animation           913
Mystery             699
Documentary         645
TV Movie            571
Name: count, dtype: int64