In [0]:
# ================================================================
#  Validation Notebook Cell — Verify TMDB + Discogs Extract Outputs
# ================================================================

from pyspark.sql import SparkSession

# Step 1: obtain the Spark session, then wire in ADLS credentials from the
# "markscope" secret scope.
from pyspark.dbutils import DBUtils

# Acquire the session first so that nothing below depends on a `spark` name
# having been defined before this cell runs.
spark = SparkSession.builder.appName("Validate_Extract_Outputs").getOrCreate()
dbutils = DBUtils(spark)

# Secrets are stripped so stray surrounding whitespace cannot end up in the
# URI or in the configuration key/value.
STORAGE_ACCOUNT = dbutils.secrets.get("markscope", "azure-storage-account-name").strip()
STORAGE_KEY = dbutils.secrets.get("markscope", "azure-storage-account-key").strip()
BASE_URI = f"abfss://raw@{STORAGE_ACCOUNT}.dfs.core.windows.net"

# Register the storage account key on the live session. Setting it through
# the runtime conf works whether the session was just created or already
# existed, instead of relying on builder options being re-applied.
spark.conf.set(
    f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net",
    STORAGE_KEY,
)

# Step 2: locations of the two extract outputs.
tmdb_path = f"{BASE_URI}/raw/tmdb/"
discogs_path = f"{BASE_URI}/raw/discogs/"

# Step 3: read each extract's Parquet output into a DataFrame.
tmdb_df = spark.read.parquet(tmdb_path)
discogs_df = spark.read.parquet(discogs_path)

# Step 4: for each extract print a banner, the schema and the row count,
# then render a 10-row preview.
for banner, count_label, frame in (
    ("‚úÖ TMDB Extract", "TMDB", tmdb_df),
    ("‚úÖ Discogs Extract", "Discogs", discogs_df),
):
    print(banner)
    frame.printSchema()
    row_total = frame.count()
    print(f"{count_label} record count: {row_total}")
    preview = frame.limit(10)
    display(preview)

# Step 5: sanity check - do the two extracts share any year values?
# The distinct values of each year column are collected to the driver.
tmdb_years = [r["tmdb_year"] for r in tmdb_df.select("tmdb_year").distinct().collect()]
discogs_years = [r["discogs_year"] for r in discogs_df.select("discogs_year").distinct().collect()]

# If both columns contain a null, None ends up in the intersection; sorting
# None alongside real values raises TypeError in Python 3, and a null is not
# a meaningful shared key anyway, so it is discarded here.
overlap = (set(tmdb_years) & set(discogs_years)) - {None}

# Only the first 10 values (in sorted order) are shown.
print(f"üéØ Overlapping year values: {sorted(overlap)[:10]}")

print("üßæ Validation complete ‚Äî both extract outputs verified.")


# ================================================================
#  Validation Extension — Verify TMDB→Discogs Candidate Output
# ================================================================

print("\n‚úÖ Candidate Generation Output ‚Äî Step 03 (Pandas-based)")

# Step 6: candidate path (the header above attributes it to
# Step03PrepareTMDBDiscogsCandidates).
candidates_path = f"{BASE_URI}/raw/intermediate/tmdb_discogs_candidates/"

# Step 7: read the resulting Parquet from ADLS.
# Reset the name up front so a failed load can never fall through to a stale
# DataFrame left over from an earlier run of this cell, and track success
# explicitly so the later steps know whether there is anything to check.
candidates_df = None
candidates_ok = False
try:
    candidates_df = spark.read.parquet(candidates_path)
    print("‚úÖ Candidate Parquet loaded successfully.\n")
    candidates_df.printSchema()
    print(f"Candidate record count: {candidates_df.count()}")
    display(candidates_df.limit(10))
    candidates_ok = True
except Exception as e:
    # Best-effort: report the failure and carry on without raising.
    print(f"‚ùå Failed to load candidates Parquet: {e}")

# Step 8: quick sanity checks, run only when the load and summary succeeded.
if candidates_ok:
    if "tmdb_title_norm" in candidates_df.columns and "discogs_title_norm" in candidates_df.columns:
        # Counts the distinct values that appear in both normalized-title
        # columns (intersect compares by position and de-duplicates).
        overlap_titles = (
            candidates_df.select("tmdb_title_norm")
            .intersect(candidates_df.select("discogs_title_norm"))
            .count()
        )
        print(f"üîç Normalized title overlaps (identical values): {overlap_titles}")
    else:
        print("‚ö†Ô∏è Normalized title columns not present; skipping overlap check.")

    print("üß≠ Validation complete ‚Äî TMDB + Discogs + Candidate outputs verified.")
else:
    # Do not claim success when the candidate output could not be inspected.
    print("Validation incomplete: candidate output could not be loaded; sanity checks skipped.")


