In [0]:
from pyspark.sql import functions as F

# Location of the candidate output for one pinned prod run
# (env and run_id are hard-coded in the path, not derived).
candidates_path = (
    "abfss://intermediate@ungcapstor01.dfs.core.windows.net"
    "/candidates"
    "/env=prod"
    "/run_id=20251106T070555/"
)

# Read the parquet dataset stored at that location.
candidates_df = spark.read.format("parquet").load(candidates_path)

# Print the schema as stored, i.e. before the column renaming below.
candidates_df.printSchema()

# Lower-case and trim every column name so later by-name references do not
# depend on the casing or stray whitespace found in the files.
cols = [name.lower().strip() for name in candidates_df.columns]
candidates_df = candidates_df.toDF(*cols)

# Preview the candidate pairs with the absolute gap between the tmdb and
# discogs years: smallest gap first, ties broken by the normalized tmdb title.
candidates_df.select(
    "canonical_id",
    F.col("tmdb_title").alias("tmdb_title_raw"),
    F.col("tmdb_title_norm").alias("tmdb_title_clean"),
    "tmdb_year",
    "discogs_title",
    "discogs_title_norm",
    "discogs_year",
    # NULL whenever either year is NULL.
    F.abs(F.col("tmdb_year") - F.col("discogs_year")).alias("year_diff"),
    "discogs_country",
    "discogs_genre_str",
    "discogs_style_str",
    # Joined into one comma-separated string for display.
    # NOTE(review): presumably discogs_format is array<string> — confirm.
    F.concat_ws(", ", F.col("discogs_format")).alias("discogs_format_list"),
    "load_dt",
).orderBy(
    # A plain asc() puts NULLs first in Spark, which would fill the preview
    # with pairs that have no year gap at all; push those rows to the end.
    F.asc_nulls_last("year_diff"),
    F.asc("tmdb_title_clean"),
).show(50, truncate=False)

# Show rows whose normalized tmdb title is NULL or the empty string,
# next to the raw title they were derived from.
candidates_df.where(
    (F.col("tmdb_title_norm") == F.lit(""))
    | F.col("tmdb_title_norm").isNull()
).select(
    "tmdb_title",
    "tmdb_title_norm",
).show(n=50, truncate=False)


