In [0]:
# 00_validate_raw_encounters_CSV.ipynb
# Quick validation for Encounters CSV (10 rows)

from pyspark.sql import functions as F

# DBFS location of the raw Encounters CSV that the load cell reads
# (the 10-row sample named in the notebook header).
RAW_PATH = "dbfs:/kardia/raw/encounters/encounters_10.csv"

In [0]:
# Read the raw CSV using its header row for column names. Schema inference
# is switched off, so Spark keeps every column as a string.
df = spark.read.format("csv").options(header="true", inferSchema="false").load(RAW_PATH)

# Render the loaded rows for a quick visual inspection.
display(df)

In [0]:
# Validate row count, ID nulls, uniqueness, and PATIENT nulls before Bronze ingestion.
# All four metrics are computed in one aggregation and returned as a single Row;
# the assertions below read that Row using the alias names defined here.
expected_rows = 10

stats = df.agg(
    F.count("*").alias("row_count"),
    # countDistinct ignores NULLs, so NULL IDs are counted separately below.
    F.countDistinct("ID").alias("distinct_ids"),
    # count(when(cond, 1)) counts only the rows where cond holds: `when` without
    # `otherwise` yields NULL for non-matching rows and count() skips NULLs.
    # This avoids summing a boolean expression and gives 0 (not NULL) when the
    # frame is empty.
    F.count(F.when(F.col("ID").isNull(), 1)).alias("id_null_count"),
    F.count(F.when(F.col("PATIENT").isNull(), 1)).alias("patient_null_count"),
).first()

try:
    assert stats.row_count == expected_rows, f"Expected {expected_rows} rows, got {stats.row_count}"
    assert stats.id_null_count == 0, "Found NULLs in ID column"
    assert stats.distinct_ids == stats.row_count, "Duplicate Encounter IDs"
    assert stats.patient_null_count == 0, "Found NULLs in PATIENT column"
    print("Validation passed:", stats.asDict())
except AssertionError as e:
    # Report the failed check, then re-raise so the notebook run stops here.
    print("Validation failed:", e)
    raise
