In [0]:
# 00_validate_raw_encounters_CSV.ipynb
# Quick validation for Encounters CSV (10 rows)

from pyspark.sql import functions as F

# Location of the raw encounters CSV that this notebook validates.
# The path uses the dbfs: scheme and names one specific .csv file
# (presumably the 10-row sample referred to in the notebook header).
RAW_PATH = "dbfs:/kardia/raw/encounters/encounters_10.csv"

In [0]:
# Read the raw encounters CSV, taking column names from the header row.
# Schema inference is turned off, so every column is loaded as a string.
# RAW_PATH names a single .csv file; if it were pointed at a folder instead,
# Spark would read every CSV inside it into one DataFrame.
df = (
    spark.read
        .options(header="true", inferSchema="false")
        .csv(RAW_PATH)
)

display(df)

In [0]:
# Validate the raw frame before Bronze ingestion:
#   - row count matches the expected sample size
#   - ID has no NULLs and no duplicate values
#   - PATIENT has no NULLs
# Every failed check is reported together in a single AssertionError.
expected_rows = 10

# A single aggregation pass computes all statistics.
# count(CASE WHEN ... THEN 1 END) counts only the matching rows and yields 0 on
# an empty DataFrame, whereas sum(CASE ... ELSE 0 END) yields NULL there and
# would make an empty file look like an "ID has NULLs" failure.
stats = df.agg(
    F.expr("count(*)").alias("row_count"),
    F.expr("count(distinct ID)").alias("distinct_id_count"),
    F.expr("count(CASE WHEN ID IS NULL THEN 1 END)").alias("id_null_count"),
    F.expr("count(CASE WHEN PATIENT IS NULL THEN 1 END)").alias("patient_null_count")
).first()

# count(distinct ID) ignores NULLs, so uniqueness is judged against the number
# of non-null IDs; NULL IDs are reported by their own check instead of also
# showing up as false "duplicates".
non_null_id_rows = stats.row_count - stats.id_null_count

checks = [
    (stats.row_count == expected_rows, f"Expected {expected_rows} rows, got {stats.row_count}"),
    (stats.id_null_count == 0, "Found NULLs in ID column"),
    (stats.distinct_id_count == non_null_id_rows, "Duplicate Encounter IDs"),
    (stats.patient_null_count == 0, "Found NULLs in PATIENT column"),
]
failures = [message for passed, message in checks if not passed]

if failures:
    failure_summary = "; ".join(failures)
    print("Validation failed:", failure_summary)
    # Raised explicitly (not via `assert`) so the check still runs when Python
    # is started with -O, which strips assert statements.
    raise AssertionError(failure_summary)

print("Validation passed:", stats.asDict())