In [0]:
# 00_validate_raw_patients_CSV.ipynb
# Quick validation for Patients CSV (10 rows)

from pyspark.sql import functions as F

# DBFS location of the raw Patients CSV that this notebook validates.
RAW_PATH = "dbfs:/kardia/raw/patients/patients_10.csv"

# Minimize shuffle overhead for small test datasets: a single shuffle
# partition is enough for the aggregation run later in this notebook.
# NOTE(review): this is set on the active Spark session, so it presumably
# also applies to anything else run in the same session afterwards — confirm
# that is acceptable on shared clusters.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# Read the raw Patients CSV with the header row as column names and schema
# inference switched off, so every column arrives as a string.
df = spark.read.load(
    RAW_PATH,
    format="csv",
    header="true",
    inferSchema="false",
)

# Render the loaded rows for a quick visual check.
display(df)

In [0]:
# Validate row count, ID nulls, and gender values before Bronze ingestion.
#
# All three metrics are computed in one aggregation pass over `df`:
#   row_count            - total number of rows read from the CSV
#   id_null_count        - rows whose ID is NULL
#   invalid_gender_count - rows whose GENDER is anything other than 'M' or 'F'
#                          (a NULL GENDER counts as invalid)
# The cell raises AssertionError (after printing the reason) if any check fails.
expected_rows = 10

stats = df.agg(
    F.count("*").alias("row_count"),
    # Spark's sum() needs numeric input, so each condition is turned into an
    # explicit 0/1 flag instead of summing a boolean expression directly.
    # coalesce(..., 0) covers an empty input, where sum() yields NULL.
    F.coalesce(
        F.sum(F.when(F.col("ID").isNull(), 1).otherwise(0)),
        F.lit(0),
    ).alias("id_null_count"),
    # For a NULL GENDER, isin() evaluates to NULL; when() treats that as
    # "not matched", so the row falls through to otherwise(1) and is counted
    # as invalid rather than silently skipped.
    F.coalesce(
        F.sum(F.when(F.col("GENDER").isin("M", "F"), 0).otherwise(1)),
        F.lit(0),
    ).alias("invalid_gender_count"),
).first()

try:
    # Attribute names must match the aliases defined in the aggregation above.
    assert stats.id_null_count == 0, "Found NULLs in ID column"
    assert stats.row_count == expected_rows, f"Expected {expected_rows} rows, got {stats.row_count}"
    assert stats.invalid_gender_count == 0, "Invalid values in GENDER column"
    print("Validation passed:", stats.asDict())
except AssertionError as e:
    # Surface the failing check in the cell output, then re-raise so the
    # notebook run (or job) stops instead of continuing to ingestion.
    print("Validation failed:", e)
    raise