In [0]:
# 00_validate_raw_patients_CSV.ipynb
# SOURCE: (10 row) Patients CSV file in DBFS
# OUTPUT: N/A

from pyspark.sql import functions as F

# DBFS location of the raw Patients CSV that this notebook loads and validates.
RAW_PATH = "dbfs:/kardia/raw/patients/patients_10.csv"

In [0]:
# Read the raw Patients CSV into a DataFrame.
# The first line of the file is the header; schema inference is switched off.
df = spark.read.options(header="true", inferSchema="false").csv(RAW_PATH)

# Render the loaded rows for a visual check.
display(df)

# NOTE: RAW_PATH points at a single CSV file here. If it is changed to a folder,
#       Spark reads every CSV file in that folder into a single DataFrame.
#       With schema inference disabled, Spark treats all columns as strings.

In [0]:
# Validate the raw Patients data before Bronze ingestion.
# Checks: at least one row, no NULL IDs, GENDER is 'M' or 'F', and IDs are unique.
#
# count(CASE WHEN ... THEN 1 END) is used rather than sum(CASE ... ELSE 0 END):
# sum() yields NULL over zero rows, which would make the `== 0` checks fail with
# a misleading message on an empty file, whereas count() yields 0.
stats = df.agg(
    F.expr("count(*)").alias("row_count"),
    # count(distinct ...) ignores NULLs; NULL IDs are caught by id_null_count.
    F.expr("count(distinct ID)").alias("distinct_id_count"),
    F.expr("count(CASE WHEN ID IS NULL THEN 1 END)").alias("id_null_count"),
    # NULL is tested explicitly: `NOT (GENDER IN (...))` evaluates to NULL (not
    # TRUE) for a NULL GENDER, so missing values would otherwise go uncounted.
    F.expr(
        "count(CASE WHEN GENDER IS NULL OR GENDER NOT IN ('M', 'F') THEN 1 END)"
    ).alias("invalid_gender_count"),
).first()

try:
    assert stats.row_count            >  0,               "Patients CSV contains no rows"
    assert stats.id_null_count        == 0,               "Found NULLs in ID column"
    assert stats.invalid_gender_count == 0,               "Invalid values in GENDER column"
    assert stats.distinct_id_count    == stats.row_count, "Duplicate Patient IDs"
    print("Validation passed:", stats.asDict())
except AssertionError as e:
    print("Validation failed:", e)
    # Bare raise re-raises the active AssertionError so the notebook run stops here.
    raise
