In [0]:
# Validate `patients_10.csv`  
# Cheap 10-row slice sanity-check before Bronze ingest.

from pyspark.sql import SparkSession, functions as F

In [0]:
# Location of the 10-row patients sample that this notebook validates
RAW_PATH = "dbfs:/kardia/raw/patients/patients_10.csv"

# Build (or reuse) the Spark session; a single shuffle partition is requested
# because the input is only a handful of rows.
session_builder = SparkSession.builder.appName("kardia_val_patients_10")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "1")
spark = session_builder.getOrCreate()

# Keep driver logging down to errors only
spark.sparkContext.setLogLevel("ERROR")

In [0]:
# Load the sample CSV with a header row and schema inference switched off,
# then cache it because the frame is both displayed and aggregated.
df = spark.read.csv(RAW_PATH, header=True, inferSchema=False).cache()

display(df)

In [0]:
# Aggregate key metrics in one pass and retrieve the results:
#   row_cnt      - total number of rows read
#   bad_gender_n - rows whose GENDER is missing or not exactly "M" / "F"
#   null_id_n    - rows whose ID is NULL
metrics = (
    df
    .agg(
        F.count(F.lit(1)).alias("row_cnt"),
        # `~col.isin(...)` evaluates to NULL for a NULL GENDER and sum() skips
        # NULLs, so a missing value (e.g. an empty CSV field, which Spark's CSV
        # reader loads as NULL by default) would slip through uncounted.
        # OR-ing with isNull() makes missing genders count as invalid.
        F.sum(
            (F.col("GENDER").isNull() | ~F.col("GENDER").isin("M", "F")).cast("int")
        ).alias("bad_gender_n"),
        F.sum(F.col("ID").isNull().cast("int")).alias("null_id_n")
    )
    .first()
)

# Extract metrics into Python variables
row_cnt      = metrics.row_cnt
bad_gender_n = metrics.bad_gender_n
null_id_n    = metrics.null_id_n

# Enforce data quality rules; any failure raises AssertionError and halts the
# notebook before the Bronze ingest step. The row-count check runs first, so
# an empty input (where the sum() metrics come back as None) is reported as a
# row-count problem.
assert row_cnt      == 10, "Expected 10 rows, found %d" % row_cnt
assert null_id_n    ==  0, "ID column has NULLs"
assert bad_gender_n ==  0, "GENDER contains values outside {M,F}"

print("All validation checks passed.")


In [0]:
# Proceed to bronze_patients_ingest `01_bronze/bronze_patients`