In [0]:
# 00_validate_raw_patients_CSV.ipynb
# Quick validation for Patients CSV (10 rows)

from pyspark.sql import SparkSession, functions as F

# The test dataset is tiny, so a single shuffle partition keeps shuffle
# overhead to a minimum.
spark.conf.set("spark.sql.shuffle.partitions", "1")

# DBFS location of the 10-row patients sample validated by this notebook.
RAW_PATH = "dbfs:/kardia/raw/patients/patients_10.csv"

In [0]:
# Read the raw CSV, taking column names from the header row. Schema inference
# is switched off, so every column is loaded as a string. The result is cached
# because it is displayed here and aggregated again by the validation step.
df = spark.read.csv(RAW_PATH, header=True, inferSchema=False).cache()

display(df)

In [0]:
# Calculate basic metrics in a single aggregation pass.
#
# GENDER check must be NULL-safe: `~col.isin("M", "F")` evaluates to NULL
# (not TRUE) when GENDER is NULL, and sum() silently skips NULLs, so a missing
# gender would otherwise never be counted as invalid.
gender_col = F.col("GENDER")
gender_is_invalid = gender_col.isNull() | ~gender_col.isin("M", "F")

summary = df.agg(
    F.count("*").alias("total"),
    F.sum(F.col("ID").isNull().cast("int")).alias("null_ids"),
    F.sum(gender_is_invalid.cast("int")).alias("bad_gender"),
).first()

total = summary.total
# sum() over zero rows returns NULL (None in Python); normalise to 0 so the
# counters are always integers.
null_ids = summary.null_ids or 0
bad_gender = summary.bad_gender or 0

# Enforce data quality rules (each failure raises AssertionError):
#   - the file must contain exactly 10 rows
#   - ID must never be NULL
#   - GENDER must be exactly "M" or "F" (NULL counts as invalid)
assert total == 10, f"Expected 10 rows, got {total}"
assert null_ids == 0, "ID column has NULLs"
assert bad_gender == 0, "GENDER column has invalid values"

print("Patient CSV validation passed.")
