In [0]:
# 00_validate_raw_encounters_CSV.ipynb
# Quick validation for Encounters CSV (10 rows)

from pyspark.sql import SparkSession, functions as F

# Input file checked by this notebook.
RAW_PATH = "dbfs:/kardia/raw/encounters/encounters_10.csv"

# Bind the session explicitly instead of relying on an injected `spark`
# global. getOrCreate() returns the already-active session when one exists,
# so this is a no-op rebind there and makes a fresh-kernel run self-contained
# elsewhere.
spark = SparkSession.builder.getOrCreate()

# Minimize shuffle overhead for small test datasets.
# NOTE: this is set on the session's runtime config and nothing in this
# notebook restores the previous value afterwards.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# Read the raw file, taking column names from the header row. Schema
# inference is switched off, so Spark treats all columns as strings.
raw_reader = spark.read.options(header=True, inferSchema=False)

# Cache the frame: it is displayed here and aggregated again further down.
df = raw_reader.csv(RAW_PATH).cache()

display(df)

In [0]:
# Fail fast with a readable message when a key column is absent, rather than
# letting the aggregation below raise an analysis error.
# The comparison ignores case, so it is no stricter than Spark's default
# (case-insensitive) column resolution.
present_cols = {name.upper() for name in df.columns}
missing_cols = sorted({"ID", "PATIENT"} - present_cols)
assert not missing_cols, f"Missing expected columns: {missing_cols}"

# Calculate basic metrics in a single pass over the data.
# NULL counts use count(when(...)): when() without otherwise() yields NULL for
# non-matching rows and count() skips NULLs, so the result is always an
# integer (0 on an empty frame), whereas sum() over no rows returns NULL.
summary = df.agg(
    F.count("*").alias("total"),
    F.countDistinct("ID").alias("distinct_ids"),
    F.count(F.when(F.col("ID").isNull(), 1)).alias("null_ids"),
    F.count(F.when(F.col("PATIENT").isNull(), 1)).alias("null_patients"),
).first()

# Extract to plain Python values for the checks that follow.
total = summary.total
distinct_ids = summary.distinct_ids
null_ids = summary.null_ids
null_patients = summary.null_patients

In [0]:
# Validation checks: each entry pairs a condition with the message raised when
# it does not hold. They run in order and the first failure stops the cell.
checks = [
    (total == 10, f"Expected 10 rows, got {total}"),
    (null_ids == 0, "Found NULLs in ID column"),
    (distinct_ids == total, "Duplicate Encounter IDs"),
    (null_patients == 0, "Found NULLs in PATIENT column"),
]
for passed, message in checks:
    assert passed, message

# Only reached when every check above held.
print("Encounter CSV validation passed.")