In [0]:
# 00_validate_raw_encounters_CSV.ipynb
# SOURCE: Encounters CSV file in DBFS (described here as "10 row", but RAW_PATH names encounters_1000.csv; TODO confirm row count)
# OUTPUT: N/A

from pyspark.sql import functions as F

# DBFS location of the raw Encounters CSV that this notebook validates.
RAW_PATH: str = "dbfs:/kardia/raw/encounters/encounters_1000.csv"

In [0]:
# Read the raw Encounters CSV: the first line is the header, and schema
# inference is switched off, so no extra pass over the data is made to guess types.
df = spark.read.options(header="true", inferSchema="false").csv(RAW_PATH)

# Render the loaded rows for a quick visual sanity check.
display(df)

# NOTE: RAW_PATH points at a single CSV file. If it pointed at a folder instead,
#       Spark would union all CSV files in that folder into a single DataFrame.
#       With schema inference disabled, Spark treats all columns as strings.

In [0]:
# Check for NULLs and uniqueness before Bronze ingestion.
#
# All metrics are computed in a single aggregation pass over `df`:
#   row_count          - total number of rows
#   distinct_id_count  - number of distinct non-NULL ID values
#   id_null_count      - rows where ID is NULL
#   patient_null_count - rows where PATIENT is NULL
#
# NULL counts use count(CASE WHEN ... THEN 1 END) instead of sum(...): sum over
# zero rows yields NULL (None in Python), which made an empty file fail with the
# misleading "Found NULLs in ID column" message. count() returns 0 in that case.
stats = df.agg(
    F.expr("count(*)").alias("row_count"),
    F.expr("count(distinct ID)").alias("distinct_id_count"),
    F.expr("count(CASE WHEN ID IS NULL THEN 1 END)").alias("id_null_count"),
    F.expr("count(CASE WHEN PATIENT IS NULL THEN 1 END)").alias("patient_null_count")
).first()

try:
    # An empty file is reported explicitly instead of surfacing as a NULL-count failure.
    assert stats.row_count          >  0,               "No rows found in Encounters CSV"
    assert stats.id_null_count      == 0,               "Found NULLs in ID column"
    assert stats.patient_null_count == 0,               "Found NULLs in PATIENT column"
    # count(distinct ID) skips NULLs, so this comparison only proves uniqueness
    # once the ID NULL check above has passed.
    assert stats.distinct_id_count  == stats.row_count, "Duplicate Encounter IDs"
    print("Validation passed:", stats.asDict())
except AssertionError as e:
    print("Validation failed:", e)
    # Bare raise re-raises the active exception with its original traceback.
    raise