In [0]:
# 00_validate_raw_encounters_CSV.ipynb
# -------------------------------------------------------
# Cheap 10-row smoke test for the encounters CSV file:
# confirms the file can be read, has exactly 10 rows, ID is unique and
# non-null, and PATIENT is non-null.

from pyspark.sql import SparkSession, functions as F

# Path config: location of the raw encounters CSV that this notebook reads
# and validates.
RAW_PATH = "dbfs:/kardia/raw/encounters/encounters_10.csv"

In [0]:
# Obtain (or reuse) the Spark session used by this smoke test.
# Shuffle partitions are pinned to 1 via the session config.
session_builder = SparkSession.builder.appName("kardia_val_encounters_10")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "1")
spark = session_builder.getOrCreate()

# Keep the notebook output quiet: only ERROR-level Spark logs are shown.
spark.sparkContext.setLogLevel("ERROR")

In [0]:
# Load the raw encounters CSV using its header row for column names.
# Schema inference is explicitly switched off for this read.
csv_reader = spark.read.option("header", True).option("inferSchema", False)

# Cache the frame: it is displayed here and aggregated again further down.
df = csv_reader.csv(RAW_PATH).cache()

display(df)

In [0]:
# Aggregate key metrics in one pass and retrieve the results.
# NULL tallies use count(when(...)) instead of sum(cast(...)): SUM over zero
# rows yields NULL (None in Python) whereas COUNT yields 0, so the tallies
# remain integers even when the file contains a header but no data rows.
metrics = (df.agg(
              F.count(F.lit(1)).alias("row_cnt"),
              F.count(F.when(F.col("ID").isNull(), 1)).alias("null_id_n"),
              F.countDistinct("ID").alias("distinct_id_n"),
              F.count(F.when(F.col("PATIENT").isNull(), 1)).alias("null_patient_n")
          ).first())  # a global agg() (no groupBy) produces a single result row

# Extract metrics into Python variables
row_cnt        = metrics.row_cnt         # total number of rows read
null_id_n      = metrics.null_id_n       # rows whose ID is NULL
distinct_id_n  = metrics.distinct_id_n   # distinct non-NULL ID values
null_patient_n = metrics.null_patient_n  # rows whose PATIENT is NULL

In [0]:
# Data quality rules, checked in order; the first failing rule raises
# AssertionError with its message.
checks = [
    (row_cnt == 10,            f"Expected 10 rows, found {row_cnt}"),
    (null_id_n == 0,           "ID column contains NULLs"),
    (distinct_id_n == row_cnt, "Duplicate Encounter IDs detected"),
    (null_patient_n == 0,      "PATIENT column contains NULLs"),
]

for passed, message in checks:
    assert passed, message

print("All encounter-file validation checks passed.")

In [0]:
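# Next step: proceed to the bronze ingest notebook.
# NOTE(review): the original note points at bronze_patients_ingest
# (`01_bronze/bronze_patients`), but this notebook validates encounters —
# confirm whether the encounters bronze ingest was intended.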