In [0]:
# List the contents of dbfs:/kardia/raw/ and render the listing in the notebook.
raw_dir_listing = dbutils.fs.ls("dbfs:/kardia/raw/")
display(raw_dir_listing)


In [0]:
# ------------------------------------------------------------
# KardiaFlow – micro-inspection of 1K-row CSV  (cost-safe)
# ------------------------------------------------------------
# Reads the raw patients CSV as plain strings, then prints the first five
# rows, the column names, and the number of partitions planned for the file.

file_path = "dbfs:/kardia/raw/patients_1k.csv"

# Un-limited view of the file.
#   header=true       -> column names are taken from the first line
#   inferSchema=false -> no type-inference pass; every column is read as a string
source_df = (
    spark.read
         .option("header", "true")
         .option("inferSchema", "false")
         .csv(file_path)
)

# Five-row preview; only this small DataFrame is ever collected to the driver.
df = source_df.limit(5)

# Print the preview rows as plain Row objects (no fancy formatting).
for row in df.collect():
    print(row)

# Column names come from the already-resolved schema.
print("Columns:", df.columns)

# Measure partitions on the un-limited DataFrame. After limit(5) the plan
# funnels all rows into a single partition, so df.rdd.getNumPartitions()
# would report 1 no matter how Spark actually splits the input file.
print("Partitions:", source_df.rdd.getNumPartitions())


In [0]:
from pyspark.sql.functions import current_date, lit

# Source CSV, addressed through the dbfs: scheme rather than a local path.
file_path = "dbfs:/kardia/raw/patients_1k.csv"

# Load every column as a string (header row supplies the names, type
# inference is switched off) and stamp each row with the current date.
df = (
    spark.read
    .csv(file_path, header=True, inferSchema=False)
    .withColumn("load_date", current_date())
)

# Persist as an unpartitioned Delta table. mode("overwrite") replaces any
# existing data; overwriteSchema lets the overwrite replace the table schema too.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("kardia_patients_stage")
)


In [0]:
%sql SHOW TABLES
-- Lists the tables in the current schema; kardia_patients_stage should appear here once the write cell has run.