data reading

Uncomment this when new columns are being added.

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark

In [0]:
# 1. ADLS (abfss) locations used by the bronze ingestion.

# Where the raw discharge files are picked up from.
bronze_source_path = "abfss://source@cajainterview.dfs.core.windows.net/discharge"

# Where the ingested data and any rejected records are written.
bronze_sink_path = "abfss://bronze@cajainterview.dfs.core.windows.net/dailydischarge"
bronze_bad_path = "abfss://bronze@cajainterview.dfs.core.windows.net/badrecords"

# Streaming state; the schema directory lives underneath the checkpoint directory.
bronze_checkpoint_dir = "abfss://bronze@cajainterview.dfs.core.windows.net/checkpoint_dailydischarge"
bronze_schema_dir = f"{bronze_checkpoint_dir}/schemas"


In [0]:
# 2. Reset streaming state: halt every active query, then remove the
#    checkpoint, schema and bad-record directories.
for active_query in spark.streams.active:
    active_query.stop()

# Order matters only in that bronze_schema_dir is nested under
# bronze_checkpoint_dir, so it has already been removed by the first deletion.
# NOTE(review): bronze_sink_path is not cleared here, so a later run that
# starts from an empty checkpoint appends to whatever is already in the sink
# -- confirm that is intended.
for stale_dir in (bronze_checkpoint_dir, bronze_schema_dir, bronze_bad_path):
    dbutils.fs.rm(stale_dir, recurse=True)


In [0]:
# 3. Stream-read the source CSV files with Auto Loader.
#    The schema is inferred (no explicit schema is supplied) and persisted
#    under bronze_schema_dir so later runs reuse it.
bronze_df = (
    spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Map CSV fields to columns by header name.
        .option("header", "true")
        # Auto Loader infers CSV columns as strings by default; its documented
        # switch for type inference is cloudFiles.inferColumnTypes (the plain
        # CSV `inferSchema` reader option is not the Auto Loader setting).
        # NOTE(review): files already in the sink that were written with
        # all-string columns may not match the newly inferred types -- verify
        # before re-running against an existing sink.
        .option("cloudFiles.inferColumnTypes", "true")
        # Persist the inferred schema between runs.
        .option("cloudFiles.schemaLocation", bronze_schema_dir)
        # Cap the number of new files per micro-batch. This limit is applied by
        # availableNow / processing-time triggers; a one-time (`once`) trigger
        # processes everything in a single batch regardless.
        .option("cloudFiles.maxFilesPerTrigger", "1")
        # Route malformed rows to a side location instead of failing the read.
        .option("badRecordsPath", bronze_bad_path)
        .load(bronze_source_path)
)

In [0]:
# Print the schema of the streaming DataFrame defined by the readStream cell,
# before any writeStream is started.
# NOTE(review): "addNewColumns" presumably refers to Auto Loader's schema
# evolution mode; it is not set explicitly in the read options -- confirm.
bronze_df.printSchema()


data writing

In [0]:
# 5. Append the stream to the bronze sink as Parquet files.
#    - availableNow processes everything currently pending and then stops,
#      while still honouring per-batch rate limits such as
#      cloudFiles.maxFilesPerTrigger (the deprecated `once` trigger ignores them).
#    - The query handle is kept and awaited so that the following cells, which
#      batch-read the sink, only run after the stream has finished writing.
bronze_query = (
    bronze_df.writeStream
        .format("parquet")
        .outputMode("append")
        .option("checkpointLocation", f"{bronze_checkpoint_dir}/checkpoint")
        .trigger(availableNow=True)
        .start(bronze_sink_path)
)

# Block until all available input has been processed; re-raises if the query failed.
bronze_query.awaitTermination()

In [0]:
# Batch-read the Parquet files from the bronze sink. The configured
# bronze_sink_path is reused (instead of repeating the URI) so the read
# location cannot drift from the location the stream writes to.
df_parquet = spark.read.format("parquet").load(bronze_sink_path)

In [0]:
# Print the schema Spark resolved for the Parquet data loaded into df_parquet.
df_parquet.printSchema()

In [0]:
from pyspark.sql import functions as F

# Period is parsed with the day/month/year pattern below into a DateType
# column so that min/max compare chronologically rather than as text.
period_as_date = F.to_date(F.col("Period"), "dd/MM/yyyy")
df2 = df_parquet.withColumn("Period_date", period_as_date)

# Earliest and latest parsed period across the whole DataFrame.
earliest_period = F.min("Period_date").alias("min_period")
latest_period = F.max("Period_date").alias("max_period")
min_max = df2.agg(earliest_period, latest_period)

min_max.show(truncate=False)


In [0]:
# NOTE: `sum` here is Spark's aggregate function, not the Python builtin.
from pyspark.sql.functions import col, sum

# One aggregate per column: cast the is-null flag to 0/1 and add it up,
# keeping the original column name as the output name.
null_count_exprs = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_parquet.columns
]

null_counts = df_parquet.select(*null_count_exprs)
display(null_counts)

In [0]:
# Number of rows for each distinct value of the Period column.
rows_by_period = df_parquet.groupBy("Period")
unique_periods = rows_by_period.count()
display(unique_periods)

In [0]:
# Number of rows for each distinct value of the _rescued_data column.
rows_by_rescued = df_parquet.groupBy("_rescued_data")
unique_r = rows_by_rescued.count()
display(unique_r)