In [0]:
# Storage account name plus the service-principal credentials, which are read
# from the "key-vault-secrets" secret scope rather than hardcoded here.
storage_account_name = "dlaqimddev"
client_id = dbutils.secrets.get(scope="key-vault-secrets", key="client-id")
client_secret = dbutils.secrets.get(scope="key-vault-secrets", key="client-secret")
tenant_id = dbutils.secrets.get(scope="key-vault-secrets", key="tenant-id")

# OAuth 2.0 (client credentials) settings for the ABFS driver. Every key is
# scoped to this storage account's dfs endpoint; insertion order is the order
# in which the settings are applied.
_adls_host = f"{storage_account_name}.dfs.core.windows.net"
_oauth_settings = {
    f"fs.azure.account.auth.type.{_adls_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{_adls_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{_adls_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{_adls_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{_adls_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for _conf_key, _conf_value in _oauth_settings.items():
    spark.conf.set(_conf_key, _conf_value)

In [0]:
from pyspark.sql.functions import col, explode, to_timestamp, current_timestamp, expr
from delta.tables import DeltaTable

In [0]:
# 1. Load the raw JSON files from the Bronze container. The wildcard spans
#    three directory levels under cpcb_aqi/ (the daily folder layout) and
#    picks up every *.json file beneath them.
bronze_path = (
    f"abfss://bronze@{storage_account_name}.dfs.core.windows.net"
    "/cpcb_aqi/*/*/*/*.json"
)
bronze_reader = spark.read.option("multiline", "true")
df_bronze = bronze_reader.json(bronze_path)

# 2. Flatten: one row per element of the 'records' array, with the fields of
#    each element promoted to top-level columns.
record_rows = df_bronze.select(explode(col("records")).alias("rec"))
df_records = record_rows.select("rec.*")

# Quick inspection of the flattened rows and the resulting column names.
df_records.show()
df_records.columns

In [0]:
# 3. Clean, Cast, and Filter
# Target SQL type for each measurement / coordinate column. try_cast is used
# so that a value that cannot be converted becomes NULL instead of failing.
numeric_casts = {
    "avg_value": "int",
    "max_value": "int",
    "min_value": "int",
    "latitude": "double",
    "longitude": "double",
}

# Parse the timestamp string first, then apply the casts in the order above.
df_typed = df_records.withColumn(
    "last_update", to_timestamp(col("last_update"), "dd-MM-yyyy HH:mm:ss")
)
for column_name, sql_type in numeric_casts.items():
    df_typed = df_typed.withColumn(
        column_name, expr(f"try_cast({column_name} as {sql_type})")
    )

# Keep only rows whose avg_value survived the cast, then stamp the load time.
df_silver = (
    df_typed
    .filter(col("avg_value").isNotNull())
    .withColumn("ingestion_timestamp", current_timestamp())
)

df_silver.show()

In [0]:
# 4. Save to Silver
silver_path = f"abfss://silver@{storage_account_name}.dfs.core.windows.net/cpcb_aqi/"

# 4.1 Deduplicate the incoming batch first
# station + pollutant_id + last_update is the unique key for Silver. The same
# list drives both the de-duplication and the merge condition so the two can
# never drift apart.
merge_keys = ["station", "pollutant_id", "last_update"]
df_silver_unique = df_silver.dropDuplicates(merge_keys)

display(df_silver_unique.limit(10))

# Null-safe equality (<=>) on every key column. With plain '=', a row whose key
# contains NULL (for example a last_update that ended up NULL) compares as
# NULL rather than true, never matches the target, and is therefore inserted
# again on every run. dropDuplicates groups NULL keys together, so the source
# still carries at most one row per key under null-safe comparison.
merge_condition = " AND ".join(
    f"target.{key} <=> source.{key}" for key in merge_keys
)

# 4.2 Check if Silver Delta table exists
if DeltaTable.isDeltaTable(spark, silver_path):
    silver_table = DeltaTable.forPath(spark, silver_path)

    # 4.3 Perform the Merge (Upsert): overwrite every column of matching rows,
    # insert rows whose key is not yet present in Silver.
    (
        silver_table.alias("target")
        .merge(df_silver_unique.alias("source"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print("Bronze data successfully merged into Silver layer.")
else:
    # 4.4 Create table for the first time
    df_silver_unique.write.format("delta").mode("overwrite").save(silver_path)
    print("Silver table created and initial data loaded.")