<a href="https://colab.research.google.com/github/moinshaikh6872/Modern-Azure-Data-Pipeline-On-Premises-to-Delta-Lake-Analytics/blob/main/Bronze_to_Silver_PySpark_Transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Databricks Notebook: Bronze to Silver Transformation

# This notebook performs initial cleansing and standardization on raw data
# from the Bronze layer and writes the refined data to the Silver Delta Lake layer.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, date_format, to_timestamp

# 1. Configuration Setup
# NOTE: These locations are hard-coded here; in a real Databricks environment
# they would likely arrive as widgets/parameters supplied by Azure Data Factory.

# Root of the Azure Data Lake Storage Gen2 (ADLS Gen2) mount point.
# The trailing slash matters: the layer paths below are appended directly to it.
BASE_PATH = "/mnt/adlssynapse/"

# Source (Bronze) and destination (Silver) locations for the sales data.
BRONZE_PATH = BASE_PATH + "bronze/sales_data"
SILVER_PATH = BASE_PATH + "silver/sales_data_cleaned"

# 2. Read Raw Data from Bronze Layer
# Reads all Parquet files for the 'sales_data' table in the Bronze container
# into `df_bronze`, then reports how many records were loaded.
try:
    print(f"Reading data from Bronze Layer: {BRONZE_PATH}")

    # Reading Parquet format, assuming raw ingestion from SQL Server was successful
    df_bronze = spark.read.parquet(BRONZE_PATH)

    # count() is a Spark action that scans the data, so run it exactly once
    # and reuse the result for both the emptiness check and the log message.
    bronze_record_count = df_bronze.count()

    if bronze_record_count == 0:
        # This is only a warning: execution continues past this block, so the
        # later write step still runs and overwrites the Silver location with
        # an empty dataset.
        print("Warning: No data found in the Bronze layer.")
    else:
        print(f"Successfully loaded {bronze_record_count} records.")

except Exception as e:
    print(f"Error reading Bronze data: {e}")
    # Re-raise the original exception so the run fails (and with it the ADF pipeline)
    raise

# 3. Apply Transformation Logic (Cleansing & Standardization)
# Transformation Logic:
# a. Convert 'ModifiedDate' (assumed to be a UTC timestamp string/type) to a
#    YYYY-MM-DD string, exposed as 'Standardized_ModifiedDate'.
# b. Add a processing timestamp ('Processing_Timestamp') for lineage.
# The original 'ModifiedDate' column is dropped once it has been standardized.

df_silver = df_bronze.select(
    # Keep every original column
    col("*"),

    # a. Date format standardization.
    # NOTE(review): the parse pattern assumes the typical SQL Server layout with
    # exactly three fractional-second digits when 'ModifiedDate' is a string;
    # confirm against the actual Bronze schema.
    date_format(
        to_timestamp(col("ModifiedDate"), "yyyy-MM-dd HH:mm:ss.SSS"),
        "yyyy-MM-dd"
    ).alias("Standardized_ModifiedDate"),

    # b. Lineage: timestamp of the Spark query that produced these rows.
    # NOTE(review): assumes Bronze has no existing 'Processing_Timestamp' column.
    current_timestamp().alias("Processing_Timestamp"),

).drop("ModifiedDate")  # Drop the original column after standardization

# 4. Write Data to Silver Layer (Delta Lake Format)
print(f"Writing transformed data to Silver Layer: {SILVER_PATH}")

# Delta is the target format (ACID properties, schema evolution, better performance).
# 'overwrite' replaces whatever is already at the Silver location and is used
# here for simplicity; production pipelines commonly use 'append' or 'merge'
# (UPSERT) instead. 'overwriteSchema' lets the stored schema be replaced along
# with the data.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.save(SILVER_PATH)

print("Bronze to Silver transformation completed successfully.")

# 5. Verification (Optional)
# df_check = spark.read.format("delta").load(SILVER_PATH)
# print("Schema of Silver Layer Data:")
# df_check.printSchema()
# print(f"First 5 rows of Silver Layer Data:")
# df_check.show(5)