## Silver Layer - Quality Assurance
#### Goal: cleanse and transform the data in the bronze_table through the following steps:
###### * Deduplicate data
###### * Convert data types
###### * Make data business-ready
###### * Data quality checks
###### * Handle nulls & invalid data
###### * Create fact & dimension tables

In [0]:
# import modules
from pyspark.sql.functions import (
    col, to_date, lit, current_timestamp, datediff, when, coalesce, concat_ws, trim, upper, lower,
    row_number, desc, array, round
)

from pyspark.sql.window import Window

from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DecimalType, DoubleType, DateType, TimestampType
)

from delta.tables import DeltaTable


#### Run Config Notebook

In [0]:
%run ../../configs/config_notebook

In [0]:
# Look up the source (bronze) and target (silver) table names in the TABLES
# mapping. TABLES is not defined in this notebook; presumably it comes from the
# config notebook executed by the %run cell above — TODO confirm.
BRONZE_TABLE = TABLES["orders_bronze"]
SILVER_TABLE = TABLES["orders_silver"]


#### Read from bronze table

In [0]:
# Load the bronze table as a DataFrame; the deduplication, casting and
# data-quality steps below are all derived from this DataFrame.
df_bronze = spark.table(BRONZE_TABLE)


#### Deduplication
###### * We use row_id as primary key
###### * The bronze_table may contain duplicate row_ids as a result of file reupload.
###### * We dedupe the data, keeping only the latest record per row_id (ordered by ingestion_timestamp, descending).

In [0]:
# Rank the rows of each row_id from the most recent ingestion to the oldest.
# source_file is a secondary sort key: rows that share the same
# ingestion_timestamp would otherwise tie, and row_number() would then keep an
# arbitrary one of them, which can differ from run to run.
window_partition = Window.partitionBy("row_id").orderBy(
    desc("ingestion_timestamp"),
    desc("source_file"),
)

# Keep only the top-ranked (latest) row per row_id; the helper ranking column
# is dropped again so the schema matches the bronze table.
df_deduped = (
    df_bronze
    .withColumn("row_number", row_number().over(window_partition))
    .filter(col("row_number") == 1)
    .drop("row_number")
)

# Verify deduplication by comparing row counts before and after.
# Each count() is a separate Spark action.
bronze_count = df_bronze.count()
deduped_count = df_deduped.count()
duplicates_removed = bronze_count - deduped_count

# Show results
print(f"Bronze count: {bronze_count}")
print(f"Deduplicated count: {deduped_count}")
print(f"Duplicates removed: {duplicates_removed}")


#### Data Type Conversion
###### * We use explicit type casting to convert dates and numeric data types
###### * We also trim and standardize columns 

In [0]:
# Explicit data type conversion, driven by column lists instead of one
# withColumn call per line. Every column is replaced in place, so the column
# order of df_deduped is preserved.

# Pattern handed to to_date() for the date columns
date_pattern = "MM/dd/yyyy"

# String columns that get surrounding whitespace removed
text_columns = [
    "order_id", "ship_mode", "customer_id", "customer_name", "segment",
    "country", "city", "state", "postal_code", "region",
    "product_id", "category", "sub_category", "product_name",
]

df_datatype = df_deduped

# Date conversions
for date_column in ("order_date", "ship_date"):
    df_datatype = df_datatype.withColumn(
        date_column, to_date(col(date_column), date_pattern)
    )

# Integer conversions
for int_column in ("row_id", "quantity"):
    df_datatype = df_datatype.withColumn(
        int_column, col(int_column).cast(IntegerType())
    )

# Double conversions, rounded to 2 decimal places
for amount_column in ("sales", "discount", "profit"):
    df_datatype = df_datatype.withColumn(
        amount_column, round(col(amount_column).cast(DoubleType()), 2)
    )

# String trimming and standardization
for text_column in text_columns:
    df_datatype = df_datatype.withColumn(text_column, trim(col(text_column)))


#### Data Quality
###### Goal is to build data quality checks to validate critical columns, such as:
###### * Check for null or missing values
###### * Numeric values checks
###### * Business logic checks

In [0]:
def _is_missing_text(column_name):
    """Return a boolean Column that is True when the string column is NULL or empty.

    The identifier columns are trimmed during type conversion, so a
    whitespace-only source value reaches this step as "" rather than NULL.
    A plain isNull() check would let such a value pass as present.
    """
    text_value = col(column_name)
    # For a NULL value the equality yields NULL, but TRUE OR NULL is TRUE,
    # so the result is never NULL.
    return text_value.isNull() | (text_value == "")


# Missing-value checks on critical fields.
# NOTE(review): row_id, the MERGE key, is not validated here. A NULL row_id can
# never satisfy "target.row_id = source.row_id" — consider flagging it as well.
df_data_quality = (
    df_datatype
    .withColumn("missing_order_id", _is_missing_text("order_id"))
    .withColumn("missing_order_date", col("order_date").isNull())
    .withColumn("missing_customer_id", _is_missing_text("customer_id"))
    .withColumn("missing_product_id", _is_missing_text("product_id"))
)

# Null values check for numeric fields
invalid_qty = col("quantity").isNull()
invalid_sales = col("sales").isNull()
df_data_quality = (
    df_data_quality
    .withColumn("invalid_sales_value", invalid_sales)
    .withColumn("invalid_quantity_value", invalid_qty)
)

# Business logic check: an order cannot ship before it was placed.
# The explicit isNotNull() guards keep the flag False (not NULL) when either
# date is missing, so the OR below never propagates NULL.
df_data_quality = (
    df_data_quality
    .withColumn("ship_before_order",
        (col("ship_date").isNotNull()) & (col("order_date").isNotNull()) &
        (col("ship_date") < col("order_date"))
    )
)

# Overall row validity flag: True when at least one individual check fired
df_data_quality = (
    df_data_quality
    .withColumn("has_dq_issue",
      col("missing_order_id") |
      col("missing_order_date") |
      col("missing_customer_id") |
      col("missing_product_id") |
      col("invalid_sales_value") |
      col("invalid_quantity_value") |
      col("ship_before_order")
    )
)

# Reason for the data quality issue. Only the first matching check is
# reported; rows without any issue get NULL because there is no otherwise().
df_data_quality = (
    df_data_quality
    .withColumn("dq_issue_reason",
      when(col("missing_order_id"), "MISSING_ORDER_ID")
      .when(col("missing_order_date"), "MISSING_ORDER_DATE")
      .when(col("missing_customer_id"), "MISSING_CUSTOMER_ID")
      .when(col("missing_product_id"), "MISSING_PRODUCT_ID")
      .when(col("invalid_sales_value"), "INVALID_SALES_VALUE")
      .when(col("invalid_quantity_value"), "INVALID_QUANTITY_VALUE")
      .when(col("ship_before_order"), "SHIP_BEFORE_ORDER")
    )
)

# Add audit columns: processing time of this run and the originating layer
df_data_quality = (
    df_data_quality
    .withColumn("silver_processed_timestamp", current_timestamp())
    .withColumn("ingested_from_layer", lit("bronze"))
)


In [0]:
# Final silver column layout, grouped by theme so the table is easy to browse.
# The individual data-quality flag columns are intentionally not listed; only
# the summary flag and the reason are carried into df_silver.
silver_columns = [
    # Primary key
    "row_id",
    # Order information
    "order_id", "order_date", "ship_date", "ship_mode",
    # Customer information
    "customer_id", "customer_name", "segment",
    # Location information
    "country", "city", "state", "postal_code", "region",
    # Product information
    "product_id", "product_name", "category", "sub_category",
    # Order details
    "sales", "quantity", "discount", "profit",
    # Data quality flags
    "has_dq_issue", "dq_issue_reason",
    # Audit columns
    "ingestion_timestamp", "source_file",
    "silver_processed_timestamp", "ingested_from_layer",
]

df_silver = df_data_quality.select(*silver_columns)

# count() triggers a Spark action over the full transformation chain
silver_total = df_silver.count()
print(f"Silver records total count: {silver_total}")


#### Silver Table
###### * Write cleansed & transformed data to silver table
###### * Ensure idempotency

In [0]:
# Upsert df_silver into the silver table: create the table on the first run,
# MERGE on row_id on every later run.
silver_exists = spark.catalog.tableExists(SILVER_TABLE)

if not silver_exists:
    print("Silver table does not exist. Creating orders_silver table...")

    # Initial load: write the DataFrame out as a new Delta table
    silver_writer = (
        df_silver.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    silver_writer.saveAsTable(SILVER_TABLE)

    print("✅ Silver table 'orders_silver' completed!")

else:
    print("Silver table exists. Performing MERGE...")

    delta_table = DeltaTable.forName(spark, SILVER_TABLE)

    # Rows whose row_id already exists in the target are updated in full;
    # all other source rows are inserted.
    merge_builder = delta_table.alias("target").merge(
        df_silver.alias("source"),
        "target.row_id = source.row_id",
    )
    merge_builder = merge_builder.whenMatchedUpdateAll()
    merge_builder = merge_builder.whenNotMatchedInsertAll()
    merge_builder.execute()

    print("✅ MERGE completed!")


#### Sanity check of '**orders_silver**' table

In [0]:
%sql
-- Sanity check on the silver orders table: order_date range plus volumes.
-- row_id is the row-level primary key, so its distinct count is reported as
-- total_rows; orders are counted from the separate order_id column.
SELECT 
  MIN(order_date) AS min_order_date, 
  MAX(order_date) AS max_order_date,
  COUNT(DISTINCT row_id) AS total_rows,
  COUNT(DISTINCT order_id) AS total_orders
FROM silver_dev.global_mart_retail.orders_silver