In [0]:
# SILVER LAYER – orderitems
# Purpose:
# - Enforce schema & data contracts
# - Quarantine rescued/malformed rows and exclude them from Silver
# - Deduplicate records
# - Publish clean Silver table
#
# Upstream: git_analysis.bronze.orderitems
# Downstream: git_analysis.silver.orderitems

In [0]:
# Importing necessary libraries

In [0]:
%python
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, StringType,DateType,TimestampType
)
from pyspark.sql.functions import col,lit, current_timestamp,current_date

# Set the Spark SQL session time zone to UTC for the current Spark session.
spark.conf.set("spark.sql.session.timeZone", "UTC")


In [0]:

# Define the expected schema; this is only necessary if it was not already defined at the bronze layer

In [0]:
# Expected column contract for order items, expressed with the upper-case
# source column names. ORDERITEMSID is the only non-nullable field.
# NOTE(review): this schema is not referenced anywhere else in the visible
# notebook — confirm whether it should be applied or removed.
orderitems_silver_schema = StructType([
    StructField("ORDERITEMSID", StringType(), nullable=False),
    StructField("ORDERID", StringType(), nullable=True),
    StructField("PRODUCTID", StringType(), nullable=True),
    StructField("QUANTITY", IntegerType(), nullable=True),
])

In [0]:
# Reading bronze table into a dataframe

In [0]:
# Load the full bronze order-items table as the input of this notebook.
orderitems_bronze_df = spark.read.table("git_analysis.bronze.orderitems")


In [0]:
# Save the rows whose "_rescued_data" column (added by Databricks in the bronze layer) is populated into a separate quarantine table


In [0]:
%python
# Rows whose "_rescued_data" column is populated are set aside for inspection.
orderitems_quarantine_df = orderitems_bronze_df.filter(F.col("_rescued_data").isNotNull())

# The whole bronze table is re-read on every run, so appending would store the
# same rescued rows again each time this notebook executes. Overwriting keeps
# the quarantine table a snapshot of the rescued rows currently in bronze and
# makes the notebook safe to re-run.
# NOTE(review): if quarantine history must be retained independently of bronze,
# replace this with an incremental load instead — confirm with the table owners.
(
    orderitems_quarantine_df
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("git_analysis.silver.orderitems_quarantine")
)


In [0]:
# Keep only the rows whose "_rescued_data" column (added by Databricks in the bronze layer) is null

In [0]:
# Complement of the quarantine filter: rows with no rescued data continue on.
orderitems_clean_df = orderitems_bronze_df.where(F.col("_rescued_data").isNull())


In [0]:
# Cast the columns to their target types and rename them

In [0]:
       
# Project the four business columns: each (source column, target type, new name)
# triple below is cast and renamed to its snake_case silver name. Any other
# bronze column (including "_rescued_data") is dropped by this select.
orderitems_typed_df = orderitems_clean_df.select(
    *[
        F.col(source_name).cast(target_type).alias(silver_name)
        for source_name, target_type, silver_name in (
            ("ORDERITEMSID", "string", "orderitem_id"),
            ("ORDERID", "string", "order_id"),
            ("PRODUCTID", "string", "product_id"),
            ("QUANTITY", "integer", "quantity"),
        )
    ]
)


In [0]:
# Apply data-quality checks

In [0]:
%python
# Mandatory-field rule: discard every row in which any of the three keys or
# the quantity is null.
orderitems_dq_df = orderitems_typed_df.dropna(
    how="any",
    subset=["orderitem_id", "order_id", "product_id", "quantity"],
)

In [0]:
 
# Positive-quantity rule: keep only rows whose quantity is greater than zero.
orderitems_dq_df = orderitems_dq_df.where(F.col("quantity") > 0)

In [0]:
# Report how many rows were dropped by the data-quality rules

In [0]:
# Compare row counts before and after the DQ filters; each count() triggers
# its own Spark action.
typed_row_count = orderitems_typed_df.count()
dq_row_count = orderitems_dq_df.count()
bad_count = typed_row_count - dq_row_count

# Only emit the warning when at least one row was filtered out.
if bad_count > 0:
    print(f"⚠️ Dropped {bad_count} rows due to DQ rules")


In [0]:
# Check referential integrity against the silver orders table

In [0]:
# Silver orders table; only its order_id column is used as the reference key.
orders_df = spark.table("git_analysis.silver.orders")

# Inner join on order_id: order items whose order_id has no match in
# silver.orders are dropped from the DataFrame.
orderitems_dq_df = orderitems_dq_df.join(
    orders_df.select("order_id"), on="order_id", how="inner"
)


In [0]:
# Check for duplicate orderitem_id values and drop them

In [0]:
# Keep exactly one row per orderitem_id.
# Ordering the window by a constant leaves the surviving row undefined, so two
# runs over the same input could publish different data when duplicates differ
# in their other columns. Ordering by every remaining column makes the choice
# deterministic: rows that still tie are identical, so either one is equivalent.
# (Window and F come from the imports cell at the top of the notebook.)
window = Window.partitionBy("orderitem_id").orderBy(
    F.col("order_id"),
    F.col("product_id"),
    F.col("quantity"),
)

orderitems_dedup_df = (
    orderitems_dq_df
    .withColumn("rn", F.row_number().over(window))  # rank rows within each orderitem_id
    .filter(F.col("rn") == 1)                        # keep the first-ranked row only
    .drop("rn")                                      # helper column is not published
)


In [0]:
# Write the deduplicated DataFrame to the silver table

In [0]:
# Publish the deduplicated rows as a Delta table, replacing both the existing
# data and the existing schema of the target table on every run.
orderitems_dedup_df.write.saveAsTable(
    "git_analysis.silver.orderitems",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)
