In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, trim, upper, coalesce, lit, current_timestamp, md5, concat_ws
from pyspark.sql.types import *
from delta.tables import DeltaTable

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 3, Finished, Available, Finished)

In [9]:
# Where the finished product dimension is written (Delta format).
delta_output_path = "Tables/dim_product"

# Source parquet files read by the load step: products and their type lookup.
products_path, product_types_path = (
    "Files/bronze/sales/products.parquet",
    "Files/bronze/sales/productypes.parquet",
)

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 11, Finished, Available, Finished)

In [4]:
print("Loading source data...")

# Read the products parquet file and report how many rows it holds.
df_products = spark.read.format("parquet").load(products_path)
products_loaded = df_products.count()
print(f"Products loaded: {products_loaded} records")

# Read the product types parquet file and report how many rows it holds.
df_product_types = spark.read.format("parquet").load(product_types_path)
product_types_loaded = df_product_types.count()
print(f"Product types loaded: {product_types_loaded} records")

# Print the first three rows of each source (untruncated) for debugging.
for sample_heading, sample_frame in (
    ("\n--- Products Sample ---", df_products),
    ("\n--- Product Types Sample ---", df_product_types),
):
    print(sample_heading)
    sample_frame.show(3, truncate=False)

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 6, Finished, Available, Finished)

Loading source data...
Products loaded: 13 records
Product types loaded: 7 records

--- Products Sample ---
+---------+-------------------------+-----------------------------------+-------------+-----------------+-------------+--------+
|ProductID|ProductName              |ProductDescription                 |ProductTypeID|StandardCostPrice|UnitOfMeasure|IsActive|
+---------+-------------------------+-----------------------------------+-------------+-----------------+-------------+--------+
|1        |Strawberry Blast Popsicle|Classic strawberry flavored ice pop|1            |0.50             |unit         |true    |
|2        |Lemon Zing Ice Pop       |Refreshing lemon ice pop           |1            |0.45             |unit         |true    |
|3        |Orange Sunshine Pop      |Bright orange flavored popsicle    |1            |0.55             |unit         |true    |
+---------+-------------------------+-----------------------------------+-------------+-----------------+-------------

In [5]:
print("\nStarting products data cleaning...")

# Clean the raw products frame:
#   * drop rows without a ProductID
#   * trim the free-text columns
#   * fill missing price / unit / active flag with defaults
#   * drop rows whose trimmed name is empty (a NULL name is dropped too, because
#     the `!= ""` comparison evaluates to NULL and filter() discards it)
#   * drop rows with a negative price
#   * keep a single row per ProductID
df_products_clean = (
    df_products
    .filter(col("ProductID").isNotNull())
    .withColumn("ProductName", trim(col("ProductName")))
    .withColumn("ProductDescription", trim(col("ProductDescription")))
    # NOTE(review): the 0.0 default is a double literal; if the source column is
    # a DECIMAL the result is widened to double -- confirm that is intended.
    .withColumn("StandardCostPrice",
                when(col("StandardCostPrice").isNull(), 0.0)
                .otherwise(col("StandardCostPrice")))
    # Apply the default *before* normalising so a missing unit becomes "UNIT",
    # the same casing every populated value gets (previously it stayed "unit").
    .withColumn("UnitOfMeasure",
                upper(trim(coalesce(col("UnitOfMeasure"), lit("unit")))))
    .withColumn("IsActive",
                when(col("IsActive").isNull(), True)
                .otherwise(col("IsActive")))
    .filter(col("ProductName") != "")
    .filter(col("StandardCostPrice") >= 0)
    # ProductID becomes the dimension key and the Delta merge key; mirror the
    # de-duplication applied to ProductTypeID so a repeated source key cannot
    # reach the merge. Which of the duplicate rows survives is arbitrary.
    .dropDuplicates(["ProductID"])
)

print(f"Products after cleaning: {df_products_clean.count()} records")

# Re-check the invariants the cleaning step is supposed to guarantee.
print("\n--- Products Cleaning Check ---")
print(f"Null ProductIDs: {df_products_clean.filter(col('ProductID').isNull()).count()}")
print(f"Empty ProductNames: {df_products_clean.filter(col('ProductName') == '').count()}")
print(f"Negative prices: {df_products_clean.filter(col('StandardCostPrice') < 0).count()}")

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 7, Finished, Available, Finished)


Starting products data cleaning...
Products after cleaning: 13 records

--- Products Cleaning Check ---
Null ProductIDs: 0
Empty ProductNames: 0
Negative prices: 0


In [6]:
print("\nStarting product types data cleaning...")

# Step by step: require a key, trim the name, require a non-empty name,
# then keep one row per ProductTypeID.
types_with_key = df_product_types.where(col("ProductTypeID").isNotNull())
types_trimmed = types_with_key.withColumn("ProductTypeName", trim(col("ProductTypeName")))
types_named = types_trimmed.where(col("ProductTypeName") != "")
df_product_types_clean = types_named.dropDuplicates(["ProductTypeID"])

cleaned_type_count = df_product_types_clean.count()
print(f"Product types after cleaning: {cleaned_type_count} records")

# Confirm that no NULL keys or empty names survived the cleaning.
print("\n--- Product Types Cleaning Check ---")
null_type_ids = df_product_types_clean.where(col("ProductTypeID").isNull()).count()
print(f"Null ProductTypeIDs: {null_type_ids}")
empty_type_names = df_product_types_clean.where(col("ProductTypeName") == "").count()
print(f"Empty ProductTypeNames: {empty_type_names}")

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 8, Finished, Available, Finished)


Starting product types data cleaning...
Product types after cleaning: 7 records

--- Product Types Cleaning Check ---
Null ProductTypeIDs: 0
Empty ProductTypeNames: 0


In [7]:
print("\nCreating product dimension...")

# Business attributes that feed the change-detection hash, in a fixed order.
hash_columns = [
    "product_id",
    "product_name",
    "product_description",
    "product_type_name",
    "standard_cost_price",
    "unit_of_measure",
    "is_active",
]

# concat_ws() skips NULL arguments, so a NULL attribute would shift the
# remaining fields left and let two different rows produce the same hash.
# Cast every input to string and substitute a placeholder for NULL so each
# attribute always occupies its own position. Rows without NULLs hash exactly
# as before.
hash_inputs = [
    coalesce(col(column_name).cast("string"), lit("__NULL__"))
    for column_name in hash_columns
]

# Left-join products to their type so every cleaned product is kept; products
# without a matching type get the type name "Unknown".
dim_product = (
    df_products_clean.alias("p")
    .join(df_product_types_clean.alias("pt"),
          col("p.ProductTypeID") == col("pt.ProductTypeID"),
          "left")
    .select(
        col("p.ProductID").alias("product_id"),
        col("p.ProductName").alias("product_name"),
        col("p.ProductDescription").alias("product_description"),
        coalesce(col("pt.ProductTypeName"), lit("Unknown")).alias("product_type_name"),
        col("p.StandardCostPrice").alias("standard_cost_price"),
        col("p.UnitOfMeasure").alias("unit_of_measure"),
        col("p.IsActive").alias("is_active")
    )
    # Audit timestamps for this load.
    .withColumn("created_at", current_timestamp())
    .withColumn("updated_at", current_timestamp())
    # Fingerprint of the business attributes, used downstream to detect changes.
    .withColumn("record_hash", md5(concat_ws("|", *hash_inputs)))
)

print(f"Dimension created: {dim_product.count()} records")

# Display sample of final dimension
print("\n--- Final Dimension Sample ---")
dim_product.show(5, truncate=False)

# Count the products that did not match any product type in the join.
print("\n--- Join Analysis ---")
print(f"Products with Unknown type: {dim_product.filter(col('product_type_name') == 'Unknown').count()}")


StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 9, Finished, Available, Finished)


Creating product dimension...
Dimension created: 13 records

--- Final Dimension Sample ---
+----------+-------------------------+-----------------------------------+-----------------+-------------------+---------------+---------+--------------------------+--------------------------+--------------------------------+
|product_id|product_name             |product_description                |product_type_name|standard_cost_price|unit_of_measure|is_active|created_at                |updated_at                |record_hash                     |
+----------+-------------------------+-----------------------------------+-----------------+-------------------+---------------+---------+--------------------------+--------------------------+--------------------------------+
|1         |Strawberry Blast Popsicle|Classic strawberry flavored ice pop|Popsicle         |0.5                |UNIT           |true     |2025-05-29 15:11:23.836068|2025-05-29 15:11:23.836068|2cd2af864a1892e202893ff34d7443b8|
|2 

In [10]:
print(f"\nSaving data to path: {delta_output_path}")

# Columns refreshed when an existing product has changed. product_id (the merge
# key) and created_at are deliberately absent so they keep their stored values.
merge_update_columns = [
    "product_name",
    "product_description",
    "product_type_name",
    "standard_cost_price",
    "unit_of_measure",
    "is_active",
    "updated_at",
    "record_hash",
]

# Upsert into the existing Delta table, or create it on the first run.
if DeltaTable.isDeltaTable(spark, delta_output_path):
    print("Existing Delta table found. Executing merge...")

    # Load existing Delta table
    delta_table = DeltaTable.forPath(spark, delta_output_path)

    # Merge on product_id. A matched row is rewritten only when its hash
    # differs; `<=>` is the null-safe equality operator, so a target row whose
    # record_hash is NULL also counts as changed (with `!=` the comparison
    # would evaluate to NULL and the row would never be updated).
    (
        delta_table.alias("target")
        .merge(dim_product.alias("source"),
               "target.product_id = source.product_id")
        .whenMatchedUpdate(
            condition="NOT (target.record_hash <=> source.record_hash)",
            set={name: f"source.{name}" for name in merge_update_columns})
        .whenNotMatchedInsertAll()
        .execute()
    )

    print("Merge executed successfully!")

else:
    print("Creating new Delta table...")
    dim_product.write \
        .format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", "true") \
        .save(delta_output_path)
    print("Delta table created successfully!")

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 12, Finished, Available, Finished)


Saving data to path: Tables/dim_product
Creating new Delta table...
Delta table created successfully!


In [12]:
print("\nRunning data quality checks...")

# Re-read the table from storage so the checks run against what was persisted.
df_final = spark.read.format("delta").load(delta_output_path)

# Row count, rows with a NULL product_name, and product_id values that occur
# more than once.
total_records = df_final.count()
null_product_names = df_final.filter(col("product_name").isNull()).count()
duplicate_keys = df_final.groupBy("product_id").count().filter(col("count") > 1).count()

print(f"Total records: {total_records}")
print(f"Null product names: {null_product_names}")
print(f"Duplicate keys: {duplicate_keys}")

# Show final data sample
print("\n--- Final Data Sample ---")
df_final.show(5, truncate=False)

# Enforce the checks instead of only printing them, so a bad load stops the
# notebook here rather than being reported as a success further down. This
# runs after the diagnostics above so they are still visible on failure.
failed_checks = []
if null_product_names > 0:
    failed_checks.append(f"{null_product_names} row(s) with a null product_name")
if duplicate_keys > 0:
    failed_checks.append(f"{duplicate_keys} duplicated product_id value(s)")
if failed_checks:
    raise ValueError("Data quality checks failed: " + "; ".join(failed_checks))

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 14, Finished, Available, Finished)


Running data quality checks...
Total records: 13
Null product names: 0
Duplicate keys: 0

--- Final Data Sample ---
+----------+-------------------------+-----------------------------------+-----------------+-------------------+---------------+---------+--------------------------+--------------------------+--------------------------------+
|product_id|product_name             |product_description                |product_type_name|standard_cost_price|unit_of_measure|is_active|created_at                |updated_at                |record_hash                     |
+----------+-------------------------+-----------------------------------+-----------------+-------------------+---------------+---------+--------------------------+--------------------------+--------------------------------+
|1         |Strawberry Blast Popsicle|Classic strawberry flavored ice pop|Popsicle         |0.5                |UNIT           |true     |2025-05-29 15:12:42.905331|2025-05-29 15:12:42.905331|2cd2af864a189

In [14]:
# Closing banner: a 50-character rule above and below the run summary lines.
banner_rule = "=" * 50
for banner_line in (
    banner_rule,
    "ETL COMPLETED SUCCESSFULLY!",
    f"Records processed: {total_records}",
    f"Table saved at: {delta_output_path}",
    banner_rule,
):
    print(banner_line)

# Gather the quality-check metrics and the output location in one dictionary
# so later cells can reuse them.
summary_stats = dict(
    total_records=total_records,
    null_product_names=null_product_names,
    duplicate_keys=duplicate_keys,
    table_path=delta_output_path,
)

print(f"Summary: {summary_stats}")

StatementMeta(, ab7b6008-f544-4c39-8216-a6572c4d355a, 16, Finished, Available, Finished)

ETL COMPLETED SUCCESSFULLY!
Records processed: 13
Table saved at: Tables/dim_product
Summary: {'total_records': 13, 'null_product_names': 0, 'duplicate_keys': 0, 'table_path': 'Tables/dim_product'}
