In [2]:
from pyspark.sql.functions import col, when, trim, upper, coalesce, lit, current_timestamp, md5, concat_ws
from pyspark.sql.types import *
from delta.tables import DeltaTable

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 4, Finished, Available, Finished)

In [4]:
# =============================================================================
# CONFIGURATION
# =============================================================================

trucks_path = "Files/bronze/sales/trucks.parquet"
delta_output_path = "Tables/dim_trucks"

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 6, Finished, Available, Finished)

In [5]:
# =============================================================================
# LOAD SOURCE DATA
# =============================================================================

print("Loading trucks data...")

# Load trucks table
df_trucks = spark.read.parquet(trucks_path)
print(f"Trucks loaded: {df_trucks.count()} records")

# Display sample data for debugging
print("\n--- Trucks Sample ---")
df_trucks.show(10, truncate=False)

# Check schema
print("\n--- Trucks Schema ---")
df_trucks.printSchema()

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 7, Finished, Available, Finished)

Loading trucks data...
Trucks loaded: 8 records

--- Trucks Sample ---
+-------+---------------+----------------------+---------------+
|TruckID|TruckIdentifier|Model                 |ManufactureYear|
+-------+---------------+----------------------+---------------+
|1      |EU-ICE-001     |Mercedes Sprinter 2017|2017           |
|2      |EU-ICE-002     |Mercedes Sprinter 2017|2017           |
|3      |EU-ICE-003     |Mercedes Sprinter 2017|2017           |
|4      |DE-TRK-101     |Mercedes Sprinter 2017|2017           |
|5      |FR-TRK-202     |Mercedes Sprinter 2017|2017           |
|6      |IT-VAN-303     |Mercedes Sprinter 2017|2017           |
|7      |ES-FOOD-404    |Mercedes Sprinter 2017|2017           |
|8      |BE-COOL-007    |Mercedes Sprinter 2017|2017           |
+-------+---------------+----------------------+---------------+


--- Trucks Schema ---
root
 |-- TruckID: integer (nullable = true)
 |-- TruckIdentifier: string (nullable = true)
 |-- Model: string (nullable = tr

In [6]:
# =============================================================================
# DATA QUALITY CHECKS - BEFORE CLEANING
# =============================================================================

print("\n--- Data Quality Analysis ---")

# Check for nulls in key fields
print("Trucks Key Field Analysis:")
print(f"Null TruckIDs: {df_trucks.filter(col('TruckID').isNull()).count()}")
print(f"Null TruckIdentifiers: {df_trucks.filter(col('TruckIdentifier').isNull()).count()}")
print(f"Null Models: {df_trucks.filter(col('Model').isNull()).count()}")
print(f"Null ManufactureYears: {df_trucks.filter(col('ManufactureYear').isNull()).count()}")

# Check for duplicates
print(f"Duplicate TruckIDs: {df_trucks.count() - df_trucks.dropDuplicates(['TruckID']).count()}")
print(f"Duplicate TruckIdentifiers: {df_trucks.count() - df_trucks.dropDuplicates(['TruckIdentifier']).count()}")

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 8, Finished, Available, Finished)


--- Data Quality Analysis ---
Trucks Key Field Analysis:
Null TruckIDs: 0
Null TruckIdentifiers: 0
Null Models: 0
Null ManufactureYears: 0
Duplicate TruckIDs: 0
Duplicate TruckIdentifiers: 0


In [7]:
# =============================================================================
# DATA CLEANING - TRUCKS
# =============================================================================

print("\nStarting trucks data cleaning...")

df_trucks_clean = df_trucks \
    .filter(col("TruckID").isNotNull()) \
    .withColumn("TruckIdentifier", 
               when(col("TruckIdentifier").isNull(), "UNKNOWN")
               .otherwise(upper(trim(col("TruckIdentifier"))))) \
    .withColumn("Model", 
               when(col("Model").isNull(), "Unknown Model")
               .otherwise(trim(col("Model")))) \
    .withColumn("ManufactureYear", 
               when(col("ManufactureYear").isNull(), 0)
               .when(col("ManufactureYear") < 1900, 0)
               .when(col("ManufactureYear") > 2030, 0)
               .otherwise(col("ManufactureYear"))) \
    .filter(col("TruckIdentifier") != "") \
    .filter(col("Model") != "") \
    .dropDuplicates(["TruckID"])

print(f"Trucks after cleaning: {df_trucks_clean.count()} records")

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 9, Finished, Available, Finished)


Starting trucks data cleaning...
Trucks after cleaning: 8 records


In [8]:
# Check for cleaning issues
print("\n--- Trucks Cleaning Check ---")
print(f"Unknown truck identifiers: {df_trucks_clean.filter(col('TruckIdentifier') == 'UNKNOWN').count()}")
print(f"Unknown models: {df_trucks_clean.filter(col('Model') == 'Unknown Model').count()}")
print(f"Invalid manufacture years (0): {df_trucks_clean.filter(col('ManufactureYear') == 0).count()}")

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 10, Finished, Available, Finished)


--- Trucks Cleaning Check ---
Unknown truck identifiers: 0
Unknown models: 0
Invalid manufacture years (0): 0


In [9]:
# =============================================================================
# CREATE TRUCKS DIMENSION
# =============================================================================

print("\nCreating trucks dimension...")

# Create enhanced trucks dimension
dim_trucks = df_trucks_clean \
    .select(
        # Primary Key
        col("TruckID").alias("truck_id"),
        
        # Truck Information
        col("TruckIdentifier").alias("truck_identifier"),
        col("Model").alias("truck_model"),
        col("ManufactureYear").alias("manufacture_year"),
        
        # Derived Fields
        when(col("ManufactureYear") > 0, 
             2025 - col("ManufactureYear")).otherwise(0).alias("truck_age_years"),
        
        # Truck Categories based on age
        when(col("ManufactureYear") == 0, "Unknown Age")
        .when(2025 - col("ManufactureYear") <= 3, "New (0-3 years)")
        .when(2025 - col("ManufactureYear") <= 7, "Recent (4-7 years)")
        .when(2025 - col("ManufactureYear") <= 15, "Mature (8-15 years)")
        .otherwise("Veteran (15+ years)").alias("truck_age_category"),
        
        # Brand extraction from model
        when(col("Model").contains("Mercedes"), "Mercedes")
        .when(col("Model").contains("Ford"), "Ford")
        .when(col("Model").contains("Chevrolet"), "Chevrolet")
        .when(col("Model").contains("Iveco"), "Iveco")
        .when(col("Model").contains("Volvo"), "Volvo")
        .otherwise("Other").alias("truck_brand"),
        
        # Composite identifier for reporting
        concat_ws(" - ", col("TruckIdentifier"), col("Model")).alias("truck_display_name")
        
    ) \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("updated_at", current_timestamp()) \
    .withColumn("record_hash", 
               md5(concat_ws("|", 
                           col("truck_id"),
                           col("truck_identifier"),
                           col("truck_model"),
                           col("manufacture_year"))))

print(f"Trucks dimension created: {dim_trucks.count()} records")

# Display sample of trucks dimension
print("\n--- Trucks Dimension Sample ---")
dim_trucks.show(10, truncate=False)

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 11, Finished, Available, Finished)


Creating trucks dimension...
Trucks dimension created: 8 records

--- Trucks Dimension Sample ---
+--------+----------------+----------------------+----------------+---------------+-------------------+-----------+------------------------------------+-------------------------+-------------------------+--------------------------------+
|truck_id|truck_identifier|truck_model           |manufacture_year|truck_age_years|truck_age_category |truck_brand|truck_display_name                  |created_at               |updated_at               |record_hash                     |
+--------+----------------+----------------------+----------------+---------------+-------------------+-----------+------------------------------------+-------------------------+-------------------------+--------------------------------+
|1       |EU-ICE-001      |Mercedes Sprinter 2017|2017            |8              |Mature (8-15 years)|Mercedes   |EU-ICE-001 - Mercedes Sprinter 2017 |2025-05-29 19:22:22.43234|2025-05-2

In [10]:
# =============================================================================
# DATA ANALYSIS
# =============================================================================

print("\n--- Trucks Analysis ---")

# Age distribution
print("Truck Age Distribution:")
age_distribution = dim_trucks.groupBy("truck_age_category") \
    .count() \
    .orderBy("count", ascending=False)
age_distribution.show(truncate=False)

# Brand distribution
print("Truck Brand Distribution:")
brand_distribution = dim_trucks.groupBy("truck_brand") \
    .count() \
    .orderBy("count", ascending=False)
brand_distribution.show(truncate=False)

# Manufacture year distribution
print("Manufacture Year Distribution:")
year_distribution = dim_trucks.groupBy("manufacture_year") \
    .count() \
    .orderBy("manufacture_year")
year_distribution.show(truncate=False)

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 12, Finished, Available, Finished)


--- Trucks Analysis ---
Truck Age Distribution:
+-------------------+-----+
|truck_age_category |count|
+-------------------+-----+
|Mature (8-15 years)|8    |
+-------------------+-----+

Truck Brand Distribution:
+-----------+-----+
|truck_brand|count|
+-----------+-----+
|Mercedes   |8    |
+-----------+-----+

Manufacture Year Distribution:
+----------------+-----+
|manufacture_year|count|
+----------------+-----+
|2017            |8    |
+----------------+-----+



In [11]:
# =============================================================================
# DATA QUALITY VALIDATION
# =============================================================================

print("\n--- Trucks Dimension Validation ---")

# Check for data integrity
total_records = dim_trucks.count()
null_truck_ids = dim_trucks.filter(col("truck_id").isNull()).count()
null_identifiers = dim_trucks.filter(col("truck_identifier").isNull()).count()
unknown_identifiers = dim_trucks.filter(col("truck_identifier") == "UNKNOWN").count()
unknown_models = dim_trucks.filter(col("truck_model") == "Unknown Model").count()

print(f"Total truck records: {total_records}")
print(f"Null truck IDs: {null_truck_ids}")
print(f"Null identifiers: {null_identifiers}")
print(f"Unknown identifiers: {unknown_identifiers}")
print(f"Unknown models: {unknown_models}")

# Age validation
print(f"Trucks with valid age: {dim_trucks.filter(col('truck_age_years') > 0).count()}")
print(f"Average truck age: {dim_trucks.filter(col('truck_age_years') > 0).agg({'truck_age_years': 'avg'}).collect()[0][0]:.1f} years")


StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 13, Finished, Available, Finished)


--- Trucks Dimension Validation ---
Total truck records: 8
Null truck IDs: 0
Null identifiers: 0
Unknown identifiers: 0
Unknown models: 0
Trucks with valid age: 8
Average truck age: 8.0 years


In [12]:
# =============================================================================
# SAVE AS DELTA TABLE (INCREMENTAL)
# =============================================================================

print(f"\nSaving trucks dimension to: {delta_output_path}")

# Check if Delta table already exists
if DeltaTable.isDeltaTable(spark, delta_output_path):
    print("Existing Delta table found. Executing merge...")
    
    # Load existing Delta table
    delta_table = DeltaTable.forPath(spark, delta_output_path)
    
    # Execute merge (upsert) based on truck_id
    delta_table.alias("target") \
        .merge(dim_trucks.alias("source"), 
               "target.truck_id = source.truck_id") \
        .whenMatchedUpdate(
            condition="target.record_hash != source.record_hash",
            set={
                "truck_identifier": "source.truck_identifier",
                "truck_model": "source.truck_model",
                "manufacture_year": "source.manufacture_year",
                "truck_age_years": "source.truck_age_years",
                "truck_age_category": "source.truck_age_category",
                "truck_brand": "source.truck_brand",
                "truck_display_name": "source.truck_display_name",
                "updated_at": "source.updated_at",
                "record_hash": "source.record_hash"
            }) \
        .whenNotMatchedInsertAll() \
        .execute()
        
    print("Merge executed successfully!")
    
else:
    print("Creating new Delta table...")
    dim_trucks.write \
        .format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", "true") \
        .save(delta_output_path)
    print("Delta table created successfully!")

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 14, Finished, Available, Finished)


Saving trucks dimension to: Tables/dim_trucks
Creating new Delta table...
Delta table created successfully!


In [13]:
# =============================================================================
# OPTIMIZE DELTA TABLE
# =============================================================================

print("\nOptimizing Delta table...")
try:
    spark.sql(f"OPTIMIZE delta.`{delta_output_path}`")
    print("Optimization completed!")
except Exception as e:
    print(f"Optimization error: {str(e)}")

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 15, Finished, Available, Finished)


Optimizing Delta table...
Optimization completed!


In [14]:
# =============================================================================
# FINAL DATA QUALITY CHECKS
# =============================================================================

print("\nRunning final data quality checks...")

# Load the saved table for verification
df_final = spark.read.format("delta").load(delta_output_path)

# Final statistics
final_total_records = df_final.count()
distinct_trucks = df_final.select("truck_id").distinct().count()
distinct_identifiers = df_final.select("truck_identifier").distinct().count()

print(f"Final total records: {final_total_records}")
print(f"Distinct truck IDs: {distinct_trucks}")
print(f"Distinct truck identifiers: {distinct_identifiers}")

# Show final data sample
print("\n--- Final Trucks Dimension Sample ---")
df_final.show(10, truncate=False)

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 16, Finished, Available, Finished)


Running final data quality checks...
Final total records: 8
Distinct truck IDs: 8
Distinct truck identifiers: 8

--- Final Trucks Dimension Sample ---
+--------+----------------+----------------------+----------------+---------------+-------------------+-----------+------------------------------------+--------------------------+--------------------------+--------------------------------+
|truck_id|truck_identifier|truck_model           |manufacture_year|truck_age_years|truck_age_category |truck_brand|truck_display_name                  |created_at                |updated_at                |record_hash                     |
+--------+----------------+----------------------+----------------+---------------+-------------------+-----------+------------------------------------+--------------------------+--------------------------+--------------------------------+
|1       |EU-ICE-001      |Mercedes Sprinter 2017|2017            |8              |Mature (8-15 years)|Mercedes   |EU-ICE-001 - 

In [15]:

# =============================================================================
# SUMMARY
# =============================================================================

print("="*50)
print("TRUCKS DIMENSION ETL COMPLETED SUCCESSFULLY!")
print(f"Records processed: {final_total_records}")
print(f"Table saved at: {delta_output_path}")
print("="*50)

# Summary dictionary for further analysis
summary_stats = {
    "total_records": final_total_records,
    "distinct_trucks": distinct_trucks,
    "distinct_identifiers": distinct_identifiers,
    "table_path": delta_output_path
}

print(f"Summary: {summary_stats}")

# Show truck fleet summary
print("\n--- Fleet Summary ---")
fleet_summary = df_final.agg({
    'truck_age_years': 'avg',
    'manufacture_year': 'min', 
    'manufacture_year': 'max'
}).collect()[0]

avg_age = fleet_summary[0] if fleet_summary[0] else 0
print(f"Fleet average age: {avg_age:.1f} years")
print(f"Oldest truck year: {df_final.filter(col('manufacture_year') > 0).agg({'manufacture_year': 'min'}).collect()[0][0]}")
print(f"Newest truck year: {df_final.agg({'manufacture_year': 'max'}).collect()[0][0]}")

StatementMeta(, 051060dc-c1d6-45f7-84bf-f5f112c22bcd, 17, Finished, Available, Finished)

TRUCKS DIMENSION ETL COMPLETED SUCCESSFULLY!
Records processed: 8
Table saved at: Tables/dim_trucks
Summary: {'total_records': 8, 'distinct_trucks': 8, 'distinct_identifiers': 8, 'table_path': 'Tables/dim_trucks'}

--- Fleet Summary ---
Fleet average age: 2017.0 years
Oldest truck year: 2017
Newest truck year: 2017
