In [1]:
# =============================================================================
# FUEL PRICES FACT TABLE ETL - MS FABRIC NOTEBOOK V3
# Creates fact table with numerical measures only - no dimensions
# Focuses on 2025 data with incremental Delta processing
# =============================================================================

from pyspark.sql.functions import (
    col, when, coalesce, lit, current_timestamp, md5, concat_ws, to_date, 
    date_format, dayofweek, weekofyear, abs as spark_abs, year, month, 
    dayofmonth, quarter, row_number, avg, count, max as spark_max, 
    min as spark_min, sum, isnan, isnull
)
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Startup banner: notebook title, then scope / period / storage notes,
# framed by two 60-character rules. Emitted as one print call, one item per line.
print(
    "🚛 FUEL PRICES FACT TABLE ETL V3 - MS FABRIC",
    "=" * 60,
    "📋 Scope: Fact table only with numerical measures",
    "📅 Period: 2025 data focus",
    "💾 Storage: Delta Lake with incremental processing",
    "=" * 60,
    sep="\n",
)

StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 3, Finished, Available, Finished)

🚛 FUEL PRICES FACT TABLE ETL V3 - MS FABRIC
📋 Scope: Fact table only with numerical measures
📅 Period: 2025 data focus
💾 Storage: Delta Lake with incremental processing


In [2]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# Storage paths: the parquet source file and the table locations.
fuel_prices_path = "Files/bronze/road_fuel_prices_data/road_fuel_prices_data.parquet"
dim_date_path = "Tables/dim_date"
dim_fuel_types_path = "Tables/dim_fuel_types"
fact_fuel_prices_path = "Tables/fact_fuel_prices"

# Business rules
CURRENT_YEAR = 2025          # Year the cleaning step filters the data to
MIN_REASONABLE_PRICE = 0.0   # Floor for negative values
MAX_REASONABLE_PRICE = 10.0  # Cap for outlier detection
# NOTE(review): the source headers are labelled "(p/litre)" and "(% rate)".
# If those values are routinely above 10, this cap flattens every measure to
# 10.0 — confirm the intended unit/threshold against the real data.

# Echo the effective settings so each run records what it processed.
print(
    f"📁 Source: {fuel_prices_path}",
    f"🎯 Target: {fact_fuel_prices_path}",
    f"📊 Processing year: {CURRENT_YEAR}",
    sep="\n",
)

StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 4, Finished, Available, Finished)

📁 Source: Files/bronze/road_fuel_prices_data/road_fuel_prices_data.parquet
🎯 Target: Tables/fact_fuel_prices
📊 Processing year: 2025


In [3]:
# =============================================================================
# DATA EXTRACTION
# =============================================================================

import re


def sanitize_column_name(name):
    """Return a lowercase, identifier-friendly version of a source column name.

    Surrounding whitespace is stripped, then every run of non-word characters
    is replaced by a single underscore (so a header ending in ")" ends in "_").
    """
    return re.sub(r'\W+', '_', name.strip()).lower()


print(f"\n📊 EXTRACTING SOURCE DATA")
print("=" * 30)

try:
    df_fuel_raw = spark.read.parquet(fuel_prices_path)
    raw_count = df_fuel_raw.count()
    print(f"✅ Raw data loaded: {raw_count:,} records")

    # Column-name cleanup: map every original header to its sanitized form.
    sanitized_columns = {col_name: sanitize_column_name(col_name) for col_name in df_fuel_raw.columns}

    # Fail fast when two different headers collapse to the same sanitized name;
    # renaming would otherwise leave duplicate column names in the frame.
    originals_by_sanitized = {}
    for original_name, sanitized_name in sanitized_columns.items():
        originals_by_sanitized.setdefault(sanitized_name, []).append(original_name)
    name_collisions = {
        sanitized_name: originals
        for sanitized_name, originals in originals_by_sanitized.items()
        if len(originals) > 1
    }
    if name_collisions:
        raise ValueError(f"Column names collide after sanitization: {name_collisions}")

    for original_name, sanitized_name in sanitized_columns.items():
        df_fuel_raw = df_fuel_raw.withColumnRenamed(original_name, sanitized_name)

    print(f"🧼 Column names sanitized ({len(sanitized_columns)} total):")
    for i, (original, sanitized) in enumerate(sanitized_columns.items(), 1):
        print(f"  {i:2d}. {original} → {sanitized}")

    # Show the columns under their cleaned names
    print(f"\n📋 Available columns ({len(df_fuel_raw.columns)}):")
    for i, col_name in enumerate(df_fuel_raw.columns, 1):
        print(f"  {i:2d}. {col_name}")

    print(f"\n👀 Sample data (first 3 rows):")
    df_fuel_raw.show(3, truncate=False)

except Exception as e:
    # Report the failure in the cell output, then re-raise so the run stops here.
    print(f"❌ Error loading data: {str(e)}")
    raise


StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 5, Finished, Available, Finished)


📊 EXTRACTING SOURCE DATA
✅ Raw data loaded: 1,147 records
🧼 Column names sanitized (11 total):
   1. Date → date
   2.  ULSP:  Pump price (p/litre) → ulsp_pump_price_p_litre_
   3. ULSP:  Diff on previous WEEK (p/litre) → ulsp_diff_on_previous_week_p_litre_
   4.  ULSP: Diff on previous  YEAR (p/litre) → ulsp_diff_on_previous_year_p_litre_
   5. Duty rate ULSP (p/litre) → duty_rate_ulsp_p_litre_
   6. VAT (% rate) ULSP → vat_rate_ulsp
   7. ULSD: Pump price (p/litre) → ulsd_pump_price_p_litre_
   8. ULSD: Diff on previous WEEK (p/litre) → ulsd_diff_on_previous_week_p_litre_
   9. ULSD: Diff on previous  YEAR (p/litre) → ulsd_diff_on_previous_year_p_litre_
  10. Duty rate ULSD (p/litre) → duty_rate_ulsd_p_litre_
  11. VAT (% rate) ULSD → vat_rate_ulsd

📋 Available columns (11):
   1. date
   2. ulsp_pump_price_p_litre_
   3. ulsp_diff_on_previous_week_p_litre_
   4. ulsp_diff_on_previous_year_p_litre_
   5. duty_rate_ulsp_p_litre_
   6. vat_rate_ulsp
   7. ulsd_pump_price_p_litre_
   8

In [4]:
# =============================================================================
# DATA QUALITY ASSESSMENT
# =============================================================================

print(f"\n🔍 DATA QUALITY ASSESSMENT")
print("=" * 35)

# Check date column.
# Resolve the name from the actual schema instead of hard-coding "Date": the
# extraction step lower-cases column names, so the literal "Date" only resolved
# through case-insensitive name matching and never equalled a real column name.
date_col = next(
    (name for name in df_fuel_raw.columns if name.lower() == "date"),
    None,
)
if date_col is None:
    raise ValueError(f"No date column found in source columns: {df_fuel_raw.columns}")

null_dates = df_fuel_raw.filter(col(date_col).isNull()).count()
print(f"📅 Date column: {date_col}")
print(f"   Null dates: {null_dates:,}")

# Get date range (informational only — a failure here must not stop the run,
# but the reason is reported instead of being silently swallowed).
try:
    date_stats = df_fuel_raw.agg(
        spark_min(date_col).alias("min_date"),
        spark_max(date_col).alias("max_date")
    ).collect()[0]
    print(f"   Date range: {date_stats['min_date']} → {date_stats['max_date']}")
except Exception as e:
    print(f"   ⚠️ Could not determine date range: {e}")

# Identify price columns: any non-date column whose name mentions one of the
# fuel / price / tax keywords below (matched case-insensitively).
price_indicators = ['ULSD', 'ULSP', 'PRICE', 'PUMP', 'DIFF', 'VAT', 'DUTY']
price_columns = [
    col_name for col_name in df_fuel_raw.columns
    if col_name != date_col
    and any(indicator in col_name.upper() for indicator in price_indicators)
]

print(f"\n💰 Price columns identified ({len(price_columns)}):")
for i, col_name in enumerate(price_columns, 1):
    print(f"  {i:2d}. {col_name}")

StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 6, Finished, Available, Finished)


🔍 DATA QUALITY ASSESSMENT
📅 Date column: Date
   Null dates: 1
   Date range: 2003-06-09 → 2025-05-19

💰 Price columns identified (10):
   1. ulsp_pump_price_p_litre_
   2. ulsp_diff_on_previous_week_p_litre_
   3. ulsp_diff_on_previous_year_p_litre_
   4. duty_rate_ulsp_p_litre_
   5. vat_rate_ulsp
   6. ulsd_pump_price_p_litre_
   7. ulsd_diff_on_previous_week_p_litre_
   8. ulsd_diff_on_previous_year_p_litre_
   9. duty_rate_ulsd_p_litre_
  10. vat_rate_ulsd


In [5]:
# =============================================================================
# DATA CLEANING AND FILTERING
# =============================================================================

print(f"\n🧹 DATA CLEANING - {CURRENT_YEAR} FOCUS")
print("=" * 40)

# Parse the date column once and drop rows whose date is missing or unparseable;
# both the primary filter and the fallback below start from this frame.
df_dated = (
    df_fuel_raw
    .withColumn("date_clean", to_date(col(date_col)))
    .filter(col("date_clean").isNotNull())
)

# Primary path: keep only the configured processing year.
df_fuel_clean = df_dated.filter(year(col("date_clean")) == CURRENT_YEAR)
clean_count = df_fuel_clean.count()
print(f"✅ Records after {CURRENT_YEAR} filter: {clean_count:,}")

if clean_count == 0:
    print(f"⚠️ No data found for {CURRENT_YEAR}. Adjusting filter...")
    # Fallback: use the latest year that actually appears in the data.
    recent_year = df_dated.agg(spark_max(year(col("date_clean")))).collect()[0][0]
    df_fuel_clean = df_dated.filter(year(col("date_clean")) == recent_year)
    clean_count = df_fuel_clean.count()
    print(f"📊 Using most recent year {recent_year}: {clean_count:,} records")

# Add date dimensions: one derived column per calendar part, in this order.
date_parts = {
    "year": year,
    "month": month,
    "day": dayofmonth,
    "quarter": quarter,
    "day_of_week": dayofweek,
    "week_of_year": weekofyear,
}
for part_name, part_fn in date_parts.items():
    df_fuel_clean = df_fuel_clean.withColumn(part_name, part_fn(col("date_clean")))

# is_weekend is 1 when dayofweek is 1 or 7, otherwise 0.
df_fuel_clean = df_fuel_clean.withColumn(
    "is_weekend",
    when(dayofweek(col("date_clean")).isin([1, 7]), 1).otherwise(0),
)

print("📅 Date dimensions added")

StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 7, Finished, Available, Finished)


🧹 DATA CLEANING - 2025 FOCUS
✅ Records after 2025 filter: 20
📅 Date dimensions added


In [6]:
# =============================================================================
# NUMERIC PRICE PROCESSING
# =============================================================================
# For every identified price column, add a "<name>_clean" DoubleType column:
#   * missing / NaN / non-numeric values become 0.0
#   * values outside the allowed range are clamped to the nearest bound
# Columns with no usable numeric value in the filtered frame are skipped
# (no cleaned column is created and no error is recorded).

print(f"\n💹 PROCESSING NUMERIC PRICE COLUMNS")
print("=" * 40)

processed_columns = []
processing_errors = []

for col_name in price_columns:
    try:
        # Cast once up front so the null/NaN and range checks all run on a
        # double; a value that cannot be parsed as a number becomes NULL here
        # and is then treated as missing instead of slipping through.
        value = col(col_name).cast(DoubleType())
        is_missing = value.isNull() | isnan(value)

        # Test if column contains numeric data
        sample_check = df_fuel_clean \
            .filter(~is_missing) \
            .select(col_name) \
            .limit(1) \
            .collect()

        if sample_check:
            # "Diff on previous week/year" columns are signed changes, so the
            # non-negative price floor must not be applied to them (it would
            # turn every price decrease into 0). They get a symmetric bound.
            # NOTE(review): confirm the symmetric bound is the intended rule.
            is_signed_change = "diff" in col_name.lower()
            upper_bound = float(MAX_REASONABLE_PRICE)
            lower_bound = -upper_bound if is_signed_change else float(MIN_REASONABLE_PRICE)
            # NOTE(review): headers are labelled "(p/litre)" / "(% rate)"; if real
            # values routinely exceed MAX_REASONABLE_PRICE they are all flattened
            # to the cap — confirm the threshold against the data.

            # Create cleaned version
            clean_col_name = f"{col_name}_clean"
            df_fuel_clean = df_fuel_clean.withColumn(
                clean_col_name,
                when(is_missing, 0.0)
                .when(value < lower_bound, lower_bound)
                .when(value > upper_bound, upper_bound)
                .otherwise(value),
            )

            processed_columns.append((col_name, clean_col_name))

    except Exception as e:
        # Keep going with the remaining columns; failures are summarised below.
        processing_errors.append((col_name, str(e)))

print(f"✅ Successfully processed: {len(processed_columns)} columns")
print(f"⚠️ Processing errors: {len(processing_errors)} columns")

if processing_errors:
    print(f"📋 Columns with errors:")
    for col_name, error in processing_errors[:3]:  # Show first 3 errors
        print(f"   • {col_name}: {error}")

# Get all cleaned column names
cleaned_columns = [clean_name for _, clean_name in processed_columns]
print(f"📊 Available cleaned columns: {len(cleaned_columns)}")

StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 8, Finished, Available, Finished)


💹 PROCESSING NUMERIC PRICE COLUMNS
✅ Successfully processed: 10 columns
⚠️ Processing errors: 0 columns
📊 Available cleaned columns: 10


In [10]:
# =============================================================================
# AUDIT AND METADATA
# =============================================================================

from pyspark.sql.functions import current_timestamp

# Stamp each row with the two audit columns; both are set to current_timestamp().
for audit_field in ("created_at", "updated_at"):
    fact_fuel_prices = fact_fuel_prices.withColumn(audit_field, current_timestamp())

print("📋 Audit fields added")

# Final record count (also reused by the summary cell at the end of the notebook)
final_count = fact_fuel_prices.count()
print(f"📊 Final fact table records: {final_count:,}")


StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 12, Finished, Available, Finished)

📋 Audit fields added
📊 Final fact table records: 20


In [11]:
# =============================================================================
# INCREMENTAL DELTA LAKE PROCESSING
# =============================================================================
# Writes fact_fuel_prices to the target path:
#   * existing Delta table -> MERGE on (fuel_price_id, price_date)
#   * no table yet         -> create it, partitioned by year/month
# The overwrite fallback is only used when no table existed beforehand, so a
# failed merge can never replace the accumulated history with a single batch.

print(f"\n💾 INCREMENTAL DELTA LAKE PROCESSING")
print("=" * 45)

print(f"🎯 Target location: {fact_fuel_prices_path}")

from delta.tables import DeltaTable

# Decide merge-vs-create outside the try block: if existence cannot be
# determined, stop rather than risk overwriting an existing table.
table_exists = DeltaTable.isDeltaTable(spark, fact_fuel_prices_path)

try:
    if table_exists:
        print("📋 Existing Delta table detected - Executing merge...")

        delta_table = DeltaTable.forPath(spark, fact_fuel_prices_path)

        # NOTE(review): whenMatchedUpdateAll also overwrites created_at on
        # matched rows — confirm whether the original creation time should be kept.
        delta_table.alias("target").merge(
            fact_fuel_prices.alias("source"),
            "target.fuel_price_id = source.fuel_price_id AND target.price_date = source.price_date"
        ).whenMatchedUpdateAll() \
         .whenNotMatchedInsertAll() \
         .execute()

        print("✅ Incremental merge completed!")

    else:
        print("📋 Creating new Delta table (first time)...")

        fact_fuel_prices.write \
            .format("delta") \
            .mode("overwrite") \
            .partitionBy("year", "month") \
            .option("delta.autoOptimize.optimizeWrite", "true") \
            .option("delta.autoOptimize.autoCompact", "true") \
            .save(fact_fuel_prices_path)

        print("✅ New Delta table created!")

        # Optional: register in Spark catalog (a failure here is only reported)
        try:
            spark.sql(f"""
                CREATE TABLE IF NOT EXISTS fact_fuel_prices
                USING DELTA
                LOCATION '{fact_fuel_prices_path}'
            """)
            print("📝 Table registered in catalog")
        except Exception as catalog_error:
            print(f"ℹ️ Catalog registration warning: {str(catalog_error)}")

except Exception as e:
    print(f"❌ Delta operation error: {str(e)}")

    if table_exists:
        # The target already holds data: overwriting it with this batch would
        # destroy previously loaded rows, so surface the merge failure instead.
        raise

    print("🔄 Attempting fallback save...")

    try:
        fact_fuel_prices.write \
            .format("delta") \
            .mode("overwrite") \
            .partitionBy("year", "month") \
            .save(fact_fuel_prices_path)
        print("✅ Fallback save successful!")
    except Exception as fallback_error:
        print(f"❌ Fallback failed: {str(fallback_error)}")
        raise

# Optimization is best-effort: the data is already written at this point, so a
# failed OPTIMIZE is reported but must not trigger any rewrite of the table.
print("⚡ Optimizing Delta table...")
try:
    spark.sql(f"OPTIMIZE delta.`{fact_fuel_prices_path}`")
    print("✅ Delta optimization completed!")
except Exception as optimize_error:
    print(f"⚠️ Delta optimization skipped: {str(optimize_error)}")

# Final read for verification
saved_table = spark.read.format("delta").load(fact_fuel_prices_path)
saved_count = saved_table.count()
print(f"🎯 Final table verification: {saved_count:,} records")


StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 13, Finished, Available, Finished)


💾 INCREMENTAL DELTA LAKE PROCESSING
🎯 Target location: Tables/fact_fuel_prices
📋 Creating new Delta table (first time)...
✅ New Delta table created!
📝 Table registered in catalog
⚡ Optimizing Delta table...
✅ Delta optimization completed!
🎯 Final table verification: 20 records


In [12]:
# =============================================================================
# SUCCESS SUMMARY
# =============================================================================

# Closing banner, preceded by a blank line and framed by 60-character rules.
print(
    "\n" + "=" * 60,
    "🎉 FUEL PRICES FACT TABLE - SIMPLE VERSION COMPLETED!",
    "=" * 60,
    sep="\n",
)

# Recap of the run: the record count comes from the audit cell's final_count.
print(
    f"✅ Processing Summary:",
    f"   • Final fact records: {final_count:,}",
    f"   • Partitioning: year, month",
    f"   • Format: Delta Lake",
    f"   • Optimization: Enabled",
    sep="\n",
)

# Where the fact table was written.
print(
    f"\n📁 Table Location:",
    f"   {fact_fuel_prices_path}",
    sep="\n",
)


StatementMeta(, 135c8a95-976f-493e-9f55-8c69c7287413, 14, Finished, Available, Finished)


🎉 FUEL PRICES FACT TABLE - SIMPLE VERSION COMPLETED!
✅ Processing Summary:
   • Final fact records: 20
   • Partitioning: year, month
   • Format: Delta Lake
   • Optimization: Enabled

📁 Table Location:
   Tables/fact_fuel_prices
