In [1]:
from pyspark.sql.functions import col, when, trim, upper, coalesce, lit, current_timestamp, md5, concat_ws, to_date, date_format, dayofweek, weekofyear, abs as spark_abs, round as spark_round, year, month, dayofmonth, quarter, monotonically_increasing_id
from pyspark.sql.types import *
from delta.tables import DeltaTable

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 3, Finished, Available, Finished)

In [2]:
# =============================================================================
# CONFIGURATION
# =============================================================================

weather_path = "Tables/tbl_weather" 
geography_dim_path = "Tables/dim_geography"

# Output paths
dim_weather_conditions_path = "Tables/dim_weather_conditions"
dim_date_path = "Tables/dim_date"
fact_weather_path = "Tables/fact_weather"

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 4, Finished, Available, Finished)

In [3]:
# =============================================================================
# LOAD WEATHER DATA FROM DELTA TABLE
# =============================================================================

print("Loading weather data from Delta table...")

try:
    df_weather_raw = spark.read.format("delta").load(weather_path)
    print(f"Weather data loaded: {df_weather_raw.count()} records")
    
except Exception as e:
    print(f"Error loading weather Delta table: {str(e)}")
    raise

# Display sample data
print("\n--- Raw Weather Data Sample ---")
df_weather_raw.show(5, truncate=False)

print("\n--- Weather Data Schema ---")
df_weather_raw.printSchema()

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 5, Finished, Available, Finished)

Loading weather data from Delta table...
Weather data loaded: 40 records

--- Raw Weather Data Sample ---
+------------+---------+----------+---------------+---------------+---------------+--------+--------+-----------------+-------------------+----------+----------+-------------------+----+-----+-----------+------------+------+--------------------+----------+
|country_code|city_name|date      |temperature_avg|temperature_max|temperature_min|humidity|pressure|weather_condition|weather_description|wind_speed|cloudiness|created_at         |year|month|day_of_week|week_of_year|season|temperature_category|is_weekend|
+------------+---------+----------+---------------+---------------+---------------+--------+--------+-----------------+-------------------+----------+----------+-------------------+----+-----+-----------+------------+------+--------------------+----------+
|AT          |Vienna   |2025-05-28|12.95          |13.96          |11.63          |84      |1014    |Clouds           |few 

In [4]:
# =============================================================================
# LOAD EXISTING GEOGRAPHY DIMENSION
# =============================================================================

print("\nLoading existing geography dimension...")

try:
    df_geography = spark.read.format("delta").load(geography_dim_path)
    print(f"Geography dimension loaded: {df_geography.count()} records")
    
    print("\n--- Geography Dimension Sample ---")
    df_geography.show(5, truncate=False)
    
except Exception as e:
    print(f"Warning: Could not load geography dimension: {str(e)}")
    print("Creating mock geography dimension for weather cities...")
    
    # Create basic geography dimension for weather cities if not exists
    weather_cities = df_weather_raw.select("country_code", "city_name").distinct()
    df_geography = weather_cities.withColumn("city_id", monotonically_increasing_id()) \
        .withColumn("country_iso3", 
                   when(col("country_code") == "PT", "PRT")
                   .when(col("country_code") == "GR", "GRC")
                   .when(col("country_code") == "CH", "CHE")
                   .when(col("country_code") == "SI", "SVN")
                   .when(col("country_code") == "AT", "AUT")
                   .when(col("country_code") == "BE", "BEL")
                   .when(col("country_code") == "CZ", "CZE")
                   .when(col("country_code") == "PL", "POL")
                   .when(col("country_code") == "DK", "DNK")
                   .when(col("country_code") == "NO", "NOR")
                   .when(col("country_code") == "HU", "HUN")
                   .when(col("country_code") == "SE", "SWE")
                   .when(col("country_code") == "IT", "ITA")
                   .when(col("country_code") == "NL", "NLD")
                   .when(col("country_code") == "DE", "DEU")
                   .when(col("country_code") == "ES", "ESP")
                   .when(col("country_code") == "GB", "GBR")
                   .when(col("country_code") == "FR", "FRA")
                   .when(col("country_code") == "FI", "FIN")
                   .when(col("country_code") == "IE", "IRL")
                   .otherwise("UNK"))

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 6, Finished, Available, Finished)


Loading existing geography dimension...
Geography dimension loaded: 21 records

--- Geography Dimension Sample ---
+-------+---------+----------+------------+------------+------------+----------------------+-------------+-------------------------+-------------------------+--------------------------------+
|city_id|city_name|country_id|country_name|country_iso2|country_iso3|city_country          |city_iso3    |created_at               |updated_at               |record_hash                     |
+-------+---------+----------+------------+------------+------------+----------------------+-------------+-------------------------+-------------------------+--------------------------------+
|1      |Berlin   |1         |Germany     |DE          |DEU         |Berlin, Germany       |Berlin-DEU   |2025-05-29 18:53:07.29665|2025-05-29 18:53:07.29665|987d50189e36d37fe3766f62338f9c9f|
|2      |Munich   |1         |Germany     |DE          |DEU         |Munich, Germany       |Munich-DEU   |2025-05-29

In [5]:
# =============================================================================
# DATA QUALITY CHECKS - WEATHER DATA
# =============================================================================

print("\n--- Weather Data Quality Analysis ---")

# Check for nulls and invalid values
print("Temperature Analysis:")
print(f"Null temperature_avg: {df_weather_raw.filter(col('temperature_avg').isNull()).count()}")
print(f"Invalid temperatures (< -50 or > 60): {df_weather_raw.filter((col('temperature_avg') < -50) | (col('temperature_avg') > 60)).count()}")

print("\nDate Analysis:")
print(f"Null dates: {df_weather_raw.filter(col('date').isNull()).count()}")
print(f"Date range: {df_weather_raw.agg({'date': 'min'}).collect()[0][0]} to {df_weather_raw.agg({'date': 'max'}).collect()[0][0]}")

print("\nLocation Analysis:")
unique_cities = df_weather_raw.select("country_code", "city_name").distinct().count()
print(f"Unique city-country combinations: {unique_cities}")

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 7, Finished, Available, Finished)


--- Weather Data Quality Analysis ---
Temperature Analysis:
Null temperature_avg: 0
Invalid temperatures (< -50 or > 60): 0

Date Analysis:
Null dates: 0
Date range: 2025-05-28 to 2025-05-29

Location Analysis:
Unique city-country combinations: 20


In [6]:
# =============================================================================
# CLEAN WEATHER DATA
# =============================================================================

print("\nCleaning weather data...")

df_weather_clean = df_weather_raw \
    .filter(col("country_code").isNotNull()) \
    .filter(col("city_name").isNotNull()) \
    .filter(col("date").isNotNull()) \
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd")) \
    .withColumn("temperature_avg", 
               when(col("temperature_avg").isNull(), 0.0)
               .when(col("temperature_avg") < -50, -50.0)
               .when(col("temperature_avg") > 60, 60.0)
               .otherwise(col("temperature_avg").cast(DoubleType()))) \
    .withColumn("temperature_max", 
               when(col("temperature_max").isNull(), col("temperature_avg"))
               .otherwise(col("temperature_max").cast(DoubleType()))) \
    .withColumn("temperature_min", 
               when(col("temperature_min").isNull(), col("temperature_avg"))
               .otherwise(col("temperature_min").cast(DoubleType()))) \
    .withColumn("humidity", 
               when(col("humidity").isNull(), 0)
               .when(col("humidity") < 0, 0)
               .when(col("humidity") > 100, 100)
               .otherwise(col("humidity"))) \
    .withColumn("pressure", 
               when(col("pressure").isNull(), 1013)
               .when(col("pressure") < 900, 900)
               .when(col("pressure") > 1100, 1100)
               .otherwise(col("pressure"))) \
    .withColumn("wind_speed", 
               when(col("wind_speed").isNull(), 0.0)
               .when(col("wind_speed") < 0, 0.0)
               .otherwise(col("wind_speed").cast(DoubleType()))) \
    .withColumn("cloudiness", 
               when(col("cloudiness").isNull(), 0)
               .when(col("cloudiness") < 0, 0)
               .when(col("cloudiness") > 100, 100)
               .otherwise(col("cloudiness"))) \
    .withColumn("weather_condition", 
               when(col("weather_condition").isNull(), "Unknown")
               .otherwise(trim(col("weather_condition")))) \
    .withColumn("weather_description", 
               when(col("weather_description").isNull(), "")
               .otherwise(trim(col("weather_description"))))

print(f"Weather data after cleaning: {df_weather_clean.count()} records")

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 8, Finished, Available, Finished)


Cleaning weather data...
Weather data after cleaning: 40 records


In [7]:
# =============================================================================
# CREATE WEATHER CONDITIONS DIMENSION
# =============================================================================

print("\nCreating weather conditions dimension...")

dim_weather_conditions = df_weather_clean \
    .select("weather_condition", "weather_description") \
    .distinct() \
    .withColumn("weather_condition_id", monotonically_increasing_id()) \
    .select(
        col("weather_condition_id"),
        col("weather_condition").alias("condition_name"),
        col("weather_description").alias("condition_description"),
        
        # Categorize weather conditions
        when(col("weather_condition").isin(["Clear"]), "Clear")
        .when(col("weather_condition").isin(["Clouds"]), "Cloudy")
        .when(col("weather_condition").isin(["Rain", "Drizzle"]), "Rainy")
        .when(col("weather_condition").isin(["Snow"]), "Snowy")
        .when(col("weather_condition").isin(["Thunderstorm"]), "Stormy")
        .when(col("weather_condition").isin(["Mist", "Fog", "Haze"]), "Misty")
        .otherwise("Other").alias("condition_category"),
        
        # Weather impact on business
        when(col("weather_condition").isin(["Clear"]), "Favorable")
        .when(col("weather_condition").isin(["Clouds"]), "Neutral")
        .when(col("weather_condition").isin(["Rain", "Drizzle", "Snow", "Thunderstorm"]), "Challenging")
        .otherwise("Unknown").alias("business_impact")
    ) \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("updated_at", current_timestamp()) \
    .withColumn("record_hash", 
               md5(concat_ws("|", 
                           col("condition_name"),
                           col("condition_description"),
                           col("condition_category"),
                           col("business_impact"))))

print(f"Weather conditions dimension created: {dim_weather_conditions.count()} records")

print("\n--- Weather Conditions Dimension Sample ---")
dim_weather_conditions.show(truncate=False)

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 9, Finished, Available, Finished)


Creating weather conditions dimension...
Weather conditions dimension created: 7 records

--- Weather Conditions Dimension Sample ---
+--------------------+--------------+---------------------+------------------+---------------+--------------------------+--------------------------+--------------------------------+
|weather_condition_id|condition_name|condition_description|condition_category|business_impact|created_at                |updated_at                |record_hash                     |
+--------------------+--------------+---------------------+------------------+---------------+--------------------------+--------------------------+--------------------------------+
|0                   |Rain          |light rain           |Rainy             |Challenging    |2025-05-29 20:09:13.209127|2025-05-29 20:09:13.209127|0b1a9d7a7358df547f6c633ddb29a460|
|1                   |Clouds        |broken clouds        |Cloudy            |Neutral        |2025-05-29 20:09:13.209127|2025-05-29 20:09

In [8]:
# =============================================================================
# CREATE DATE DIMENSION
# =============================================================================

print("\nCreating date dimension...")

# Get unique dates from weather data
unique_dates = df_weather_clean.select("date").distinct()

dim_date = unique_dates \
    .withColumn("date_id", date_format(col("date"), "yyyyMMdd").cast(IntegerType())) \
    .withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("day", dayofmonth(col("date"))) \
    .withColumn("quarter", quarter(col("date"))) \
    .withColumn("day_of_week", dayofweek(col("date"))) \
    .withColumn("week_of_year", weekofyear(col("date"))) \
    .withColumn("day_name", date_format(col("date"), "EEEE")) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .withColumn("is_weekend", 
               when(dayofweek(col("date")).isin([1, 7]), True).otherwise(False)) \
    .withColumn("season",
               when(month(col("date")).isin([12, 1, 2]), "Winter")
               .when(month(col("date")).isin([3, 4, 5]), "Spring")
               .when(month(col("date")).isin([6, 7, 8]), "Summer")
               .when(month(col("date")).isin([9, 10, 11]), "Autumn")
               .otherwise("Unknown")) \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("updated_at", current_timestamp()) \
    .withColumn("record_hash", 
               md5(concat_ws("|", 
                           col("date_id"),
                           col("year"),
                           col("month"),
                           col("day"),
                           col("quarter"))))

print(f"Date dimension created: {dim_date.count()} records")

print("\n--- Date Dimension Sample ---")
dim_date.show(10, truncate=False)

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 10, Finished, Available, Finished)


Creating date dimension...
Date dimension created: 2 records

--- Date Dimension Sample ---
+----------+--------+----+-----+---+-------+-----------+------------+---------+----------+----------+------+--------------------------+--------------------------+--------------------------------+
|date      |date_id |year|month|day|quarter|day_of_week|week_of_year|day_name |month_name|is_weekend|season|created_at                |updated_at                |record_hash                     |
+----------+--------+----+-----+---+-------+-----------+------------+---------+----------+----------+------+--------------------------+--------------------------+--------------------------------+
|2025-05-29|20250529|2025|5    |29 |2      |5          |22          |Thursday |May       |false     |Spring|2025-05-29 20:09:14.938969|2025-05-29 20:09:14.938969|638d7653ff94013cd2fd7b2b08a25099|
|2025-05-28|20250528|2025|5    |28 |2      |4          |22          |Wednesday|May       |false     |Spring|2025-05-29 20:0

In [9]:
# =============================================================================
# CREATE WEATHER FACT TABLE
# =============================================================================

print("\nCreating weather fact table...")

# First, create a lookup for geography based on city_name and country_iso2
geography_lookup = df_geography.select(
    col("city_id"),
    coalesce(col("city_name"), lit("")).alias("geo_city_name"),
    coalesce(col("country_iso2"), lit("")).alias("geo_country_code")
)

# Join weather data with dimensions
fact_weather = df_weather_clean.alias("w") \
    .join(dim_date.alias("d"), 
          col("w.date") == col("d.date"), 
          "inner") \
    .join(dim_weather_conditions.alias("wc"), 
          (col("w.weather_condition") == col("wc.condition_name")) & 
          (col("w.weather_description") == col("wc.condition_description")), 
          "inner") \
    .join(geography_lookup.alias("g"),
          (upper(col("w.city_name")) == upper(col("g.geo_city_name"))) &
          (col("w.country_code") == col("g.geo_country_code")),
          "left") \
    .select(
        # Primary Key (composite)
        concat_ws("-", col("w.country_code"), col("w.city_name"), 
                 date_format(col("w.date"), "yyyyMMdd")).alias("weather_fact_id"),
        
        # Foreign Keys (Dimension References)
        col("d.date_id"),
        coalesce(col("g.city_id"), lit(-1)).alias("city_id"),
        col("wc.weather_condition_id"),
        
        # Date and Location Context
        col("w.date").alias("weather_date"),
        col("w.country_code"),
        col("w.city_name"),
        
        # Weather Measurements (Facts)
        spark_round(col("w.temperature_avg"), 2).alias("temperature_avg"),
        spark_round(col("w.temperature_max"), 2).alias("temperature_max"),
        spark_round(col("w.temperature_min"), 2).alias("temperature_min"),
        col("w.humidity").alias("humidity_percent"),
        col("w.pressure").alias("atmospheric_pressure"),
        spark_round(col("w.wind_speed"), 2).alias("wind_speed_kmh"),
        col("w.cloudiness").alias("cloudiness_percent"),
        
        # Derived Measures
        spark_round(col("w.temperature_max") - col("w.temperature_min"), 2).alias("temperature_range"),
        when(col("w.temperature_avg") < 10, "Cold")
        .when(col("w.temperature_avg") < 20, "Mild")
        .when(col("w.temperature_avg") < 30, "Warm")
        .otherwise("Hot").alias("temperature_category"),
        
        # Comfort Index (simplified)
        when((col("w.temperature_avg") >= 18) & (col("w.temperature_avg") <= 24) & 
             (col("w.humidity") >= 40) & (col("w.humidity") <= 60) &
             (col("w.wind_speed") <= 10), "Comfortable")
        .when((col("w.temperature_avg") < 5) | (col("w.temperature_avg") > 35), "Uncomfortable")
        .otherwise("Moderate").alias("comfort_level"),
        
        # Business Impact Indicators
        when(col("w.weather_condition").isin(["Rain", "Snow", "Thunderstorm"]), True)
        .otherwise(False).alias("is_adverse_weather"),
        
        when((col("w.temperature_avg") >= 20) & 
             (col("w.weather_condition").isin(["Clear", "Clouds"])), True)
        .otherwise(False).alias("is_good_ice_cream_weather"),
        
        # Original API timestamp
        col("w.created_at").alias("api_timestamp")
        
    ) \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("updated_at", current_timestamp()) \
    .withColumn("record_hash", 
               md5(concat_ws("|", 
                           col("weather_fact_id"),
                           col("temperature_avg"),
                           col("temperature_max"),
                           col("temperature_min"),
                           col("humidity_percent"),
                           col("atmospheric_pressure"))))

print(f"Weather fact table created: {fact_weather.count()} records")

# Display sample of fact table
print("\n--- Weather Fact Table Sample ---")
fact_weather.show(5, truncate=False)

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 11, Finished, Available, Finished)


Creating weather fact table...
Weather fact table created: 40 records

--- Weather Fact Table Sample ---
+---------------------+--------+-------+--------------------+------------+------------+---------+---------------+---------------+---------------+----------------+--------------------+--------------+------------------+-----------------+--------------------+-------------+------------------+-------------------------+-------------------+--------------------------+--------------------------+--------------------------------+
|weather_fact_id      |date_id |city_id|weather_condition_id|weather_date|country_code|city_name|temperature_avg|temperature_max|temperature_min|humidity_percent|atmospheric_pressure|wind_speed_kmh|cloudiness_percent|temperature_range|temperature_category|comfort_level|is_adverse_weather|is_good_ice_cream_weather|api_timestamp      |created_at                |updated_at                |record_hash                     |
+---------------------+--------+-------+--------

In [10]:
# =============================================================================
# DATA QUALITY VALIDATION
# =============================================================================

print("\n--- Weather Fact Table Validation ---")

# Check for data integrity
total_records = fact_weather.count()
null_date_ids = fact_weather.filter(col("date_id").isNull()).count()
missing_geography = fact_weather.filter(col("city_id") == -1).count()
null_weather_conditions = fact_weather.filter(col("weather_condition_id").isNull()).count()

print(f"Total weather fact records: {total_records}")
print(f"Null date IDs: {null_date_ids}")
print(f"Missing geography links: {missing_geography}")
print(f"Null weather condition IDs: {null_weather_conditions}")

# Business metrics
print("\n--- Weather Business Analysis ---")
adverse_weather_days = fact_weather.filter(col("is_adverse_weather") == True).count()
good_ice_cream_days = fact_weather.filter(col("is_good_ice_cream_weather") == True).count()
comfortable_days = fact_weather.filter(col("comfort_level") == "Comfortable").count()

print(f"Adverse weather days: {adverse_weather_days} ({adverse_weather_days/total_records*100:.1f}%)")
print(f"Good ice cream weather days: {good_ice_cream_days} ({good_ice_cream_days/total_records*100:.1f}%)")
print(f"Comfortable weather days: {comfortable_days} ({comfortable_days/total_records*100:.1f}%)")

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 12, Finished, Available, Finished)


--- Weather Fact Table Validation ---
Total weather fact records: 40
Null date IDs: 0
Missing geography links: 24
Null weather condition IDs: 0

--- Weather Business Analysis ---
Adverse weather days: 4 (10.0%)
Good ice cream weather days: 11 (27.5%)
Comfortable weather days: 6 (15.0%)


In [11]:
# =============================================================================
# SAVE DIMENSIONS
# =============================================================================

print("\n=== SAVING DIMENSIONS ===")

# Save Weather Conditions Dimension
print(f"Saving weather conditions dimension to: {dim_weather_conditions_path}")
if DeltaTable.isDeltaTable(spark, dim_weather_conditions_path):
    print("Existing weather conditions table found. Executing merge...")
    delta_table = DeltaTable.forPath(spark, dim_weather_conditions_path)
    delta_table.alias("target") \
        .merge(dim_weather_conditions.alias("source"), 
               "target.condition_name = source.condition_name AND target.condition_description = source.condition_description") \
        .whenMatchedUpdate(
            condition="target.record_hash != source.record_hash",
            set={
                "condition_category": "source.condition_category",
                "business_impact": "source.business_impact",
                "updated_at": "source.updated_at",
                "record_hash": "source.record_hash"
            }) \
        .whenNotMatchedInsertAll() \
        .execute()
else:
    dim_weather_conditions.write.format("delta").mode("overwrite").save(dim_weather_conditions_path)
print("Weather conditions dimension saved!")

# Save Date Dimension
print(f"Saving date dimension to: {dim_date_path}")
if DeltaTable.isDeltaTable(spark, dim_date_path):
    print("Existing date table found. Executing merge...")
    delta_table = DeltaTable.forPath(spark, dim_date_path)
    delta_table.alias("target") \
        .merge(dim_date.alias("source"), 
               "target.date_id = source.date_id") \
        .whenNotMatchedInsertAll() \
        .execute()
else:
    dim_date.write.format("delta").mode("overwrite").save(dim_date_path)
print("Date dimension saved!")

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 13, Finished, Available, Finished)


=== SAVING DIMENSIONS ===
Saving weather conditions dimension to: Tables/dim_weather_conditions
Weather conditions dimension saved!
Saving date dimension to: Tables/dim_date
Date dimension saved!


In [12]:
# =============================================================================
# SAVE FACT TABLE
# =============================================================================

print(f"\nSaving weather fact table to: {fact_weather_path}")

if DeltaTable.isDeltaTable(spark, fact_weather_path):
    print("Existing weather fact table found. Executing merge...")
    delta_table = DeltaTable.forPath(spark, fact_weather_path)
    delta_table.alias("target") \
        .merge(fact_weather.alias("source"), 
               "target.weather_fact_id = source.weather_fact_id") \
        .whenMatchedUpdate(
            condition="target.record_hash != source.record_hash",
            set={
                "temperature_avg": "source.temperature_avg",
                "temperature_max": "source.temperature_max",
                "temperature_min": "source.temperature_min",
                "humidity_percent": "source.humidity_percent",
                "atmospheric_pressure": "source.atmospheric_pressure",
                "wind_speed_kmh": "source.wind_speed_kmh",
                "cloudiness_percent": "source.cloudiness_percent",
                "temperature_range": "source.temperature_range",
                "temperature_category": "source.temperature_category",
                "comfort_level": "source.comfort_level",
                "is_adverse_weather": "source.is_adverse_weather",
                "is_good_ice_cream_weather": "source.is_good_ice_cream_weather",
                "updated_at": "source.updated_at",
                "record_hash": "source.record_hash"
            }) \
        .whenNotMatchedInsertAll() \
        .execute()
else:
    fact_weather.write.format("delta").mode("overwrite") \
        .partitionBy("weather_date") \
        .save(fact_weather_path)

print("Weather fact table saved!")

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 14, Finished, Available, Finished)


Saving weather fact table to: Tables/fact_weather
Weather fact table saved!


In [13]:
# =============================================================================
# OPTIMIZE TABLES
# =============================================================================

print("\nOptimizing Delta tables...")

tables_to_optimize = [
    dim_weather_conditions_path,
    dim_date_path,
    fact_weather_path
]

for table_path in tables_to_optimize:
    try:
        spark.sql(f"OPTIMIZE delta.`{table_path}`")
        print(f"Optimized: {table_path}")
    except Exception as e:
        print(f"Optimization error for {table_path}: {str(e)}")


StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 15, Finished, Available, Finished)


Optimizing Delta tables...
Optimized: Tables/dim_weather_conditions
Optimized: Tables/dim_date
Optimized: Tables/fact_weather


In [14]:
# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("\n" + "="*60)
print("WEATHER DATA ETL COMPLETED SUCCESSFULLY!")
print("="*60)

print(f"✅ Weather Conditions Dimension: {dim_weather_conditions.count()} records")
print(f"✅ Date Dimension: {dim_date.count()} records") 
print(f"✅ Weather Fact Table: {fact_weather.count()} records")
print(f"✅ Geography Integration: {fact_weather.filter(col('city_id') != -1).count()} records linked")

print("\n📊 Business Insights:")
print(f"• Adverse weather impact: {adverse_weather_days/total_records*100:.1f}% of days")
print(f"• Ice cream business opportunities: {good_ice_cream_days/total_records*100:.1f}% of days")
print(f"• Customer comfort days: {comfortable_days/total_records*100:.1f}% of days")

print(f"\n📁 Tables saved to:")
print(f"• Weather Conditions: {dim_weather_conditions_path}")
print(f"• Date Dimension: {dim_date_path}")
print(f"• Weather Facts: {fact_weather_path}")

print("\n🔗 Ready for analysis and joins with your sales data!")

StatementMeta(, e0addd06-5fcf-4d7f-a9c2-3bcef6715164, 16, Finished, Available, Finished)


WEATHER DATA ETL COMPLETED SUCCESSFULLY!
✅ Weather Conditions Dimension: 7 records
✅ Date Dimension: 2 records
✅ Weather Fact Table: 40 records
✅ Geography Integration: 16 records linked

📊 Business Insights:
• Adverse weather impact: 10.0% of days
• Ice cream business opportunities: 27.5% of days
• Customer comfort days: 15.0% of days

📁 Tables saved to:
• Weather Conditions: Tables/dim_weather_conditions
• Date Dimension: Tables/dim_date
• Weather Facts: Tables/fact_weather

🔗 Ready for analysis and joins with your sales data!
