In [15]:
from pyspark.sql.functions import col, when, trim, upper, coalesce, lit, current_timestamp, md5, concat_ws, regexp_replace, split, explode, array, struct, round as spark_round, monotonically_increasing_id, avg, count, abs as spark_abs, year, month, dayofmonth, quarter
from pyspark.sql.types import *
from delta.tables import DeltaTable

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 17, Finished, Available, Finished)

In [21]:
# =============================================================================
# CONFIGURATION
# =============================================================================

imf_gdp_path = "Files/bronze/imf_dm_export/imf_dm_export_data.parquet"
geography_dim_path = "Tables/dim_geography"
dim_economic_indicators_path = "Tables/dim_economic_indicators"
fact_gdp_growth_path = "Tables/fact_gdp_growth"

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 23, Finished, Available, Finished)

In [4]:
# =============================================================================
# LOAD SOURCE DATA
# =============================================================================

print("Loading IMF GDP data...")

# Load IMF GDP data
df_imf_raw = spark.read.parquet(imf_gdp_path)
print(f"IMF GDP data loaded: {df_imf_raw.count()} records")

# Display sample data for debugging
print("\n--- Raw IMF GDP Data Sample ---")
df_imf_raw.show(5, truncate=False)

# Check schema
print("\n--- IMF GDP Data Schema ---")
df_imf_raw.printSchema()

# Check column names (especially the year columns)
print("\n--- Column Names ---")
for col_name in df_imf_raw.columns:
    print(f"- {col_name}")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 6, Finished, Available, Finished)

Loading IMF GDP data...
IMF GDP data loaded: 228 records

--- Raw IMF GDP Data Sample ---
+---------------------------------------+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+
|Real GDP growth (Annual percent change)|1980|1981|1982|1983|1984|1985|1986|1987|1988|1989|1990|1991 |1992|1993 |1994|1995|1996|1997 |1998|1999|2000|2001|2002|2003|2004|2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020 |2021|2022|2023|2024|2025|2026|2027|2028|2029|2030|
+---------------------------------------+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+
|Antigu

In [9]:
# =============================================================================
# DATA QUALITY CHECKS - BEFORE CLEANING
# =============================================================================

print("\n--- IMF Data Quality Analysis ---")

# Check column names first to identify the correct country column
print("Available columns:")
for i, col_name in enumerate(df_imf_raw.columns):
    print(f"{i}: {col_name}")

# Find the country name column (usually the first column)
country_col = df_imf_raw.columns[0]  # First column is typically country names
print(f"\nUsing country column: {country_col}")

# Check for nulls in country names
print("Country Analysis:")
print(f"Total countries/regions: {df_imf_raw.count()}")
print(f"Null country names: {df_imf_raw.filter(col(country_col).isNull()).count()}")

# Check unique countries
print(f"Unique countries/regions: {df_imf_raw.select(country_col).distinct().count()}")

# Sample some country names
print("\n--- Sample Country Names ---")
df_imf_raw.select(col(country_col).alias('country_name')).distinct().show(10, truncate=False)

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 11, Finished, Available, Finished)


--- IMF Data Quality Analysis ---
Available columns:
0: Real GDP growth (Annual percent change)
1: 1980
2: 1981
3: 1982
4: 1983
5: 1984
6: 1985
7: 1986
8: 1987
9: 1988
10: 1989
11: 1990
12: 1991
13: 1992
14: 1993
15: 1994
16: 1995
17: 1996
18: 1997
19: 1998
20: 1999
21: 2000
22: 2001
23: 2002
24: 2003
25: 2004
26: 2005
27: 2006
28: 2007
29: 2008
30: 2009
31: 2010
32: 2011
33: 2012
34: 2013
35: 2014
36: 2015
37: 2016
38: 2017
39: 2018
40: 2019
41: 2020
42: 2021
43: 2022
44: 2023
45: 2024
46: 2025
47: 2026
48: 2027
49: 2028
50: 2029
51: 2030

Using country column: Real GDP growth (Annual percent change)
Country Analysis:
Total countries/regions: 228
Null country names: 0
Unique countries/regions: 228

--- Sample Country Names ---
+----------------------------+
|country_name                |
+----------------------------+
|Côte d'Ivoire               |
|South Asia                  |
|Chad                        |
|Paraguay                    |
|Congo, Republic of          |
|Emerging and

In [10]:
# =============================================================================
# DATA CLEANING - COUNTRY NAMES
# =============================================================================

print("\nCleaning country names...")

# Use the correct country column name
country_col = df_imf_raw.columns[0]  # First column is typically country names

# Clean and standardize country names
df_imf_clean = df_imf_raw \
    .withColumnRenamed(country_col, 'country_name') \
    .withColumn("country_name_clean", 
               when(col("country_name").isNull(), "Unknown")
               .otherwise(trim(col("country_name")))) \
    .filter(col("country_name_clean") != "Unknown") \
    .filter(col("country_name_clean") != "")

print(f"Records after country name cleaning: {df_imf_clean.count()}")

# Create country mapping for better geography matching
print("\nCreating country name mapping...")
df_imf_clean = df_imf_clean \
    .withColumn("country_mapped",
               # Map common variations to standard names
               when(col("country_name_clean") == "Antigua and Barbuda", "Antigua and Barbuda")
               .when(col("country_name_clean") == "Brazil", "Brazil")
               .when(col("country_name_clean") == "Bulgaria", "Bulgaria")
               .when(col("country_name_clean") == "Dominica", "Dominica")
               .when(col("country_name_clean") == "Ecuador", "Ecuador")
               .when(col("country_name_clean") == "Eswatini", "Eswatini")
               .when(col("country_name_clean") == "Iceland", "Iceland")
               .when(col("country_name_clean") == "Peru", "Peru")
               .when(col("country_name_clean") == "Singapore", "Singapore")
               .when(col("country_name_clean") == "South Africa", "South Africa")
               .when(col("country_name_clean") == "China, People's Republic of", "China")
               .when(col("country_name_clean") == "Dominican Republic", "Dominican Republic")
               .when(col("country_name_clean") == "Ghana", "Ghana")
               .when(col("country_name_clean") == "Nepal", "Nepal")
               .when(col("country_name_clean") == "Panama", "Panama")
               .when(col("country_name_clean") == "United Arab Emirates", "United Arab Emirates")
               # European countries that might match your geography
               .when(col("country_name_clean").contains("Germany"), "Germany")
               .when(col("country_name_clean").contains("France"), "France")
               .when(col("country_name_clean").contains("United Kingdom"), "United Kingdom")
               .when(col("country_name_clean").contains("Italy"), "Italy")
               .when(col("country_name_clean").contains("Spain"), "Spain")
               .when(col("country_name_clean").contains("Netherlands"), "Netherlands")
               .when(col("country_name_clean").contains("Belgium"), "Belgium")
               .when(col("country_name_clean").contains("Sweden"), "Sweden")
               .when(col("country_name_clean").contains("Poland"), "Poland")
               .when(col("country_name_clean").contains("Portugal"), "Portugal")
               .when(col("country_name_clean").contains("Ireland"), "Ireland")
               .when(col("country_name_clean").contains("Austria"), "Austria")
               .when(col("country_name_clean").contains("Switzerland"), "Switzerland")
               .when(col("country_name_clean").contains("Denmark"), "Denmark")
               .when(col("country_name_clean").contains("Norway"), "Norway")
               .otherwise(col("country_name_clean")))

print("Country name mapping applied")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 12, Finished, Available, Finished)


Cleaning country names...
Records after country name cleaning: 228

Creating country name mapping...
Country name mapping applied


In [11]:
# =============================================================================
# UNPIVOT YEAR COLUMNS TO ROWS
# =============================================================================

print("\nUnpivoting year columns to create time series data...")

# Get all year columns (assuming they start from 1980)
year_columns = [col_name for col_name in df_imf_clean.columns if col_name.startswith('19') or col_name.startswith('20')]
print(f"Found year columns: {len(year_columns)} years")
print(f"Year range: {year_columns[:5]} ... {year_columns[-5:]}")

# Create year-value pairs for unpivoting
year_value_exprs = []
for year_col in year_columns:
    year_value_exprs.append(struct(lit(year_col).alias("year"), col(year_col).alias("gdp_growth_rate")))

# Unpivot the data
df_unpivoted = df_imf_clean \
    .select(
        col("country_name_clean").alias("country_name"),
        col("country_mapped"),
        explode(array(*year_value_exprs)).alias("year_data")
    ) \
    .select(
        col("country_name"),
        col("country_mapped"),
        col("year_data.year").cast(IntegerType()).alias("year"),
        col("year_data.gdp_growth_rate").cast(DoubleType()).alias("gdp_growth_rate")
    )

print(f"Unpivoted data created: {df_unpivoted.count()} records")

# Display sample of unpivoted data
print("\n--- Unpivoted Data Sample ---")
df_unpivoted.show(10, truncate=False)

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 13, Finished, Available, Finished)


Unpivoting year columns to create time series data...
Found year columns: 51 years
Year range: ['1980', '1981', '1982', '1983', '1984'] ... ['2026', '2027', '2028', '2029', '2030']
Unpivoted data created: 11628 records

--- Unpivoted Data Sample ---
+-------------------+-------------------+----+---------------+
|country_name       |country_mapped     |year|gdp_growth_rate|
+-------------------+-------------------+----+---------------+
|Antigua and Barbuda|Antigua and Barbuda|1980|8.2            |
|Antigua and Barbuda|Antigua and Barbuda|1981|3.8            |
|Antigua and Barbuda|Antigua and Barbuda|1982|-0.1           |
|Antigua and Barbuda|Antigua and Barbuda|1983|5.4            |
|Antigua and Barbuda|Antigua and Barbuda|1984|10.2           |
|Antigua and Barbuda|Antigua and Barbuda|1985|7.6            |
|Antigua and Barbuda|Antigua and Barbuda|1986|11.5           |
|Antigua and Barbuda|Antigua and Barbuda|1987|6.6            |
|Antigua and Barbuda|Antigua and Barbuda|1988|5.2       

In [13]:
# =============================================================================
# DATA CLEANING - GDP VALUES
# =============================================================================

print("\nCleaning GDP growth rate values...")

df_gdp_clean = df_unpivoted \
    .filter(col("year").isNotNull()) \
    .filter(col("year") >= 2025) \
    .filter(col("year") <= 2030) \
    .withColumn("gdp_growth_rate_clean",
               when(col("gdp_growth_rate").isNull(), 0.0)
               .when(col("gdp_growth_rate") < -50, -50.0)  # Cap extreme negative values
               .when(col("gdp_growth_rate") > 50, 50.0)    # Cap extreme positive values
               .otherwise(spark_round(col("gdp_growth_rate"), 2))) \
    .withColumn("is_recession",
               when(col("gdp_growth_rate_clean") < 0, True).otherwise(False)) \
    .withColumn("growth_category",
               when(col("gdp_growth_rate_clean") < -5, "Deep Recession")
               .when(col("gdp_growth_rate_clean") < 0, "Recession")
               .when(col("gdp_growth_rate_clean") < 2, "Slow Growth")
               .when(col("gdp_growth_rate_clean") < 5, "Moderate Growth")
               .when(col("gdp_growth_rate_clean") < 8, "Strong Growth")
               .otherwise("Very Strong Growth")) \
    .withColumn("economic_health",
               when(col("gdp_growth_rate_clean") < -2, "Poor")
               .when(col("gdp_growth_rate_clean") < 1, "Weak")
               .when(col("gdp_growth_rate_clean") < 3, "Moderate")
               .when(col("gdp_growth_rate_clean") < 6, "Good")
               .otherwise("Excellent"))

print(f"GDP data after cleaning (2025-2030 only): {df_gdp_clean.count()} records")

print("\n--- GDP Data Quality Check (2025-2030 European Focus) ---")
print(f"Records with valid GDP rates: {df_gdp_clean.filter(col('gdp_growth_rate_clean') != 0).count()}")
print(f"Recession periods: {df_gdp_clean.filter(col('is_recession') == True).count()}")
print(f"Year range: 2025-2030")

# Show available years
print("\n--- Available Years ---")
df_gdp_clean.groupBy("year").count().orderBy("year").show()

# Growth category distribution
print("\n--- Growth Category Distribution (2025-2030) ---")
df_gdp_clean.groupBy("growth_category").count().orderBy("count", ascending=False).show()


StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 15, Finished, Available, Finished)


Cleaning GDP growth rate values...
GDP data after cleaning (2025-2030 only): 1368 records

--- GDP Data Quality Check (2025-2030 European Focus) ---
Records with valid GDP rates: 1321
Recession periods: 13
Year range: 2025-2030

--- Available Years ---
+----+-----+
|year|count|
+----+-----+
|2025|  228|
|2026|  228|
|2027|  228|
|2028|  228|
|2029|  228|
|2030|  228|
+----+-----+


--- Growth Category Distribution (2025-2030) ---
+------------------+-----+
|   growth_category|count|
+------------------+-----+
|   Moderate Growth|  805|
|       Slow Growth|  329|
|     Strong Growth|  197|
|Very Strong Growth|   24|
|         Recession|   12|
|    Deep Recession|    1|
+------------------+-----+



In [16]:
# =============================================================================
# CREATE ECONOMIC INDICATORS DIMENSION
# =============================================================================

print("\nCreating economic indicators dimension...")

dim_economic_indicators = df_gdp_clean \
    .select("growth_category", "economic_health") \
    .distinct() \
    .withColumn("indicator_id", monotonically_increasing_id()) \
    .select(
        col("indicator_id"),
        col("growth_category"),
        col("economic_health"),
        
        # Business impact indicators
        when(col("economic_health").isin(["Good", "Excellent"]), "Positive")
        .when(col("economic_health") == "Moderate", "Neutral")
        .otherwise("Negative").alias("business_impact"),
        
        # Consumer spending indicator
        when(col("growth_category").contains("Strong"), "High")
        .when(col("growth_category").contains("Moderate"), "Medium")
        .when(col("growth_category").contains("Slow"), "Low")
        .otherwise("Very Low").alias("consumer_spending_level")
    ) \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("updated_at", current_timestamp()) \
    .withColumn("record_hash", 
               md5(concat_ws("|", 
                           col("growth_category"),
                           col("economic_health"),
                           col("business_impact"),
                           col("consumer_spending_level"))))

print(f"Economic indicators dimension created: {dim_economic_indicators.count()} records")

print("\n--- Economic Indicators Dimension ---")
dim_economic_indicators.show(truncate=False)

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 18, Finished, Available, Finished)


Creating economic indicators dimension...
Economic indicators dimension created: 10 records

--- Economic Indicators Dimension ---
+------------+------------------+---------------+---------------+-----------------------+--------------------------+--------------------------+--------------------------------+
|indicator_id|growth_category   |economic_health|business_impact|consumer_spending_level|created_at                |updated_at                |record_hash                     |
+------------+------------------+---------------+---------------+-----------------------+--------------------------+--------------------------+--------------------------------+
|0           |Slow Growth       |Moderate       |Neutral        |Low                    |2025-05-29 22:40:42.758448|2025-05-29 22:40:42.758448|f2e7cc86229d52361a9b2fba8fc47016|
|1           |Recession         |Weak           |Negative       |Very Low               |2025-05-29 22:40:42.758448|2025-05-29 22:40:42.758448|e098de2aa5046c8f0

In [17]:
# =============================================================================
# JOIN WITH GEOGRAPHY DIMENSION
# =============================================================================

print("\nJoining GDP data with geography dimension (ERP-based filtering)...")

# Create geography lookup
geography_lookup = df_geography.select(
    col("city_id"),
    col("country_name").alias("geo_country_name"),
    col("country_iso3")
).distinct()

# Join GDP data with geography - this automatically filters to ERP countries
df_gdp_with_geo = df_gdp_clean.alias("gdp") \
    .join(geography_lookup.alias("geo"),
          upper(col("gdp.country_mapped")) == upper(col("geo.geo_country_name")),
          "inner") \
    .select(
        col("gdp.*"),
        col("geo.city_id"),
        col("geo.geo_country_name"),
        col("geo.country_iso3")
    )

# Check geography matching - now only ERP countries
matched_countries = df_gdp_with_geo.select("country_mapped").distinct().count()
total_erp_countries = len(european_country_names)

print(f"ERP-based geography matching: {matched_countries}/{total_erp_countries} ERP countries found in GDP data")

# Show which ERP countries have GDP data
print("\n--- ERP Countries with GDP Data (2025-2030) ---")
df_gdp_with_geo.select("geo_country_name", "country_iso3").distinct() \
    .orderBy("geo_country_name").show(50, truncate=False)

# Show which ERP countries are missing from GDP data
print("\n--- ERP Countries Missing from GDP Data ---")
gdp_countries = [row['country_mapped'] for row in df_gdp_with_geo.select("country_mapped").distinct().collect()]
missing_countries = [country for country in european_country_names if country not in gdp_countries]
if missing_countries:
    print(f"Missing: {missing_countries}")
else:
    print("All ERP countries found in GDP data!")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 19, Finished, Available, Finished)


Joining GDP data with geography dimension (ERP-based filtering)...
ERP-based geography matching: 8/8 ERP countries found in GDP data

--- ERP Countries with GDP Data (2025-2030) ---
+----------------+------------+
|geo_country_name|country_iso3|
+----------------+------------+
|Belgium         |BEL         |
|France          |FRA         |
|Germany         |DEU         |
|Italy           |ITA         |
|Netherlands     |NLD         |
|Portugal        |PRT         |
|Spain           |ESP         |
|United Kingdom  |GBR         |
+----------------+------------+


--- ERP Countries Missing from GDP Data ---
All ERP countries found in GDP data!


In [18]:
# =============================================================================
# CREATE GDP GROWTH FACT TABLE
# =============================================================================

print("\nCreating GDP growth fact table...")

# Join with economic indicators dimension
fact_gdp_growth = df_gdp_with_geo.alias("gdp") \
    .join(dim_economic_indicators.alias("econ"),
          (col("gdp.growth_category") == col("econ.growth_category")) &
          (col("gdp.economic_health") == col("econ.economic_health")),
          "inner") \
    .select(
        # Primary Key
        concat_ws("-", col("gdp.country_mapped"), col("gdp.year")).alias("gdp_fact_id"),
        
        # Foreign Keys
        col("gdp.city_id"),
        col("econ.indicator_id"),
        
        # Dimensions
        col("gdp.year"),
        col("gdp.country_name").alias("original_country_name"),
        col("gdp.country_mapped").alias("standardized_country_name"),
        col("gdp.geo_country_name"),
        col("gdp.country_iso3"),
        
        # Facts/Measures
        col("gdp.gdp_growth_rate_clean").alias("gdp_growth_rate"),
        
        # Indicators
        col("gdp.is_recession"),
        col("gdp.growth_category"),
        col("gdp.economic_health"),
        
        # Derived measures for analytics
        when(col("gdp.gdp_growth_rate_clean") > 0, col("gdp.gdp_growth_rate_clean")).otherwise(0.0).alias("positive_growth_rate"),
        when(col("gdp.gdp_growth_rate_clean") < 0, spark_abs(col("gdp.gdp_growth_rate_clean"))).otherwise(0.0).alias("recession_depth"),
        
        # Business context
        when(col("gdp.economic_health").isin(["Good", "Excellent"]), True).otherwise(False).alias("favorable_for_business"),
        when(col("gdp.growth_category").contains("Strong"), True).otherwise(False).alias("strong_economy")
        
    ) \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("updated_at", current_timestamp()) \
    .withColumn("record_hash", 
               md5(concat_ws("|", 
                           col("gdp_fact_id"),
                           col("gdp_growth_rate"),
                           col("growth_category"),
                           col("economic_health"))))

print(f"GDP growth fact table created: {fact_gdp_growth.count()} records")

# Display sample of fact table
print("\n--- GDP Growth Fact Table Sample ---")
fact_gdp_growth.show(10, truncate=False)

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 20, Finished, Available, Finished)


Creating GDP growth fact table...
GDP growth fact table created: 126 records

--- GDP Growth Fact Table Sample ---
+------------+-------+------------+----+---------------------+-------------------------+----------------+------------+---------------+------------+---------------+---------------+--------------------+---------------+----------------------+--------------+--------------------------+--------------------------+--------------------------------+
|gdp_fact_id |city_id|indicator_id|year|original_country_name|standardized_country_name|geo_country_name|country_iso3|gdp_growth_rate|is_recession|growth_category|economic_health|positive_growth_rate|recession_depth|favorable_for_business|strong_economy|created_at                |updated_at                |record_hash                     |
+------------+-------+------------+----+---------------------+-------------------------+----------------+------------+---------------+------------+---------------+---------------+--------------------+

In [19]:
# =============================================================================
# DATA ANALYSIS
# =============================================================================

print("\n--- GDP Growth Analysis (2025-2030 European Focus) ---")

# Year-by-year analysis for forecast period
print("GDP Growth by Year (2025-2030):")
df_gdp_clean.groupBy("year") \
    .agg(
        avg("gdp_growth_rate").alias("avg_growth"),
        count("*").alias("country_count")
    ) \
    .orderBy("year").show()

# Economic health distribution for European countries
print("Economic Health Distribution (European Countries 2025-2030):")
fact_gdp_growth.groupBy("economic_health") \
    .count() \
    .orderBy("count", ascending=False).show()

# Country-specific analysis for Europe
print("Top European Countries by Average Growth (2025-2030):")
fact_gdp_growth.filter(col("country_iso3").isNotNull()) \
    .groupBy("standardized_country_name", "country_iso3") \
    .agg(avg("gdp_growth_rate").alias("avg_growth_rate")) \
    .orderBy("avg_growth_rate", ascending=False) \
    .show(15, truncate=False)

# Recession analysis for forecast period
recession_count = fact_gdp_growth.filter(col("is_recession") == True).count()
total_records = fact_gdp_growth.count()
print(f"Recession periods (2025-2030): {recession_count}/{total_records} ({recession_count/total_records*100:.1f}%)")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 21, Finished, Available, Finished)


--- GDP Growth Analysis (2025-2030 European Focus) ---
GDP Growth by Year (2025-2030):
+----+------------------+-------------+
|year|        avg_growth|country_count|
+----+------------------+-------------+
|2025|3.0427927927927914|          228|
|2026| 3.565765765765766|          228|
|2027| 3.493636363636363|          228|
|2028|3.5827272727272725|          228|
|2029|3.4540909090909095|          228|
|2030|3.2213636363636384|          228|
+----+------------------+-------------+

Economic Health Distribution (European Countries 2025-2030):
+---------------+-----+
|economic_health|count|
+---------------+-----+
|       Moderate|   94|
|           Weak|   32|
+---------------+-----+

Top European Countries by Average Growth (2025-2030):
+-------------------------+------------+------------------+
|standardized_country_name|country_iso3|avg_growth_rate   |
+-------------------------+------------+------------------+
|Spain                    |ESP         |1.8000000000000007|
|Portugal  

In [22]:
# =============================================================================
# SAVE DIMENSIONS AND FACT TABLE
# =============================================================================

print("\n=== SAVING TABLES ===")

# Save Economic Indicators Dimension
print(f"Saving economic indicators dimension to: {dim_economic_indicators_path}")
if DeltaTable.isDeltaTable(spark, dim_economic_indicators_path):
    print("Existing economic indicators table found. Executing merge...")
    delta_table = DeltaTable.forPath(spark, dim_economic_indicators_path)
    delta_table.alias("target") \
        .merge(dim_economic_indicators.alias("source"), 
               "target.growth_category = source.growth_category AND target.economic_health = source.economic_health") \
        .whenMatchedUpdate(
            condition="target.record_hash != source.record_hash",
            set={
                "business_impact": "source.business_impact",
                "consumer_spending_level": "source.consumer_spending_level",
                "updated_at": "source.updated_at",
                "record_hash": "source.record_hash"
            }) \
        .whenNotMatchedInsertAll() \
        .execute()
else:
    dim_economic_indicators.write.format("delta").mode("overwrite").save(dim_economic_indicators_path)
print("Economic indicators dimension saved!")

# Save GDP Growth Fact Table
print(f"Saving GDP growth fact table to: {fact_gdp_growth_path}")
if DeltaTable.isDeltaTable(spark, fact_gdp_growth_path):
    print("Existing GDP fact table found. Executing merge...")
    delta_table = DeltaTable.forPath(spark, fact_gdp_growth_path)
    delta_table.alias("target") \
        .merge(fact_gdp_growth.alias("source"), 
               "target.gdp_fact_id = source.gdp_fact_id") \
        .whenMatchedUpdate(
            condition="target.record_hash != source.record_hash",
            set={
                "gdp_growth_rate": "source.gdp_growth_rate",
                "is_recession": "source.is_recession",
                "growth_category": "source.growth_category",
                "economic_health": "source.economic_health",
                "positive_growth_rate": "source.positive_growth_rate",
                "recession_depth": "source.recession_depth",
                "favorable_for_business": "source.favorable_for_business",
                "strong_economy": "source.strong_economy",
                "updated_at": "source.updated_at",
                "record_hash": "source.record_hash"
            }) \
        .whenNotMatchedInsertAll() \
        .execute()
else:
    fact_gdp_growth.write.format("delta").mode("overwrite") \
        .partitionBy("year") \
        .save(fact_gdp_growth_path)
print("GDP growth fact table saved!")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 24, Finished, Available, Finished)


=== SAVING TABLES ===
Saving economic indicators dimension to: Tables/dim_economic_indicators
Existing economic indicators table found. Executing merge...
Economic indicators dimension saved!
Saving GDP growth fact table to: Tables/fact_gdp_growth
GDP growth fact table saved!


In [23]:
# =============================================================================
# OPTIMIZE TABLES
# =============================================================================

print("\nOptimizing Delta tables...")

tables_to_optimize = [
    dim_economic_indicators_path,
    fact_gdp_growth_path
]

for table_path in tables_to_optimize:
    try:
        spark.sql(f"OPTIMIZE delta.`{table_path}`")
        print(f"Optimized: {table_path}")
    except Exception as e:
        print(f"Optimization error for {table_path}: {str(e)}")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 25, Finished, Available, Finished)


Optimizing Delta tables...
Optimized: Tables/dim_economic_indicators
Optimized: Tables/fact_gdp_growth


In [24]:
# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("\n" + "="*60)
print("IMF GDP DATA ETL COMPLETED SUCCESSFULLY!")
print("="*60)

print(f"✅ Economic Indicators Dimension: {dim_economic_indicators.count()} records")
print(f"✅ GDP Growth Fact Table: {fact_gdp_growth.count()} records")

if df_geography is not None:
    matched = fact_gdp_growth.filter(col("city_id") != -1).count()
    print(f"✅ Geography Integration: {matched} records linked")

print("\n📊 Economic Insights (ERP European Markets 2025-2030):")
erp_countries_with_gdp = df_gdp_with_geo.select("geo_country_name").distinct().count()
print(f"• ERP European countries with GDP data: {erp_countries_with_gdp}")
print(f"• Forecast period: 2025-2030")
print(f"• Recession risk: {recession_count/total_records*100:.1f}% of country-year observations")

# Business recommendations based on ERP countries
strong_economies = fact_gdp_growth.filter(col("strong_economy") == True).select("standardized_country_name").distinct().count()
favorable_markets = fact_gdp_growth.filter(col("favorable_for_business") == True).select("standardized_country_name").distinct().count()
print(f"• Strong growth ERP markets: {strong_economies} countries")
print(f"• Favorable business conditions: {favorable_markets} countries")

print(f"\n📁 Tables saved to:")
print(f"• Economic Indicators: {dim_economic_indicators_path}")
print(f"• GDP Growth Facts: {fact_gdp_growth_path}")

print("\n🔗 Ready for economic impact analysis on your business data!")

StatementMeta(, 04ad417e-e5d4-4763-a3c6-6c5e218ec4f5, 26, Finished, Available, Finished)


IMF GDP DATA ETL COMPLETED SUCCESSFULLY!
✅ Economic Indicators Dimension: 10 records
✅ GDP Growth Fact Table: 126 records
✅ Geography Integration: 126 records linked

📊 Economic Insights (ERP European Markets 2025-2030):
• ERP European countries with GDP data: 8
• Forecast period: 2025-2030
• Recession risk: 0.0% of country-year observations
• Strong growth ERP markets: 0 countries
• Favorable business conditions: 0 countries

📁 Tables saved to:
• Economic Indicators: Tables/dim_economic_indicators
• GDP Growth Facts: Tables/fact_gdp_growth

🔗 Ready for economic impact analysis on your business data!
