In [25]:
from pyspark.sql.functions import col, when, trim, upper, coalesce, lit, current_timestamp, md5, concat_ws, count, collect_list
from pyspark.sql.types import *
from delta.tables import DeltaTable

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 27, Finished, Available, Finished)

In [16]:
# =============================================================================
# CONFIGURATION
# =============================================================================
# Every location this notebook reads from or writes to is declared here.

# Source Parquet files (read with spark.read.parquet in the load step).
cities_path = "Files/bronze/sales/cities.parquet"
countries_path = "Files/bronze/sales/countries.parquet"
# Target location of the geography dimension Delta table.
delta_output_path = "Tables/dim_geography"


StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 18, Finished, Available, Finished)

In [7]:
# =============================================================================
# ISO3 COUNTRY CODE MAPPING
# =============================================================================
# Static lookup from two-letter (ISO2) to three-letter (ISO3) country codes.

print("Creating ISO3 country code mapping...")

# Declared as ordered (ISO2, ISO3) pairs; dict() preserves this order, so
# anything that iterates the mapping sees the countries in the order below.
_iso2_to_iso3_pairs = [
    ("DE", "DEU"),  # Germany
    ("FR", "FRA"),  # France
    ("GB", "GBR"),  # United Kingdom
    ("IT", "ITA"),  # Italy
    ("ES", "ESP"),  # Spain
    ("NL", "NLD"),  # Netherlands
    ("BE", "BEL"),  # Belgium
    ("SE", "SWE"),  # Sweden
    ("PL", "POL"),  # Poland
    ("PT", "PRT"),  # Portugal
    ("IE", "IRL"),  # Ireland
    ("AT", "AUT"),  # Austria
    ("CH", "CHE"),  # Switzerland
    ("DK", "DNK"),  # Denmark
    ("NO", "NOR"),  # Norway
]
iso3_mapping = dict(_iso2_to_iso3_pairs)

print(f"ISO3 mapping created for {len(iso3_mapping)} countries")

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 9, Finished, Available, Finished)

Creating ISO3 country code mapping...
ISO3 mapping created for 15 countries


In [8]:
# =============================================================================
# LOAD SOURCE DATA
# =============================================================================
# Reads the two Parquet inputs and prints row counts, a short sample and the
# schema of each one for inspection.

print("Loading source data...")

# Cities
df_cities = spark.read.parquet(cities_path)
cities_loaded = df_cities.count()
print(f"Cities loaded: {cities_loaded} records")

# Countries
df_countries = spark.read.parquet(countries_path)
countries_loaded = df_countries.count()
print(f"Countries loaded: {countries_loaded} records")

# First rows of each input, shown without truncating column values.
sample_size = 5

print("\n--- Cities Sample ---")
df_cities.show(sample_size, truncate=False)

print("\n--- Countries Sample ---")
df_countries.show(sample_size, truncate=False)

# Column names, types and nullability of each input.
print("\n--- Cities Schema ---")
df_cities.printSchema()

print("\n--- Countries Schema ---")
df_countries.printSchema()

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 10, Finished, Available, Finished)

Loading source data...
Cities loaded: 21 records
Countries loaded: 15 records

--- Cities Sample ---
+------+----------+---------+
|CityID|CityName  |CountryID|
+------+----------+---------+
|16    |Amsterdam |6        |
|19    |Antwerp   |7        |
|14    |Barcelona |5        |
|1     |Berlin    |1        |
|9     |Birmingham|3        |
+------+----------+---------+
only showing top 5 rows


--- Countries Sample ---
+---------+--------------+-------+
|CountryID|CountryName   |ISOCode|
+---------+--------------+-------+
|1        |Germany       |DE     |
|2        |France        |FR     |
|3        |United Kingdom|GB     |
|4        |Italy         |IT     |
|5        |Spain         |ES     |
+---------+--------------+-------+
only showing top 5 rows


--- Cities Schema ---
root
 |-- CityID: integer (nullable = true)
 |-- CityName: string (nullable = true)
 |-- CountryID: integer (nullable = true)


--- Countries Schema ---
root
 |-- CountryID: integer (nullable = true)
 |-- CountryNam

In [9]:
# =============================================================================
# DATA QUALITY CHECKS - BEFORE CLEANING
# =============================================================================
# Read-only profiling of the raw inputs: NULL counts on the key columns and
# referential checks between cities and countries. Nothing is modified here.


def _null_count(frame, column_name):
    """Return the number of rows in ``frame`` whose ``column_name`` is NULL."""
    return frame.filter(col(column_name).isNull()).count()


print("\n--- Data Quality Analysis ---")

# Check for nulls in key fields
print("Cities Key Field Analysis:")
print(f"Null CityIDs: {_null_count(df_cities, 'CityID')}")
print(f"Null CityNames: {_null_count(df_cities, 'CityName')}")
print(f"Null CountryIDs: {_null_count(df_cities, 'CountryID')}")

print("\nCountries Key Field Analysis:")
print(f"Null CountryIDs: {_null_count(df_countries, 'CountryID')}")
print(f"Null CountryNames: {_null_count(df_countries, 'CountryName')}")
print(f"Null ISOCodes: {_null_count(df_countries, 'ISOCode')}")

# Check for orphaned records
print("\nJoin Analysis:")
# Distinct CountryID values present on each side of the relationship.
cities_countryids = df_cities.select("CountryID").distinct()
countries_countryids = df_countries.select("CountryID").distinct()

# Anti-join the city ROWS (not just their distinct CountryIDs) so the figure
# printed below really is a number of cities, as its label says. Previously
# several cities pointing at one missing country were reported as 1.
# City rows with a NULL CountryID never match and are therefore included.
orphaned_cities = df_cities.join(countries_countryids, "CountryID", "left_anti")
print(f"Cities without countries: {orphaned_cities.count()}")

# Country ids that no city references.
orphaned_countries = countries_countryids.join(cities_countryids, "CountryID", "left_anti")
print(f"Countries without cities: {orphaned_countries.count()}")


StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 11, Finished, Available, Finished)


--- Data Quality Analysis ---
Cities Key Field Analysis:
Null CityIDs: 0
Null CityNames: 0
Null CountryIDs: 0

Countries Key Field Analysis:
Null CountryIDs: 0
Null CountryNames: 0
Null ISOCodes: 0

Join Analysis:
Cities without countries: 0
Countries without cities: 7


In [10]:
# =============================================================================
# DATA CLEANING - CITIES
# =============================================================================
# Produces df_cities_clean from df_cities:
#   * rows with a NULL CityID or NULL CountryID are removed
#   * NULL city names become "Unknown City"; all other names are trimmed
#   * rows whose name is empty after trimming are removed
#   * one row is kept per CityID

print("\nStarting cities data cleaning...")

# Placeholder for NULL names, whitespace-trimmed value otherwise.
city_name_cleaned = (
    when(col("CityName").isNull(), "Unknown City")
    .otherwise(trim(col("CityName")))
)

df_cities_clean = (
    df_cities
    .filter(col("CityID").isNotNull() & col("CountryID").isNotNull())
    .withColumn("CityName", city_name_cleaned)
    .filter(col("CityName") != "")
    .dropDuplicates(["CityID"])
)

print(f"Cities after cleaning: {df_cities_clean.count()} records")

# Check for cleaning issues
print("\n--- Cities Cleaning Check ---")
empty_city_names = df_cities_clean.filter(col('CityName') == '').count()
unknown_city_names = df_cities_clean.filter(col('CityName') == 'Unknown City').count()
print(f"Empty city names after cleaning: {empty_city_names}")
print(f"'Unknown City' entries: {unknown_city_names}")

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 12, Finished, Available, Finished)


Starting cities data cleaning...
Cities after cleaning: 21 records

--- Cities Cleaning Check ---
Empty city names after cleaning: 0
'Unknown City' entries: 0


In [11]:
# =============================================================================
# DATA CLEANING - COUNTRIES
# =============================================================================
# Produces df_countries_clean from df_countries:
#   * rows with a NULL CountryID are removed
#   * NULL country names become "Unknown Country"; other names are trimmed
#   * ISO codes are trimmed and upper-cased; a NULL *or blank* code becomes
#     the placeholder "XX"
#   * rows whose name is empty after trimming are removed
#   * one row is kept per CountryID

print("\nStarting countries data cleaning...")

# Placeholder for NULL names, whitespace-trimmed value otherwise.
country_name_cleaned = (
    when(col("CountryName").isNull(), "Unknown Country")
    .otherwise(trim(col("CountryName")))
)

# Normalised ISO code. A blank / whitespace-only code is treated exactly like
# a NULL one: previously it was kept as "" and so slipped past the
# "Missing ISO codes (XX)" check printed below.
iso_code_normalized = upper(trim(col("ISOCode")))
iso_code_cleaned = (
    when(col("ISOCode").isNull() | (iso_code_normalized == ""), "XX")
    .otherwise(iso_code_normalized)
)

df_countries_clean = (
    df_countries
    .filter(col("CountryID").isNotNull())
    .withColumn("CountryName", country_name_cleaned)
    .withColumn("ISOCode", iso_code_cleaned)
    .filter(col("CountryName") != "")
    .dropDuplicates(["CountryID"])
)

print(f"Countries after cleaning: {df_countries_clean.count()} records")

# Check for cleaning issues
print("\n--- Countries Cleaning Check ---")
print(f"Empty country names: {df_countries_clean.filter(col('CountryName') == '').count()}")
print(f"'Unknown Country' entries: {df_countries_clean.filter(col('CountryName') == 'Unknown Country').count()}")
print(f"Missing ISO codes (XX): {df_countries_clean.filter(col('ISOCode') == 'XX').count()}")

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 13, Finished, Available, Finished)


Starting countries data cleaning...
Countries after cleaning: 15 records

--- Countries Cleaning Check ---
Empty country names: 0
'Unknown Country' entries: 0
Missing ISO codes (XX): 0


In [12]:
# =============================================================================
# ADD ISO3 COUNTRY CODES
# =============================================================================
# Turns the iso3_mapping dict into a two-column DataFrame and left-joins it to
# the cleaned countries; countries with no mapping entry get iso3_code "UNK".

print("\nAdding ISO3 country codes...")

# (iso2, iso3) tuples in the dict's insertion order.
iso3_mapping_data = list(iso3_mapping.items())
# Both columns are nullable strings.
iso3_mapping_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in ("iso2_code", "iso3_code")]
)

df_iso3_mapping = spark.createDataFrame(iso3_mapping_data, iso3_mapping_schema)

print("--- ISO3 Mapping ---")
df_iso3_mapping.show(truncate=False)

# Left join keeps every cleaned country, mapped or not.
countries_aliased = df_countries_clean.alias("c")
mapping_aliased = df_iso3_mapping.alias("iso3")
iso_codes_match = col("c.ISOCode") == col("iso3.iso2_code")

df_countries_with_iso3 = (
    countries_aliased
    .join(mapping_aliased, iso_codes_match, "left")
    .select(
        col("c.CountryID"),
        col("c.CountryName"),
        col("c.ISOCode").alias("iso2_code"),
        coalesce(col("iso3.iso3_code"), lit("UNK")).alias("iso3_code"),
    )
)

print(f"Countries with ISO3 codes: {df_countries_with_iso3.count()} records")

# Report (and list, if any) the countries that fell back to "UNK".
print("\n--- ISO3 Mapping Results ---")
countries_missing_iso3 = df_countries_with_iso3.filter(col("iso3_code") == "UNK")
unmapped_countries = countries_missing_iso3.count()
print(f"Countries without ISO3 mapping: {unmapped_countries}")

if unmapped_countries > 0:
    print("Countries missing ISO3 codes:")
    countries_missing_iso3.show(truncate=False)

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 14, Finished, Available, Finished)


Adding ISO3 country codes...
--- ISO3 Mapping ---
+---------+---------+
|iso2_code|iso3_code|
+---------+---------+
|DE       |DEU      |
|FR       |FRA      |
|GB       |GBR      |
|IT       |ITA      |
|ES       |ESP      |
|NL       |NLD      |
|BE       |BEL      |
|SE       |SWE      |
|PL       |POL      |
|PT       |PRT      |
|IE       |IRL      |
|AT       |AUT      |
|CH       |CHE      |
|DK       |DNK      |
|NO       |NOR      |
+---------+---------+

Countries with ISO3 codes: 15 records

--- ISO3 Mapping Results ---
Countries without ISO3 mapping: 0


In [13]:
# =============================================================================
# CREATE GEOGRAPHY DIMENSION
# =============================================================================
# Left-joins the cleaned cities to the ISO3-enriched countries (so every city
# is kept), renames columns to snake_case, fills missing country attributes
# with placeholders, and adds audit timestamps plus an MD5 change hash.

print("\nCreating geography dimension...")

# Country attributes with their fallbacks for cities that found no country.
country_name_or_default = coalesce(col("co.CountryName"), lit("Unknown Country"))
country_iso2_or_default = coalesce(col("co.iso2_code"), lit("XX"))
country_iso3_or_default = coalesce(col("co.iso3_code"), lit("UNK"))

# Columns (by output name) that feed record_hash, in hashing order.
hash_source_columns = [
    "city_id",
    "city_name",
    "country_id",
    "country_name",
    "country_iso2",
    "country_iso3",
]

cities_aliased = df_cities_clean.alias("ci")
countries_aliased = df_countries_with_iso3.alias("co")

dim_geography = (
    cities_aliased
    .join(countries_aliased, col("ci.CountryID") == col("co.CountryID"), "left")
    .select(
        # Primary key
        col("ci.CityID").alias("city_id"),
        # City information
        col("ci.CityName").alias("city_name"),
        # Country information
        col("ci.CountryID").alias("country_id"),
        country_name_or_default.alias("country_name"),
        country_iso2_or_default.alias("country_iso2"),
        country_iso3_or_default.alias("country_iso3"),
        # Composite display fields, e.g. "Berlin, Germany" and "Berlin-DEU"
        concat_ws(", ", col("ci.CityName"), country_name_or_default).alias("city_country"),
        concat_ws("-", col("ci.CityName"), country_iso3_or_default).alias("city_iso3"),
    )
    .withColumn("created_at", current_timestamp())
    .withColumn("updated_at", current_timestamp())
    # MD5 over the "|"-joined business columns; compared during the merge step.
    .withColumn(
        "record_hash",
        md5(concat_ws("|", *[col(column_name) for column_name in hash_source_columns])),
    )
)

print(f"Geography dimension created: {dim_geography.count()} records")

# Display sample of geography dimension
print("\n--- Geography Dimension Sample ---")
dim_geography.show(10, truncate=False)

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 15, Finished, Available, Finished)


Creating geography dimension...
Geography dimension created: 21 records

--- Geography Dimension Sample ---
+-------+----------+----------+--------------+------------+------------+--------------------------+--------------+--------------------------+--------------------------+--------------------------------+
|city_id|city_name |country_id|country_name  |country_iso2|country_iso3|city_country              |city_iso3     |created_at                |updated_at                |record_hash                     |
+-------+----------+----------+--------------+------------+------------+--------------------------+--------------+--------------------------+--------------------------+--------------------------------+
|1      |Berlin    |1         |Germany       |DE          |DEU         |Berlin, Germany           |Berlin-DEU    |2025-05-29 18:52:08.539526|2025-05-29 18:52:08.539526|987d50189e36d37fe3766f62338f9c9f|
|2      |Munich    |1         |Germany       |DE          |DEU         |Munich, Ger

In [14]:
# =============================================================================
# DATA QUALITY VALIDATION
# =============================================================================
# Counts NULL keys and placeholder values in dim_geography, then prints how
# many cities each country contributes.

print("\n--- Geography Dimension Validation ---")

# Predicates for the individual checks.
city_id_is_null = col("city_id").isNull()
city_name_is_null = col("city_name").isNull()
country_is_unknown = col("country_name") == "Unknown Country"
iso3_is_unmapped = col("country_iso3") == "UNK"

# Each count() below runs its own Spark job.
total_records = dim_geography.count()
null_city_ids = dim_geography.filter(city_id_is_null).count()
null_city_names = dim_geography.filter(city_name_is_null).count()
unknown_countries = dim_geography.filter(country_is_unknown).count()
unmapped_iso3 = dim_geography.filter(iso3_is_unmapped).count()

print(f"Total geography records: {total_records}")
print(f"Null city IDs: {null_city_ids}")
print(f"Null city names: {null_city_names}")
print(f"Unknown countries: {unknown_countries}")
print(f"Unmapped ISO3 codes: {unmapped_iso3}")

# Cities per (country, ISO3) pair, largest groups first.
print("\n--- Country Distribution ---")
cities_per_country = dim_geography.groupBy("country_name", "country_iso3").count()
country_stats = cities_per_country.orderBy(col("count").desc())
country_stats.show(20, truncate=False)

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 16, Finished, Available, Finished)


--- Geography Dimension Validation ---
Total geography records: 21
Null city IDs: 0
Null city names: 0
Unknown countries: 0
Unmapped ISO3 codes: 0

--- Country Distribution ---
+--------------+------------+-----+
|country_name  |country_iso3|count|
+--------------+------------+-----+
|France        |FRA         |3    |
|Spain         |ESP         |3    |
|Germany       |DEU         |3    |
|United Kingdom|GBR         |3    |
|Italy         |ITA         |3    |
|Portugal      |PRT         |2    |
|Belgium       |BEL         |2    |
|Netherlands   |NLD         |2    |
+--------------+------------+-----+



In [17]:

# =============================================================================
# SAVE AS DELTA TABLE (INCREMENTAL)
# =============================================================================
# If a Delta table already exists at delta_output_path, upsert dim_geography
# into it keyed on city_id; otherwise create the table from dim_geography.

print(f"\nSaving geography dimension to: {delta_output_path}")

# Columns refreshed when a matched row has changed. city_id (the merge key)
# and created_at are deliberately absent, so an update keeps the original
# creation timestamp of the target row.
_updatable_columns = [
    "city_name",
    "country_id",
    "country_name",
    "country_iso2",
    "country_iso3",
    "city_country",
    "city_iso3",
    "updated_at",
    "record_hash",
]

# Check if Delta table already exists
if DeltaTable.isDeltaTable(spark, delta_output_path):
    print("Existing Delta table found. Executing merge...")

    # Load existing Delta table
    delta_table = DeltaTable.forPath(spark, delta_output_path)

    # Execute merge (upsert) based on city_id.
    # The change test uses the null-safe equality operator (<=>): a plain
    # `!=` evaluates to NULL when target.record_hash is NULL, which would
    # leave such rows permanently un-updated.
    delta_table.alias("target") \
        .merge(dim_geography.alias("source"),
               "target.city_id = source.city_id") \
        .whenMatchedUpdate(
            condition="NOT (target.record_hash <=> source.record_hash)",
            set={column_name: f"source.{column_name}" for column_name in _updatable_columns}) \
        .whenNotMatchedInsertAll() \
        .execute()

    print("Merge executed successfully!")

else:
    print("Creating new Delta table...")
    dim_geography.write \
        .format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", "true") \
        .save(delta_output_path)
    print("Delta table created successfully!")

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 19, Finished, Available, Finished)


Saving geography dimension to: Tables/dim_geography
Creating new Delta table...
Delta table created successfully!


In [18]:
# =============================================================================
# OPTIMIZE DELTA TABLE
# =============================================================================
# Best-effort maintenance step: a failure is reported on stdout and does not
# stop the notebook.

print("\nOptimizing Delta table...")

# Path-based table reference, quoted with backticks.
optimize_statement = f"OPTIMIZE delta.`{delta_output_path}`"

try:
    spark.sql(optimize_statement)
except Exception as e:
    print(f"Optimization error: {str(e)}")
else:
    print("Optimization completed!")

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 20, Finished, Available, Finished)


Optimizing Delta table...
Optimization completed!


In [20]:
# =============================================================================
# FINAL DATA QUALITY CHECKS
# =============================================================================
# Reads the Delta table back from delta_output_path and reports row and
# distinct-value counts, a sample of rows, and the number of rows per ISO3 code.


def _distinct_count(frame, column_name):
    """Return the number of distinct values (NULL included) in ``column_name``."""
    return frame.select(column_name).distinct().count()


print("\nRunning final data quality checks...")

# Verify against what was actually persisted, not the in-memory DataFrame.
df_final = spark.read.format("delta").load(delta_output_path)

# Final statistics
final_total_records = df_final.count()
distinct_cities = _distinct_count(df_final, "city_id")
distinct_countries = _distinct_count(df_final, "country_id")
distinct_iso3_codes = _distinct_count(df_final, "country_iso3")

print(f"Final total records: {final_total_records}")
print(f"Distinct cities: {distinct_cities}")
print(f"Distinct countries: {distinct_countries}")
print(f"Distinct ISO3 codes: {distinct_iso3_codes}")

# Show final data sample
print("\n--- Final Geography Dimension Sample ---")
df_final.show(10, truncate=False)

# Rows per ISO3 code, most frequent first.
print("\n--- ISO3 Code Summary ---")
rows_per_iso3 = df_final.groupBy("country_iso3").count()
iso3_summary = rows_per_iso3.orderBy(col("count").desc())
iso3_summary.show(truncate=False)

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 22, Finished, Available, Finished)


Running final data quality checks...
Final total records: 21
Distinct cities: 21
Distinct countries: 8
Distinct ISO3 codes: 8

--- Final Geography Dimension Sample ---
+-------+----------+----------+--------------+------------+------------+--------------------------+--------------+-------------------------+-------------------------+--------------------------------+
|city_id|city_name |country_id|country_name  |country_iso2|country_iso3|city_country              |city_iso3     |created_at               |updated_at               |record_hash                     |
+-------+----------+----------+--------------+------------+------------+--------------------------+--------------+-------------------------+-------------------------+--------------------------------+
|1      |Berlin    |1         |Germany       |DE          |DEU         |Berlin, Germany           |Berlin-DEU    |2025-05-29 18:53:07.29665|2025-05-29 18:53:07.29665|987d50189e36d37fe3766f62338f9c9f|
|2      |Munich    |1         |

In [26]:
# =============================================================================
# SUMMARY
# =============================================================================
# Prints a completion banner and the statistics gathered by the final data
# quality checks, then a per-country city count read from the saved table.

banner_line = "=" * 50

print(banner_line)
print("GEOGRAPHY DIMENSION ETL COMPLETED SUCCESSFULLY!")
print(f"Records processed: {final_total_records}")
print(f"Table saved at: {delta_output_path}")
print("ISO3 standardization applied")
print(banner_line)

# Summary dictionary for further analysis (key order is preserved on print).
summary_stats = dict(
    total_records=final_total_records,
    distinct_cities=distinct_cities,
    distinct_countries=distinct_countries,
    distinct_iso3_codes=distinct_iso3_codes,
    table_path=delta_output_path,
)

print(f"Summary: {summary_stats}")

# Optional: cities grouped by country. The list of city names is aggregated
# as well, but only the counts are displayed below.
print("\n--- Cities by Country (ISO3) ---")
city_count_col = count("city_id").alias("city_count")
city_names_col = collect_list("city_name").alias("cities")

cities_by_country = (
    df_final
    .groupBy("country_name", "country_iso3")
    .agg(city_count_col, city_names_col)
    .orderBy("country_name")
)

# Show just the counts first
cities_by_country.select("country_name", "country_iso3", "city_count").show(20, truncate=False)

StatementMeta(, d46ca668-9d78-484c-aa9a-6d1ab9aab1be, 28, Finished, Available, Finished)

GEOGRAPHY DIMENSION ETL COMPLETED SUCCESSFULLY!
Records processed: 21
Table saved at: Tables/dim_geography
ISO3 standardization applied
Summary: {'total_records': 21, 'distinct_cities': 21, 'distinct_countries': 8, 'distinct_iso3_codes': 8, 'table_path': 'Tables/dim_geography'}

--- Cities by Country (ISO3) ---
+--------------+------------+----------+
|country_name  |country_iso3|city_count|
+--------------+------------+----------+
|Belgium       |BEL         |2         |
|France        |FRA         |3         |
|Germany       |DEU         |3         |
|Italy         |ITA         |3         |
|Netherlands   |NLD         |2         |
|Portugal      |PRT         |2         |
|Spain         |ESP         |3         |
|United Kingdom|GBR         |3         |
+--------------+------------+----------+

