In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 3, Finished, Available, Finished)

In [2]:
# Configuration
# Path of the source Parquet file; load_and_filter_data() reads it with spark.read.parquet.
SOURCE_PATH = "Files/bronze/bigmacdata/big_mac_hist_data.parquet"
# Name of the Delta table that main() passes to save_to_delta_table().
DELTA_TABLE_NAME = "currency"

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 4, Finished, Available, Finished)

In [3]:
# Three-letter ISO country codes of the European countries kept by the
# pipeline, mapped to their English names. Only the keys are used for
# filtering (see filter_european_cities); insertion order is preserved.
european_countries = dict(
    AUT='Austria',
    BEL='Belgium',
    BGR='Bulgaria',
    HRV='Croatia',
    CZE='Czech Republic',
    DNK='Denmark',
    EST='Estonia',
    FIN='Finland',
    FRA='France',
    DEU='Germany',
    GRC='Greece',
    HUN='Hungary',
    IRL='Ireland',
    ITA='Italy',
    LVA='Latvia',
    LTU='Lithuania',
    LUX='Luxembourg',
    MLT='Malta',
    NLD='Netherlands',
    POL='Poland',
    PRT='Portugal',
    ROU='Romania',
    SVK='Slovakia',
    SVN='Slovenia',
    ESP='Spain',
    SWE='Sweden',
    CHE='Switzerland',
    GBR='United Kingdom',
    NOR='Norway',
    ISL='Iceland',
)

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 5, Finished, Available, Finished)

In [4]:
def load_and_filter_data():
    """
    Read the Big Mac history Parquet file at SOURCE_PATH into a DataFrame.

    Prints the row count and the schema as a side effect. Despite the
    function name, no filtering is applied here.

    Returns:
        The DataFrame produced by spark.read.parquet(SOURCE_PATH).
    """
    print("📂 Loading Big Mac historical data...")

    raw_df = spark.read.parquet(SOURCE_PATH)

    # count() is a Spark action, so this line scans the source file.
    record_total = raw_df.count()
    print(f"📊 Original dataset: {record_total} records")

    print("\n🔍 Original Schema:")
    raw_df.printSchema()

    return raw_df

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 6, Finished, Available, Finished)

In [5]:
def filter_european_cities(df):
    """
    Keep only the rows whose iso_a3 code is a key of european_countries.

    Prints the number of matching rows and the distinct (iso_a3, name)
    pairs that were found, ordered by name.

    Args:
        df: DataFrame with at least the columns iso_a3 and name.

    Returns:
        The filtered DataFrame.
    """
    print("🇪🇺 Filtering European cities...")

    # Only the dictionary keys (ISO codes) take part in the filter.
    iso_whitelist = [code for code in european_countries]
    is_european = col("iso_a3").isin(iso_whitelist)
    european_df = df.filter(is_european)

    print(f"✅ European records: {european_df.count()}")

    print("\n🏳️ European countries found:")
    (
        european_df
        .select("iso_a3", "name")
        .distinct()
        .orderBy("name")
        .show(truncate=False)
    )

    return european_df

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 7, Finished, Available, Finished)

In [6]:
def filter_recent_dates(df, months_back=6):
    """
    Keep only the rows from the last ``months_back`` months of available data.

    The window is anchored on the latest date present in ``df`` (not on
    today's date), and a month is approximated as 30 days.

    Args:
        df: DataFrame with a ``date`` column that to_date() can parse.
        months_back: Number of 30-day months to keep, counted back from the
            latest date in the data. Must be >= 0.

    Returns:
        A DataFrame whose ``date`` column has been converted with to_date(),
        restricted to rows on or after the window start. When no date could
        be parsed, an empty DataFrame with that same schema is returned.

    Raises:
        ValueError: If ``months_back`` is negative.
    """
    if months_back < 0:
        raise ValueError(f"months_back must be >= 0, got {months_back}")

    print(f"📅 Finding last {months_back} months of available data...")

    # Convert the date column once; every later step works on the typed column.
    date_df = df.withColumn("date", to_date(col("date")))

    # min/max are the Spark aggregates brought in by the star import of
    # pyspark.sql.functions, not the Python builtins.
    date_range = date_df.select(
        min("date").alias("min_date"),
        max("date").alias("max_date")
    ).collect()[0]

    print(f"📊 Available data range: {date_range['min_date']} to {date_range['max_date']}")

    if date_range['max_date'] is None:
        print("❌ No valid dates found in dataset!")
        # Return the typed frame so both exits of the function share one schema.
        return date_df.limit(0)

    # Window start: months_back months (30 days each) before the latest date.
    latest_date = date_range['max_date']
    start_date = latest_date - timedelta(days=30 * months_back)

    print(f"🎯 Filtering from {start_date} to {latest_date}")

    # No upper bound is needed: latest_date is already the maximum.
    filtered_df = date_df.filter(
        col("date") >= lit(start_date)
    )

    filtered_count = filtered_df.count()
    print(f"✅ Records in last {months_back} months: {filtered_count}")

    # Show actual date range after filtering
    if filtered_count > 0:
        actual_range = filtered_df.select(
            min("date").alias("min_date"),
            max("date").alias("max_date")
        ).collect()[0]
        print(f"📅 Filtered data range: {actual_range['min_date']} to {actual_range['max_date']}")

    return filtered_df

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 8, Finished, Available, Finished)

In [7]:
def _parse_numeric(column_name):
    """Return a double Column parsed from a string column; 0.0 when missing or unparseable."""
    # First signed decimal number in the text, with an optional exponent.
    number_pattern = r"[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?"
    # Drop thousands separators and whitespace before looking for the number.
    without_separators = regexp_replace(col(column_name), r"[,\s]", "")
    # Group 0 is the whole match; regexp_extract yields "" when nothing matches,
    # which casts to null and is then mapped to 0.0.
    first_number = regexp_extract(without_separators, number_pattern, 0)
    return coalesce(first_number.cast(DoubleType()), lit(0.0))


def clean_and_format_data(df):
    """
    Clean the price columns and add derived analysis columns.

    ``local_price`` and ``dollar_ex`` are parsed to doubles, keeping sign and
    exponent so that negative or scientific-notation values are not silently
    turned into different positive numbers. Missing or unparseable values
    become 0.0, and every row whose price or exchange rate is not strictly
    positive is dropped. A missing exchange rate is therefore treated as
    invalid instead of being assumed to be 1.0.

    Args:
        df: DataFrame with the columns name, iso_a3, currency_code, date,
            local_price and dollar_ex.

    Returns:
        A DataFrame with the columns country_name, country_code,
        currency_code, date, local_price_cleaned, dollar_exchange_rate,
        usd_price, year, month, quarter, price_category, european_region and
        created_at.

        NOTE: when the input is empty it is returned unchanged, and when no
        row survives cleaning the result lacks the derived columns (year
        through created_at). Callers that need the full schema must check
        for these cases.
    """
    print("🧹 Cleaning and formatting data...")

    # limit(1) keeps the emptiness probe from counting the whole input.
    if df.limit(1).count() == 0:
        print("❌ No data to clean - returning empty DataFrame")
        return df

    # Rename the identifying columns and parse the two numeric string columns.
    cleaned_df = df.select(
        col("name").alias("country_name"),
        col("iso_a3").alias("country_code"),
        col("currency_code"),
        col("date"),
        _parse_numeric("local_price").alias("local_price_cleaned"),
        _parse_numeric("dollar_ex").alias("dollar_exchange_rate"),
    )

    # Add calculated USD price
    cleaned_df = cleaned_df.withColumn(
        "usd_price",
        # The guard avoids dividing by a zero exchange rate.
        when((col("dollar_exchange_rate") > 0) & (col("local_price_cleaned") > 0),
             col("local_price_cleaned") / col("dollar_exchange_rate"))
        .otherwise(0.0)
    ).filter(
        # Remove invalid records
        (col("local_price_cleaned") > 0) &
        (col("dollar_exchange_rate") > 0)
    )

    if cleaned_df.limit(1).count() == 0:
        print("❌ No valid records after cleaning")
        return cleaned_df

    # Add derived columns
    enhanced_df = cleaned_df.withColumn(
        "year", year(col("date"))
    ).withColumn(
        "month", month(col("date"))
    ).withColumn(
        "quarter", quarter(col("date"))
    ).withColumn(
        # Price category by USD price: <3 Low, <5 Medium, <7 High, else Very High
        "price_category",
        when(col("usd_price") < 3.0, "Low")
        .when(col("usd_price") < 5.0, "Medium")
        .when(col("usd_price") < 7.0, "High")
        .otherwise("Very High")
    ).withColumn(
        # European region classification
        "european_region",
        when(col("country_code").isin(["GBR", "IRL"]), "Western Europe")
        .when(col("country_code").isin(["DEU", "FRA", "NLD", "BEL", "LUX", "AUT", "CHE"]), "Central Europe")
        .when(col("country_code").isin(["ITA", "ESP", "PRT", "GRC", "MLT"]), "Southern Europe")
        .when(col("country_code").isin(["SWE", "DNK", "NOR", "FIN", "ISL"]), "Northern Europe")
        .when(col("country_code").isin(["POL", "CZE", "SVK", "HUN", "SVN", "HRV", "ROU", "BGR", "EST", "LVA", "LTU"]), "Eastern Europe")
        .otherwise("Other Europe")
    ).withColumn(
        "created_at", current_timestamp()
    )

    print(f"✅ Cleaned records: {enhanced_df.count()}")

    return enhanced_df

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 9, Finished, Available, Finished)

In [8]:
def validate_data_quality(df):
    """
    Print informational data-quality reports for the cleaned dataset.

    Three reports are shown: null counts for the key columns, min/max/avg of
    the local and USD prices, and the number of records per country and
    region.

    Args:
        df: DataFrame with the columns country_name, european_region,
            local_price_cleaned, dollar_exchange_rate and usd_price.

    Returns:
        Always True; the checks only print and never fail.
    """
    print("🔍 Data Quality Validation:")

    # when() without otherwise() yields null for non-matching rows and count()
    # skips nulls, so each expression counts the null cells of one column.
    null_checks = df.select([
        count(when(col(c).isNull(), c)).alias(f"{c}_nulls")
        for c in ["country_name", "local_price_cleaned", "dollar_exchange_rate", "usd_price"]
    ])

    print("❌ Null values check:")
    null_checks.show()

    # Price range validation (min/max/avg are the Spark aggregates).
    price_stats = df.select(
        min("local_price_cleaned").alias("min_local_price"),
        max("local_price_cleaned").alias("max_local_price"),
        avg("local_price_cleaned").alias("avg_local_price"),
        min("usd_price").alias("min_usd_price"),
        max("usd_price").alias("max_usd_price"),
        avg("usd_price").alias("avg_usd_price")
    )

    print("💰 Price statistics:")
    price_stats.show()

    # Country distribution, largest groups first
    country_dist = df.groupBy("country_name", "european_region") \
                     .count() \
                     .orderBy(desc("count"))

    print("🏳️ Records per country:")
    country_dist.show(truncate=False)

    return True

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 10, Finished, Available, Finished)

In [9]:
def save_to_delta_table(df, table_name):
    """
    Overwrite the Delta table ``table_name`` with the contents of ``df``.

    The write uses overwrite mode with the mergeSchema option enabled.
    Afterwards the table is read back and its row count is printed.

    Args:
        df: DataFrame to persist.
        table_name: Table name passed to saveAsTable.
    """
    print(f"💾 Saving to Delta table: {table_name}")

    delta_writer = (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
    )
    delta_writer.saveAsTable(table_name)

    print(f"✅ Delta table '{table_name}' created successfully!")

    # Read the table back to confirm what was actually stored.
    stored_table = spark.table(table_name)
    final_count = stored_table.count()
    print(f"📊 Final table records: {final_count}")


StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 11, Finished, Available, Finished)

In [10]:
def main():
    """
    Run the pipeline end to end.

    Steps: load the source data, keep European rows, keep the most recent
    data (filter_recent_dates with months_back=3), clean and enrich it,
    print the quality reports, and save the result to DELTA_TABLE_NAME.

    Any exception is reported with a short message and then re-raised.
    """
    banner = "=" * 70

    print("🚀 Starting Big Mac European Cities Data Processing")
    print(banner)

    try:
        # Stages 1-4: load, restrict to Europe, restrict to recent dates, clean.
        raw_df = load_and_filter_data()
        europe_df = filter_european_cities(raw_df)
        latest_df = filter_recent_dates(europe_df, months_back=3)
        final_df = clean_and_format_data(latest_df)

        # Stage 5: informational quality reports.
        validate_data_quality(final_df)

        # Stage 6: persist the result.
        save_to_delta_table(final_df, DELTA_TABLE_NAME)

        print(banner)
        print("✅ SUCCESS! European Big Mac data processed and saved")
        print(f"📊 Delta table: {DELTA_TABLE_NAME}")
        print("🔄 Ready for analysis and Power BI integration")

        # Preview what was written by reading the table back.
        print("\n📋 Sample of processed data:")
        spark.table(DELTA_TABLE_NAME).show(10, truncate=False)

    except Exception as err:
        print(f"❌ Error in processing: {str(err)}")
        raise err

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 12, Finished, Available, Finished)

In [11]:
# Entry point: execute the whole pipeline (load, filter, clean, validate, save).
main()

StatementMeta(, b9bc65bd-de9d-47e1-b5e6-951720ba1bab, 13, Finished, Available, Finished)

🚀 Starting Big Mac European Cities Data Processing
📂 Loading Big Mac historical data...
📊 Original dataset: 2302 records

🔍 Original Schema:
root
 |-- name: string (nullable = true)
 |-- iso_a3: string (nullable = true)
 |-- currency_code: string (nullable = true)
 |-- local_price: string (nullable = true)
 |-- dollar_ex: string (nullable = true)
 |-- GDP_dollar: string (nullable = true)
 |-- GDP_local: string (nullable = true)
 |-- date: string (nullable = true)

🇪🇺 Filtering European cities...
✅ European records: 752

🏳️ European countries found:
+------+--------------+
|iso_a3|name          |
+------+--------------+
|AUT   |Austria       |
|BEL   |Belgium       |
|GBR   |Britain       |
|HRV   |Croatia       |
|CZE   |Czech Republic|
|DNK   |Denmark       |
|EST   |Estonia       |
|FIN   |Finland       |
|FRA   |France        |
|DEU   |Germany       |
|GRC   |Greece        |
|HUN   |Hungary       |
|IRL   |Ireland       |
|ITA   |Italy         |
|LVA   |Latvia        |
|LTU   |Lithu