# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run `%help` in a new cell to see the available notebook commands ("magics"). The cell below sets the session timeout to 20 minutes.


In [2]:
%timeout 20

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current timeout is None minutes.
timeout has been set to 20 minutes.


In [4]:
%%configure
{
    "--job-bookmark-option": "job-bookmark-enable"
}

The following configurations have been updated: {'--job-bookmark-option': 'job-bookmark-enable'}


In [1]:
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.job import Job
import re
import unicodedata


# Initialize all the variables needed
source_bucket = "data-engineering-project-8433-3658-8863"
folder_name = "bronze_data"
processed_folder_name = "silver_data"

# Set up catalog parameters
glue_database = "data-engineering-project-glue-database"
mapping_table_name = "raw_data_french_region_city_mapping_20251116_210949_parquet"

# Set up the spark contexts, glue contexts and initialize job
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

# Notebook fallback: an interactive session is not started with a JOB_NAME
# argument, so resolving it fails there. In that case start from an empty
# argument dict and use a fixed job name instead.
# `args` is always (re)assigned here, so a re-run of this cell never picks up
# a stale value left in the kernel by an earlier execution.
try:
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
except Exception:
    # Catch only ordinary errors; KeyboardInterrupt / SystemExit still propagate.
    args = {}

JOB_NAME = args.get('JOB_NAME', "notebook-job-regionmapping-transform")

job.init(JOB_NAME, args)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Timeout: 20
Session ID: 186fa250-4e66-496f-8a95-671f206ee4a3
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--job-bookmark-option job-bookmark-enable
Waiting for session 186fa250-4e66-496f-8a95-671f206ee4a3 to get into ready status...
Session 186fa250-4e66-496f-8a95-671f206ee4a3 has been created.



In [2]:
# ============================================
# Helper Functions (from analytics code)
# ============================================

def strip_accents(s):
    """Return ``str(s)`` with combining marks (accents) removed.

    The text is decomposed with Unicode NFKD and every combining character is
    dropped. ``None`` yields the empty string.
    """
    if s is None:
        return ""
    decomposed = unicodedata.normalize('NFKD', str(s))
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return ''.join(base_chars)

def canonicalize(txt):
    """Reduce a label to a lowercase, accent-free, space-separated key.

    ``None`` and any value whose lowercased string form is ``'nan'`` give "".
    Otherwise the text is lowercased, trimmed, stripped of accents, has its
    punctuation separators replaced by spaces, and has runs of whitespace
    collapsed to single spaces.
    """
    if txt is None:
        return ""
    lowered = str(txt).lower()
    if lowered == 'nan':
        return ""
    # Every separator character becomes a single space.
    separators_to_space = str.maketrans(dict.fromkeys("'’-_.,()/\\", " "))
    spaced = strip_accents(lowered.strip()).translate(separators_to_space)
    return " ".join(spaced.split())

def normalize_city(s):
    """Normalize a city name for matching.

    Lowercases, strips accents, replaces every character other than a-z,
    whitespace and hyphen with a space, collapses whitespace, and expands the
    abbreviations "ste"/"st" to "sainte"/"saint" when they appear as a
    space-delimited word inside the name or at its start. ``None`` gives "".
    """
    if s is None:
        return ""
    text = strip_accents(str(s).lower())
    text = re.sub(r"[^a-z\s\-]", " ", text)
    text = " ".join(text.split())
    # Abbreviations in the middle of the name.
    text = text.replace(" ste ", " sainte ").replace(" st ", " saint ")
    # Abbreviations at the very start of the name ("ste" is checked first).
    for abbreviation, expansion in (("ste ", "sainte "), ("st ", "saint ")):
        if text.startswith(abbreviation):
            text = expansion + text[len(abbreviation):]
    return text

# Region mapping configuration (exactly from analytics code).
# Keys are canonicalized region labels; values are the canonical name of the
# region they map to. A value of None marks a label that is excluded.
OLD_TO_NEW = {
    # -> nouvelle aquitaine
    "aquitaine": "nouvelle aquitaine",
    "poitou charentes": "nouvelle aquitaine",
    "limousin": "nouvelle aquitaine",
    # -> occitanie
    "midi pyrenees": "occitanie",
    "languedoc roussillon": "occitanie",
    # -> bourgogne franche comte
    "burgundy": "bourgogne franche comte",
    "franche comte": "bourgogne franche comte",
    # -> grand est
    "alsace": "grand est",
    "lorraine": "grand est",
    "champagne ardenne": "grand est",
    # -> normandie
    "haute normandie": "normandie",
    "basse normandie": "normandie",
    # -> hauts de france
    "nord pas de calais": "hauts de france",
    "picardy": "hauts de france",
    "picardie": "hauts de france",
    # -> centre val de loire
    "centre": "centre val de loire",
    # -> bretagne
    "brittany": "bretagne",
    # -> auvergne rhone alpes
    "rhone alpes": "auvergne rhone alpes",
    "auvergne": "auvergne rhone alpes",
    # -> provence alpes cote d azur
    "paca": "provence alpes cote d azur",
    "provence alpes": "provence alpes cote d azur",
    # -> corse
    "corsica": "corse",
    # Overseas -> exclude (from analytics)
    "guadeloupe": None,
    "martinique": None,
    "french guiana": None,
    "guyane": None,
    "reunion": None,
    "mayotte": None,
}

# Canonical names of the regions kept by the pipeline.
NEW_REGIONS = [
    "auvergne rhone alpes",
    "bourgogne franche comte",
    "bretagne",
    "centre val de loire",
    "grand est",
    "hauts de france",
    "ile de france",
    "normandie",
    "nouvelle aquitaine",
    "occitanie",
    "pays de la loire",
    "provence alpes cote d azur",
    "corse",
]

def map_region_to_new(name):
    """Map a region label to its canonical region name.

    Returns None for a None input and for labels that OLD_TO_NEW maps to None.
    Labels listed in OLD_TO_NEW are translated; labels already in NEW_REGIONS
    are returned unchanged; any other label containing "rhone alpe" becomes
    "auvergne rhone alpes". Everything else is returned in canonicalized form.
    """
    if name is None:
        return None
    key = canonicalize(name)
    if key in OLD_TO_NEW:
        return OLD_TO_NEW[key]
    if key not in NEW_REGIONS and "rhone alpe" in key:
        return "auvergne rhone alpes"
    return key

def force_display_name_for_elec(raw):
    """Return the display name for a raw region label.

    Labels that canonicalize to "paca" or "provence alpes cote d azur" are
    forced to "Provence-Alpes-Cote d'Azur"; every other label is returned as
    ``str(raw)``. None stays None.
    """
    if raw is None:
        return None
    paca_keys = ("paca", "provence alpes cote d azur")
    return "Provence-Alpes-Cote d'Azur" if canonicalize(raw) in paca_keys else str(raw)

# Parse cities function from analytics code
def parse_cities(val):
    """Turn a serialized collection of city names into a list.

    The value is first read as a Python literal; if that yields a list or a
    tuple, its items are returned as a list. Anything else (including text
    that is not a valid literal) is split on commas, with surrounding
    whitespace trimmed and empty pieces dropped. ``None`` gives ``[]``.
    """
    import ast  # kept local so this cell does not depend on the import cell

    if val is None:
        return []
    text = str(val)
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are treated
        # as "not a literal"; anything else (e.g. KeyboardInterrupt) propagates.
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # Fallback: split by comma
    return [piece.strip() for piece in text.split(",") if piece.strip()]

# Register UDFs: wrap each helper so it can be applied to DataFrame columns.
# String-returning helpers
strip_accents_udf = udf(strip_accents, returnType=StringType())
canonicalize_udf = udf(canonicalize, returnType=StringType())
normalize_city_udf = udf(normalize_city, returnType=StringType())
map_region_to_new_udf = udf(map_region_to_new, returnType=StringType())
force_display_name_udf = udf(force_display_name_for_elec, returnType=StringType())
# List-returning helper
parse_cities_udf = udf(parse_cities, returnType=ArrayType(StringType()))




In [3]:
# ============================================
# Read Source Data
# ============================================

# Read region-city mapping data from catalog
catalog_read_options = {
    "useCatalogSchema": True,
    "useSparkDataSource": True,
    "header": True,
}
mapping_df_from_catalog = glueContext.create_data_frame_from_catalog(
    glue_database,
    mapping_table_name,
    additional_options=catalog_read_options,
    transformation_ctx="mapping_df_from_catalog",
)

# Inspect what was loaded: schema, size and a few rows.
print("Original mapping data schema:")
mapping_df_from_catalog.printSchema()
original_row_count = mapping_df_from_catalog.count()
print(f"Original mapping row count: {original_row_count}")

print("Sample of original mapping data:")
mapping_df_from_catalog.show(10, truncate=False)

Original mapping data schema:
root
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- scraped_at: timestamp (nullable = true)
 |-- source_url: string (nullable = true)
 |-- region_clean: string (nullable = true)
 |-- city_count: long (nullable = true)
 |-- city_first_letter: string (nullable = true)

Original mapping row count: 336
Sample of original mapping data:
+---------+-----------+--------------------------+---------------------------------------------------------------------------+------------+----------+-----------------+
|region   |city       |scraped_at                |source_url                                                                 |region_clean|city_count|city_first_letter|
+---------+-----------+--------------------------+---------------------------------------------------------------------------+------------+----------+-----------------+
|Alsace   |Colmar     |2025-11-16 21:09:49.385497|https://www.britannica.com/topic/list-of-cities-

In [5]:
# ============================================
# Transform Region-City Mapping Data (exactly like analytics)
# ============================================


print("Starting region mapping transformation...")

# Step 1: Use the actual schema - each row is already one city-region mapping
mapping_df = mapping_df_from_catalog

print("Actual mapping data structure:")
mapping_df.show(10, truncate=False)

# The source has a 'city' column (not 'cities'), so no explode is needed:
# keep one distinct (region, city) pair per row.
mapping_pairs = mapping_df.select(
    col("region").alias("region_raw"),
    col("city").alias("city_raw"),
).distinct()

print("After selecting region and city:")
mapping_pairs.show(20, truncate=False)

# Step 2: Apply region normalization (exactly like analytics) and keep only
# rows whose canonical region is one of NEW_REGIONS (like elec_regions_new).
# Labels mapped to None are dropped by the same filter.
mapping_in_scope = (
    mapping_pairs
    .withColumn("region_canon", map_region_to_new_udf(col("region_raw")))
    .filter(col("region_canon").isin(NEW_REGIONS))
)

print("After region normalization and filtering:")
mapping_in_scope.show(20, truncate=False)

# Step 3: Create display names (like canon_to_disp in analytics)
# Step 4: Normalize city names (like city_norm in analytics)
# Each step runs exactly once; `mapping_exploded` is the name later cells use.
mapping_exploded = (
    mapping_in_scope
    .withColumn("region_clean", force_display_name_udf(col("region_raw")))
    .withColumn("city_norm", normalize_city_udf(col("city_raw")))
)

print("Final transformed mapping data:")
mapping_exploded.show(30, truncate=False)

# Step 5: Canonical-name -> display-name lookup (like canon_to_disp)
# Step 6: Normalized-city -> display-region lookup (like city2region in analytics)
# Both are the distinct pairs of two columns of the transformed mapping.
canon_to_disp = mapping_exploded.select("region_canon", "region_clean").dropDuplicates()
city_to_region = mapping_exploded.select("city_norm", "region_clean").dropDuplicates()

print("Canonical to display name mapping:")
canon_to_disp.show(truncate=False)

print("City to region mapping:")
city_to_region.show(30, truncate=False)

# Step 7: Add region capitals coordinates (like REGION_CAPITALS in analytics)
# Display name of each region -> (latitude, longitude) of its capital.
REGION_CAPITALS = {
    "Auvergne-Rhône-Alpes": (45.7640, 4.8357),
    "Bourgogne-Franche-Comté": (47.3220, 5.0415),
    "Bretagne": (48.1173, -1.6778),
    "Centre-Val de Loire": (47.9029, 1.9093),
    "Grand-Est": (48.5734, 7.7521),
    "Hauts-de-France": (50.6292, 3.0573),
    "Île-de-France": (48.8566, 2.3522),
    "Normandie": (49.4432, 1.0993),
    "Nouvelle-Aquitaine": (44.8378, -0.5792),
    "Occitanie": (43.6047, 1.4442),
    "Pays-de-la-Loire": (47.2184, -1.5536),
    "Provence-Alpes-Cote d'Azur": (43.2965, 5.3698),
    "Corse": (41.9192, 8.7386),
}

# Key every capital by its canonical region name. `region_clean` in the mapping
# holds the raw source label (e.g. "Alsace"), which does not equal the display
# names above (e.g. "Grand-Est"), so joining on `region_clean` would leave
# cap_lat / cap_lon null for those rows. `region_canon` is the shared key.
capitals_data = [
    (display_name, canonicalize(display_name), lat, lon)
    for display_name, (lat, lon) in REGION_CAPITALS.items()
]

# Fail early if a capital's key cannot match any region kept by the pipeline.
unmatched_capitals = sorted({canon for _, canon, _, _ in capitals_data} - set(NEW_REGIONS))
if unmatched_capitals:
    raise ValueError(f"REGION_CAPITALS keys not found in NEW_REGIONS: {unmatched_capitals}")

# Create capitals DataFrame
capitals_df = spark.createDataFrame(
    capitals_data,
    ["region_display", "region_canon", "cap_lat", "cap_lon"],
)

print("Region capitals:")
capitals_df.show(truncate=False)

# Step 8: Join capitals with mapping data on the canonical region name.
# Only the key and the coordinates are joined in, so the mapping keeps its
# own `region_clean` column and gains exactly cap_lat / cap_lon.
mapping_final = mapping_exploded.join(
    capitals_df.select("region_canon", "cap_lat", "cap_lon"),
    on="region_canon",
    how="left",
)

print("Final mapping data with capitals:")
mapping_final.show(30, truncate=False)

# Step 9: Create summary statistics
# Per (region_clean, region_canon) group: number of non-null city_raw values
# and the first capital coordinates seen in the group.
summary_aggregations = [
    count("city_raw").alias("city_count"),
    first("cap_lat").alias("cap_lat"),
    first("cap_lon").alias("cap_lon"),
]
region_summary = (
    mapping_final
    .groupBy("region_clean", "region_canon")
    .agg(*summary_aggregations)
)

print("Region summary:")
region_summary.show(truncate=False)

Starting region mapping transformation...
Actual mapping data structure:
+---------+-----------+--------------------------+---------------------------------------------------------------------------+------------+----------+-----------------+
|region   |city       |scraped_at                |source_url                                                                 |region_clean|city_count|city_first_letter|
+---------+-----------+--------------------------+---------------------------------------------------------------------------+------------+----------+-----------------+
|Alsace   |Colmar     |2025-10-14 12:40:33.388156|https://www.britannica.com/topic/list-of-cities-and-towns-in-France-2039172|Alsace      |5         |C                |
|Alsace   |Haguenau   |2025-10-14 12:40:33.388156|https://www.britannica.com/topic/list-of-cities-and-towns-in-France-2039172|Alsace      |5         |H                |
|Alsace   |Mulhouse   |2025-10-14 12:40:33.388156|https://www.britannica.com/topic

In [None]:
# ============================================
# Write to Silver Bucket - Only essential data for analysis
# ============================================

# Write the main mapping table - only essential columns
silver_columns = [
    "city_norm",     # Normalized city name for joins
    "city_raw",      # Original city name for reference
    "region_clean",  # Clean region name (primary key for joins)
    "region_canon",  # Canonical region name
    "cap_lat",       # Capital latitude for geo analysis
    "cap_lon",       # Capital longitude for geo analysis
]
mapping_essential = mapping_final.select(*silver_columns)

# Glue sinks take a DynamicFrame, so convert the Spark DataFrame first.
mapping_dynamic_frame = DynamicFrame.fromDF(
    mapping_essential,
    glueContext,
    "region_mapping_transformed",
)

# S3 sink that also creates/updates the catalog table on write.
silver_output_path = f"s3://{source_bucket}/{processed_folder_name}/region_transformed/"
sink1 = glueContext.getSink(
    path=silver_output_path,
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],  # You could partition by region_clean if needed
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="sink1",
)
sink1.setCatalogInfo(
    catalogDatabase=glue_database,
    catalogTableName="silver_region_mapping",
)
sink1.setFormat("glueparquet")
sink1.writeFrame(mapping_dynamic_frame)

print(f"Successfully written essential region mapping data to:")
print(f"- s3://{source_bucket}/{processed_folder_name}/region_transformed/")
print(f"Table: silver_region_mapping")
print(f"Essential columns: city_norm, city_raw, region_clean, region_canon, cap_lat, cap_lon")

# Optional: Write the optimized city-region lookup table if needed for frequent joins
# But for simplicity, we'll just use the main table since it's not too large

# Commit the job
job.commit()