# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


In [2]:
%timeout 20

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current timeout is None minutes.
timeout has been set to 20 minutes.


In [4]:
%%configure
{
    "--job-bookmark-option": "job-bookmark-enable"
}

The following configurations have been updated: {'--job-bookmark-option': 'job-bookmark-enable'}


In [None]:
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.job import Job
import pytz
import re
import unicodedata

# Initialize all the variables needed
source_bucket = "data-engineering-project-8433-3658-8863"
folder_name = "bronze_data"
processed_folder_name = "silver_data"

# Set up catalog parameters
glue_database = "data-engineering-project-glue-database"
weather_table_name = "raw_data_weather_data"
mapping_table_name = "raw_data_french_region_city_mapping_20251116_210949_parquet"

# Set up the spark contexts, glue contexts and initialize job
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

try:
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    JOB_NAME = args['JOB_NAME']
except:
    JOB_NAME = "notebook-job-weather-transform-final"

job.init(JOB_NAME, args if 'args' in locals() else {})

Trying to create a Glue session for the kernel.
Session Type: glueetl
Timeout: 20
Session ID: 9b6e4e8d-3387-44a8-8612-5fca732ba8a3
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--job-bookmark-option job-bookmark-enable
Waiting for session 9b6e4e8d-3387-44a8-8612-5fca732ba8a3 to get into ready status...


In [2]:
# ============================================
# Helper Functions for Data Transformation
# ============================================

def strip_accents(s):
    if s is None:
        return ""
    return ''.join(c for c in unicodedata.normalize('NFKD', str(s)) 
                  if not unicodedata.combining(c))

def canonicalize(txt):
    if txt is None or str(txt).lower() == 'nan':
        return ""
    s = strip_accents(str(txt).lower().strip())
    for ch in ["'", "’", "-", "_", ".", ",", "(", ")", "/", "\\"]:
        s = s.replace(ch, " ")
    return " ".join(s.split())

def normalize_city(s):
    if s is None:
        return ""
    s = str(s).lower()
    s = strip_accents(s)
    s = re.sub(r"[^a-z\s\-]", " ", s)
    s = " ".join(s.split())
    s = s.replace(" ste ", " sainte ").replace(" st ", " saint ")
    if s.startswith("ste "): 
        s = "sainte " + s[4:]
    if s.startswith("st "):  
        s = "saint " + s[3:]
    return s

# Region mapping configuration
OLD_TO_NEW = {
    "aquitaine": "nouvelle aquitaine", 
    "poitou charentes": "nouvelle aquitaine", 
    "limousin": "nouvelle aquitaine",
    "midi pyrenees": "occitanie", 
    "languedoc roussillon": "occitanie",
    "burgundy": "bourgogne franche comte", 
    "franche comte": "bourgogne franche comte",
    "alsace": "grand est", 
    "lorraine": "grand est", 
    "champagne ardenne": "grand est",
    "haute normandie": "normandie", 
    "basse normandie": "normandie",
    "nord pas de calais": "hauts de france", 
    "picardy": "hauts de france", 
    "picardie": "hauts de france",
    "centre": "centre val de loire", 
    "brittany": "bretagne",
    "rhone alpes": "auvergne rhone alpes", 
    "auvergne": "auvergne rhone alpes",
    "paca": "provence alpes cote d azur", 
    "provence alpes": "provence alpes cote d azur",
    "corsica": "corse"
}

NEW_REGIONS = [
    "auvergne rhone alpes", "bourgogne franche comte", "bretagne", "centre val de loire",
    "grand est", "hauts de france", "ile de france", "normandie", "nouvelle aquitaine",
    "occitanie", "pays de la loire", "provence alpes cote d azur", "corse"
]

def map_region_to_new(name):
    if name is None:
        return None
    c = canonicalize(name)
    if c in OLD_TO_NEW:
        return OLD_TO_NEW[c]
    if c in NEW_REGIONS:
        return c
    if "rhone alpe" in c: 
        return "auvergne rhone alpes"
    return c

def force_display_name_for_elec(raw):
    if raw is None:
        return None
    if canonicalize(raw) in ("paca", "provence alpes cote d azur"):
        return "Provence-Alpes-Cote d'Azur"
    return str(raw)

# Register UDFs
strip_accents_udf = udf(strip_accents, StringType())
canonicalize_udf = udf(canonicalize, StringType())
normalize_city_udf = udf(normalize_city, StringType())
map_region_to_new_udf = udf(map_region_to_new, StringType())
force_display_name_udf = udf(force_display_name_for_elec, StringType())




In [3]:
# ============================================
# Read Source Data
# ============================================

# Read weather data from catalog
weather_df_from_catalog = glueContext.create_data_frame_from_catalog(
    glue_database,
    weather_table_name,
    additional_options={"useCatalogSchema": True, "useSparkDataSource": True, "header": True},
    transformation_ctx="weather_df_from_catalog"
)

# Read region-city mapping data from catalog
mapping_df_from_catalog = glueContext.create_data_frame_from_catalog(
    glue_database,
    mapping_table_name,
    additional_options={"useCatalogSchema": True, "useSparkDataSource": True, "header": True},
    transformation_ctx="mapping_df_from_catalog"
)

print("=== SCHEMA INFORMATION ===")
print("Weather data schema:")
weather_df_from_catalog.printSchema()
print("Weather columns:", weather_df_from_catalog.columns)
print("Sample weather data:")
weather_df_from_catalog.show(5, truncate=False)

print("Mapping data schema:")
mapping_df_from_catalog.printSchema()
print("Sample mapping data:")
mapping_df_from_catalog.show(5, truncate=False)

=== SCHEMA INFORMATION ===
Weather data schema:
root
 |-- city_name: string (nullable = true)
 |-- geoname_id: string (nullable = true)
 |-- date: long (nullable = true)
 |-- temperature_2m: float (nullable = true)
 |-- temperature_2m_previous_day1: float (nullable = true)
 |-- temperature_2m_previous_day2: float (nullable = true)
 |-- temperature_2m_previous_day3: float (nullable = true)
 |-- temperature_2m_previous_day4: float (nullable = true)
 |-- temperature_2m_previous_day5: float (nullable = true)
 |-- country_code: string (nullable = true)
 |-- population: long (nullable = true)
 |-- city_timezone: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)

Weather columns: ['city_name', 'geoname_id', 'date', 'temperature_2m', 'temperature_2m_previous_day1', 'temperature_2m_previous_day2', 'temperature_2m_previous_day3', 'temperature_2m_previous_day4', 'temperature_2m_previous_day5', 'country_code', 'population', 'city_timezone', 'lat', 'lon'

In [4]:
# ============================================
# Transform Region-City Mapping Data
# ============================================

mapping_df = mapping_df_from_catalog

mapping_exploded = mapping_df.select(
    col("region").alias("region_raw"),
    col("city").alias("city_raw")
).distinct()

mapping_exploded = mapping_exploded.withColumn(
    "city_norm", 
    normalize_city_udf(col("city_raw"))
).withColumn(
    "region_canon",
    map_region_to_new_udf(col("region_raw"))
).withColumn(
    "region_clean",
    force_display_name_udf(col("region_raw"))
).filter(
    col("region_canon").isin(NEW_REGIONS)
)

print("Final transformed mapping data:")
mapping_exploded.show(20, truncate=False)

Final transformed mapping data:
+--------------------------+-----------+-----------+--------------------------+--------------------------+
|region_raw                |city_raw   |city_norm  |region_canon              |region_clean              |
+--------------------------+-----------+-----------+--------------------------+--------------------------+
|Aquitaine                 |Pau        |pau        |nouvelle aquitaine        |Aquitaine                 |
|Auvergne                  |Riom       |riom       |auvergne rhone alpes      |Auvergne                  |
|Basse-Normandie           |Alençon    |alencon    |normandie                 |Basse-Normandie           |
|Basse-Normandie           |Caen       |caen       |normandie                 |Basse-Normandie           |
|Basse-Normandie           |Coutances  |coutances  |normandie                 |Basse-Normandie           |
|Basse-Normandie           |Ouistreham |ouistreham |normandie                 |Basse-Normandie           |
|Burg

In [5]:
# ============================================
# Transform and Join Weather Data with Regions
# ============================================

weather_df = weather_df_from_catalog

print("Starting weather data transformation...")

# Normalize city names in weather data
weather_df = weather_df.withColumn(
    "city_norm",
    normalize_city_udf(col("city_name"))
)

print("After city normalization:")
weather_df.select("city_name", "city_norm").show(10, truncate=False)

# Handle numeric columns - only temperature columns are available
available_numeric_columns = []
possible_numeric_columns = [
    'temperature_2m', 'temperature_2m_previous_day1', 'temperature_2m_previous_day2', 
    'temperature_2m_previous_day3', 'temperature_2m_previous_day4', 'temperature_2m_previous_day5'
]

for col_name in possible_numeric_columns:
    if col_name in weather_df.columns:
        available_numeric_columns.append(col_name)
        weather_df = weather_df.withColumn(
            col_name,
            when(col(col_name).isNull(), lit(None))
            .otherwise(
                when(col(col_name).cast("double").isNull(), lit(None))
                .otherwise(col(col_name).cast("double"))
            )
        )

print(f"Available numeric columns: {available_numeric_columns}")

# Join weather data with region mapping
weather_with_regions = weather_df.join(
    mapping_exploded.select("city_norm", "region_clean", "region_canon"),
    on="city_norm",
    how="left"
)

print(f"Weather data after region join: {weather_with_regions.count()} rows")

# Check mapping results
mapped_count = weather_with_regions.filter(col("region_clean").isNotNull()).count()
unmapped_count = weather_with_regions.filter(col("region_clean").isNull()).count()
print(f"Mapped cities: {mapped_count}, Unmapped cities: {unmapped_count}")

# Show some unmapped cities for debugging
if unmapped_count > 0:
    print("Sample of unmapped cities:")
    weather_with_regions.filter(col("region_clean").isNull()).select("city_name", "city_norm").distinct().show(10, truncate=False)

# Filter to only keep mapped cities
weather_mapped = weather_with_regions.filter(col("region_clean").isNotNull())
print(f"Weather data after filtering mapped cities: {weather_mapped.count()} rows")

# Show which regions we have
print("Regions in mapped data:")
weather_mapped.select("region_clean").distinct().show(truncate=False)

Starting weather data transformation...
After city normalization:
+---------------+---------------+
|city_name      |city_norm      |
+---------------+---------------+
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
|Marne La Vallée|marne la vallee|
+---------------+---------------+
only showing top 10 rows

Available numeric columns: ['temperature_2m', 'temperature_2m_previous_day1', 'temperature_2m_previous_day2', 'temperature_2m_previous_day3', 'temperature_2m_previous_day4', 'temperature_2m_previous_day5']
Weather data after region join: 498960 rows
Mapped cities: 439560, Unmapped cities: 59400
Sample of unmapped cities:
+-------------------------+-------------------------+
|city_name                |city_norm                |
+

In [6]:
# ============================================
# Timestamp Processing
# ============================================

print("Converting timestamp from nanoseconds...")

weather_mapped = weather_mapped.withColumn(
    "ts_utc",
    to_timestamp(from_unixtime(col("date").cast("double") / 1e9))
).withColumn(
    "ts_paris", 
    from_utc_timestamp(col("ts_utc"), "Europe/Paris")
)

print("After timestamp conversion:")
weather_mapped.select("date", "ts_utc", "ts_paris").show(10, truncate=False)

Converting timestamp from nanoseconds...
After timestamp conversion:
+-------------------+-------------------+-------------------+
|date               |ts_utc             |ts_paris           |
+-------------------+-------------------+-------------------+
|1753833600000000000|2025-07-30 00:00:00|2025-07-30 02:00:00|
|1753837200000000000|2025-07-30 01:00:00|2025-07-30 03:00:00|
|1753840800000000000|2025-07-30 02:00:00|2025-07-30 04:00:00|
|1753844400000000000|2025-07-30 03:00:00|2025-07-30 05:00:00|
|1753848000000000000|2025-07-30 04:00:00|2025-07-30 06:00:00|
|1753851600000000000|2025-07-30 05:00:00|2025-07-30 07:00:00|
|1753855200000000000|2025-07-30 06:00:00|2025-07-30 08:00:00|
|1753858800000000000|2025-07-30 07:00:00|2025-07-30 09:00:00|
|1753862400000000000|2025-07-30 08:00:00|2025-07-30 10:00:00|
|1753866000000000000|2025-07-30 09:00:00|2025-07-30 11:00:00|
+-------------------+-------------------+-------------------+
only showing top 10 rows


In [7]:
# ============================================
# Feature Engineering
# ============================================

print("Creating weather features...")

# Use temperature_2m column
weather_mapped = weather_mapped.withColumn("temp_c", col("temperature_2m"))

# Calendar features
weather_mapped = weather_mapped.withColumn(
    "date_paris",
    to_date(col("ts_paris"))
).withColumn(
    "hour",
    hour(col("ts_paris"))
).withColumn(
    "dow",
    dayofweek(col("ts_paris"))
).withColumn(
    "month",
    month(col("ts_paris"))
).withColumn(
    "doy",
    dayofyear(col("ts_paris"))
).withColumn(
    "is_weekend",
    when((col("dow") == 1) | (col("dow") == 7), 1).otherwise(0)
)

# Cyclical time features
weather_mapped = weather_mapped.withColumn(
    "hour_sin",
    sin(2 * 3.14159 * col("hour") / 24.0)
).withColumn(
    "hour_cos", 
    cos(2 * 3.14159 * col("hour") / 24.0)
).withColumn(
    "month_sin",
    sin(2 * 3.14159 * col("month") / 12.0)
).withColumn(
    "month_cos",
    cos(2 * 3.14159 * col("month") / 12.0)
)

# Seasonal features
weather_mapped = weather_mapped.withColumn(
    "season",
    when((col("month") >= 3) & (col("month") <= 5), "spring")
    .when((col("month") >= 6) & (col("month") <= 8), "summer")
    .when((col("month") >= 9) & (col("month") <= 11), "autumn")
    .otherwise("winter")
).withColumn(
    "is_daylight",
    when((col("hour") >= 6) & (col("hour") <= 20), 1).otherwise(0)
)

# Weather severity based on temperature
weather_mapped = weather_mapped.withColumn(
    "weather_severity",
    when(col("temp_c") < 0, "cold")
    .when(col("temp_c") > 30, "hot") 
    .otherwise("moderate")
)

print("Final weather_mapped schema:")
weather_mapped.printSchema()


Creating weather features...
Final weather_mapped schema:
root
 |-- city_norm: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- geoname_id: string (nullable = true)
 |-- date: long (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- temperature_2m_previous_day1: double (nullable = true)
 |-- temperature_2m_previous_day2: double (nullable = true)
 |-- temperature_2m_previous_day3: double (nullable = true)
 |-- temperature_2m_previous_day4: double (nullable = true)
 |-- temperature_2m_previous_day5: double (nullable = true)
 |-- country_code: string (nullable = true)
 |-- population: long (nullable = true)
 |-- city_timezone: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- region_clean: string (nullable = true)
 |-- region_canon: string (nullable = true)
 |-- ts_utc: timestamp (nullable = true)
 |-- ts_paris: timestamp (nullable = true)
 |-- temp_c: double (nullable = true)
 |-- date_paris: date (n

In [8]:
# ============================================
# Regional Aggregation
# ============================================

print("Starting regional aggregation...")

# Aggregation based on available columns
agg_exprs = []

# Average temperature
if 'temp_c' in weather_mapped.columns:
    agg_exprs.append(avg("temp_c").alias("temp_c"))

# Also include the previous day temperatures if available
for col_name in available_numeric_columns:
    if col_name != 'temperature_2m':  # temperature_2m already handled via temp_c
        agg_exprs.append(avg(col_name).alias(col_name))

# Add other features
agg_exprs.extend([
    first("season").alias("season"),
    first("is_daylight").alias("is_daylight"),
    first("hour").alias("hour"),
    first("dow").alias("dow"),
    first("month").alias("month"),
    first("is_weekend").alias("is_weekend"),
    first("weather_severity").alias("weather_severity")
])

weather_aggregated = weather_mapped.groupBy("region_clean", "ts_utc").agg(*agg_exprs)

print(f"After aggregation: {weather_aggregated.count()} rows")

# Add back cyclical features after aggregation
weather_aggregated = weather_aggregated.withColumn(
    "hour_sin",
    sin(2 * 3.14159 * col("hour") / 24.0)
).withColumn(
    "hour_cos", 
    cos(2 * 3.14159 * col("hour") / 24.0)
).withColumn(
    "month_sin",
    sin(2 * 3.14159 * col("month") / 12.0)
).withColumn(
    "month_cos",
    cos(2 * 3.14159 * col("month") / 12.0)
)

# Final data quality checks
weather_final = weather_aggregated.filter(
    (col("temp_c") > -50) & (col("temp_c") < 60)
)

print(f"Final transformed row count: {weather_final.count()}")
print("Sample of final weather data:")
weather_final.show(10)

Starting regional aggregation...
After aggregation: 50880 rows
Final transformed row count: 50880
Sample of final weather data:
+------------+-------------------+------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+------+-----------+----+---+-----+----------+----------------+-------------------+-------------------+--------------------+-------------------+
|region_clean|             ts_utc|            temp_c|temperature_2m_previous_day1|temperature_2m_previous_day2|temperature_2m_previous_day3|temperature_2m_previous_day4|temperature_2m_previous_day5|season|is_daylight|hour|dow|month|is_weekend|weather_severity|           hour_sin|           hour_cos|           month_sin|          month_cos|
+------------+-------------------+------------------+----------------------------+----------------------------+----------------------------+----------------------------+-------------------

In [None]:
# ============================================
# Write to Silver Bucket
# ============================================

weather_dynamic_frame = DynamicFrame.fromDF(
    weather_final, 
    glueContext, 
    "weather_transformed"
)

sink = glueContext.getSink(
    path=f"s3://{source_bucket}/{processed_folder_name}/weather_transformed/",
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="sink",
)

sink.setCatalogInfo(
    catalogDatabase=glue_database,
    catalogTableName="silver_weather_data"
)

sink.setFormat("glueparquet")
sink.writeFrame(weather_dynamic_frame)

print(f"SUCCESS: Written transformed weather data to silver layer")

# Commit the job
job.commit()