# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to set the session timeout. To see available notebook commands ("magics"), run `%help` in a new cell.


In [2]:
%timeout 20

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current timeout is None minutes.
timeout has been set to 20 minutes.


In [4]:
%%configure
{
    "--job-bookmark-option": "job-bookmark-enable"
}

The following configurations have been updated: {'--job-bookmark-option': 'job-bookmark-enable'}


In [1]:
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.job import Job
import re
import unicodedata


# Notebook-wide settings: S3 bucket/prefix names used by the cells below.
source_bucket = "data-engineering-project-8433-3658-8863"
folder_name = "bronze_data"  # not referenced by the other visible cells
processed_folder_name = "silver_data"  # prefix the transformed data is written under

# Set up catalog parameters
glue_database = "data-engineering-project-glue-database"
holiday_table_name = "raw_data_holiday_data"

# Set up the Spark context, the Glue context and the Job wrapper
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

# Notebook fallback: when no JOB_NAME argument can be resolved from sys.argv,
# use a fixed job name and an empty argument dict instead of failing.
# `except Exception` (not a bare `except:`) keeps KeyboardInterrupt and
# SystemExit from being silently swallowed.
try:
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    JOB_NAME = args['JOB_NAME']
except Exception:
    args = {}
    JOB_NAME = "notebook-job-holiday-transform"

job.init(JOB_NAME, args)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Timeout: 20
Session ID: 2a89d74b-3bb3-42f7-80d7-eab62df935d0
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--job-bookmark-option job-bookmark-enable
Waiting for session 2a89d74b-3bb3-42f7-80d7-eab62df935d0 to get into ready status...
Session 2a89d74b-3bb3-42f7-80d7-eab62df935d0 has been created.



In [5]:
# ============================================
# Read Source Data
# ============================================

# Options forwarded to the catalog reader.
catalog_read_options = {
    "useCatalogSchema": True,
    "useSparkDataSource": True,
    "header": True,
}

# Load the holiday table from the Glue Data Catalog as a Spark DataFrame.
holiday_df_from_catalog = glueContext.create_data_frame_from_catalog(
    glue_database,
    holiday_table_name,
    additional_options=catalog_read_options,
    transformation_ctx="holiday_df_from_catalog",
)

# Quick inspection of what was loaded: schema, row count, first rows.
print("Original holiday data schema:")
holiday_df_from_catalog.printSchema()
original_row_count = holiday_df_from_catalog.count()
print(f"Original holiday row count: {original_row_count}")

print("Sample of original holiday data:")
holiday_df_from_catalog.show(10, truncate=False)

Original holiday data schema:
root
 |-- date: string (nullable = true)
 |-- country: string (nullable = true)
 |-- is_holiday: boolean (nullable = true)
 |-- holiday_name: string (nullable = true)
 |-- holiday_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- data_retrieved_at: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

Original holiday row count: 730
Sample of original holiday data:
+----------+-------+----------+--------------+------------+--------+--------------------------+----+-----+---+
|date      |country|is_holiday|holiday_name  |holiday_type|location|data_retrieved_at         |year|month|day|
+----------+-------+----------+--------------+------------+--------+--------------------------+----+-----+---+
|2025-01-01|France |true      |New Year's Day|National    |France  |2025-10-13 17:00:14.078412|2025|1    |1  |
|2025-01-02|France |false     |              |

In [8]:
# ============================================
# Transform Holiday Data
# ============================================

print("Starting holiday data transformation...")

# Step 1: keep only rows for France that are flagged as holidays
is_france_row = col("country") == "France"
is_holiday_row = col("is_holiday") == True
holiday_df = holiday_df_from_catalog.where(is_france_row & is_holiday_row)

french_holiday_count = holiday_df.count()
print(f"After filtering for France and holidays: {french_holiday_count} rows")

# Step 2: parse the string `date` column into a date and two timestamps
iso_day_pattern = "yyyy-MM-dd"
holiday_df = holiday_df.withColumn("date_paris", to_date(col("date"), iso_day_pattern))
holiday_df = holiday_df.withColumn("ts_utc", to_timestamp(col("date"), iso_day_pattern))
# NOTE(review): ts_utc is the parsed date at midnight, and ts_paris is that
# value shifted from UTC to Europe/Paris. If `date` is already a Paris-local
# calendar date, this direction may be unintended - confirm with consumers.
holiday_df = holiday_df.withColumn(
    "ts_paris", from_utc_timestamp(col("ts_utc"), "Europe/Paris")
)

# Step 3: calendar features, all derived from date_paris.
# `year`, `month` and `day` overwrite the columns of the same name from the source.
calendar_feature_extractors = [
    ("year", year),
    ("month", month),
    ("day", dayofmonth),
    ("dow", dayofweek),    # Day of week (1=Sunday, 7=Saturday)
    ("doy", dayofyear),    # Day of year
    ("week", weekofyear),
    ("quarter", quarter),
]
for feature_name, extract_feature in calendar_feature_extractors:
    holiday_df = holiday_df.withColumn(feature_name, extract_feature(col("date_paris")))

# Step 4: Create holiday categories and flags
# Map each raw holiday_type onto a coarse category; the first matching rule
# wins and anything unmatched becomes "other".
category_rules = [
    ("national", ["National", "Federal", "Public"]),
    ("regional", ["Regional", "Local"]),
    ("observance", ["Observance", "Season"]),
]

category_expr = None
for category_label, raw_types in category_rules:
    type_matches = col("holiday_type").isin(raw_types)
    if category_expr is None:
        category_expr = when(type_matches, category_label)
    else:
        category_expr = category_expr.when(type_matches, category_label)

holiday_df = holiday_df.withColumn("holiday_category", category_expr.otherwise("other"))

# One 0/1 integer flag per category of interest
category_flags = [
    ("is_national_holiday", "national"),
    ("is_regional_holiday", "regional"),
    ("is_observance", "observance"),
]
for flag_column, flagged_category in category_flags:
    holiday_df = holiday_df.withColumn(
        flag_column,
        when(col("holiday_category") == flagged_category, 1).otherwise(0),
    )

# Step 5: Create bridge day indicators
# A bridge day is when a holiday creates a long weekend.
#
# `dow` comes from Spark's dayofweek(), which numbers days 1=Sunday ... 7=Saturday,
# so Tuesday is 3 and Thursday is 5. (The previous values 2 and 4 actually
# selected Monday and Wednesday.)
tuesday_dow = 3
thursday_dow = 5

# Un-partitioned window: every row is ordered by date in a single partition.
window_spec = Window.orderBy("date_paris")

# Date of the previous / next ROW in date order.
# NOTE(review): holiday_df was filtered to holiday rows only, so these are the
# neighbouring holidays, not the neighbouring calendar days. The datediff == 1
# checks below therefore only pass when two holidays fall on consecutive days -
# confirm this is the intended bridge-day definition.
holiday_df = holiday_df.withColumn(
    "prev_day",
    lag("date_paris", 1).over(window_spec)
).withColumn(
    "next_day",
    lead("date_paris", 1).over(window_spec)
)

# Calculate if it's a bridge day (Thursday holiday -> Friday bridge, etc.)
holiday_df = holiday_df.withColumn(
    "is_bridge_day",
    when(
        (col("dow") == thursday_dow) &  # Thursday holiday
        (datediff(col("next_day"), col("date_paris")) == 1) &  # Next row is the Friday
        (col("next_day").isNotNull()), 1)  # And there is a next row
    .when(
        (col("dow") == tuesday_dow) &  # Tuesday holiday
        (datediff(col("date_paris"), col("prev_day")) == 1) &  # Previous row is the Monday
        (col("prev_day").isNotNull()), 1)  # And there is a previous row
    .otherwise(0)
)

# Step 6: Create holiday season indicators
# Rules are evaluated top to bottom; the first match labels the row.
month_col = col("month")
day_col = col("day")

season_expr = (
    when((month_col == 12) & day_col.between(24, 31), "christmas_week")
    .when((month_col == 1) & (day_col == 1), "new_year")
    .when((month_col == 12) & day_col.between(15, 23), "pre_christmas")
    .when(month_col.isin(7, 8), "summer_holidays")
    .when((month_col == 4) & day_col.between(1, 15), "spring_holidays")
    .otherwise("regular_holiday")
)
holiday_df = holiday_df.withColumn("holiday_season", season_expr)

# Step 7: Create weekend proximity flags
# Flag is 1 when the holiday falls on Sunday (dow 1) or Saturday (dow 7),
# or when it was marked as a bridge day above.
falls_on_weekend = col("dow").isin(1, 7)
marked_as_bridge = col("is_bridge_day") == 1
holiday_df = holiday_df.withColumn(
    "is_weekend_adjacent",
    when(falls_on_weekend | marked_as_bridge, 1).otherwise(0),
)

# Step 8: Add Easter-related calculations (like in your analytics code)
def _easter_date(year):
    """Calculate Easter date for a given year (from your analytics code)"""
    a = year % 19
    b = year // 100
    c = year % 100
    d = b // 4
    e = b % 4
    f = (b + 8) // 25
    g = (b - f + 1) // 3
    h = (19 * a + b - d - g + 15) % 30
    i = c // 4
    k = c % 4
    l = (32 + 2 * e + 2 * i - h - k) % 7
    m = (a + 11 * h + 22 * l) // 451
    month = (h + l - 7 * m + 114) // 31
    day = ((h + l - 7 * m + 114) % 31) + 1
    return f"{year}-{int(month):02d}-{int(day):02d}"

# Create Easter dates for the years present in the holiday data.
# Collect the distinct years directly from the DataFrame rows (no RDD round trip).
years = [row["year"] for row in holiday_df.select("year").distinct().collect()]
# `yr` (not `year`) so the star-imported pyspark `year` function is never shadowed.
easter_dates = {yr: _easter_date(yr) for yr in years if yr is not None}

# Small lookup table year -> Easter date string. The explicit schema keeps this
# working when there are no years at all (schema inference fails on an empty
# list) and makes `year` an integer like the holiday_df column it joins to.
easter_dates_df = spark.createDataFrame(
    list(easter_dates.items()),
    schema="year INT, easter_date STRING",
)

# Left join so holiday rows without a matching year are kept (easter_date is null).
holiday_df = holiday_df.join(easter_dates_df, on="year", how="left")

# Flag a holiday as Easter-related when its name mentions Easter (English or
# French spelling) or when it falls within 7 days of the computed Easter date.
holiday_df = holiday_df.withColumn(
    "is_easter_related",
    when(
        (col("holiday_name").contains("Easter")) |
        (col("holiday_name").contains("Pâques")) |  # was mis-encoded and could never match
        (datediff(col("date_paris"), to_date(col("easter_date"), "yyyy-MM-dd")).between(-7, 7)),
        1
    ).otherwise(0)
)

# Step 9: Select final columns for analysis
final_columns = [
    # Dates and timestamps
    "date_paris",           # Primary date for joins
    "ts_utc",
    "ts_paris",
    # Descriptive attributes
    "holiday_name",
    "holiday_type",
    "holiday_category",
    "location",
    # 0/1 flags
    "is_national_holiday",
    "is_regional_holiday",
    "is_observance",
    "is_bridge_day",
    "is_weekend_adjacent",
    "is_easter_related",
    # Season label
    "holiday_season",
    # Calendar features
    "year",
    "month",
    "day",
    "dow",
    "doy",
    "week",
    "quarter",
]

# Drop exact duplicate rows and sort chronologically.
holiday_final = holiday_df.select(*final_columns).distinct().orderBy("date_paris")

print("Final transformed holiday data schema:")
holiday_final.printSchema()
final_row_count = holiday_final.count()
print(f"Final transformed row count: {final_row_count}")

# Show sample of transformed data
print("Sample of transformed holiday data:")
holiday_final.show(20, truncate=False)

# Show holiday distribution
print("Holiday distribution by category:")
category_season_counts = holiday_final.groupBy("holiday_category", "holiday_season").count()
category_season_counts.orderBy("count", ascending=False).show()

print("Holiday distribution by month:")
month_counts = holiday_final.groupBy("month").count()
month_counts.orderBy("month").show()


Starting holiday data transformation...
After filtering for France and holidays: 50 rows
Final transformed holiday data schema:
root
 |-- date_paris: date (nullable = true)
 |-- ts_utc: timestamp (nullable = true)
 |-- ts_paris: timestamp (nullable = true)
 |-- holiday_name: string (nullable = true)
 |-- holiday_type: string (nullable = true)
 |-- holiday_category: string (nullable = false)
 |-- location: string (nullable = true)
 |-- is_national_holiday: integer (nullable = false)
 |-- is_regional_holiday: integer (nullable = false)
 |-- is_observance: integer (nullable = false)
 |-- is_bridge_day: integer (nullable = false)
 |-- is_weekend_adjacent: integer (nullable = false)
 |-- is_easter_related: integer (nullable = false)
 |-- holiday_season: string (nullable = false)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- doy: integer (nullable = true)
 |-- week: integer (nullable = t

In [9]:
# ============================================
# Write to Silver Bucket
# ============================================

# Destination prefix, built once and reused for the sink and the log message.
silver_output_path = f"s3://{source_bucket}/{processed_folder_name}/holiday_transformed/"

# Glue sinks write DynamicFrames, so wrap the Spark DataFrame first.
holiday_dynamic_frame = DynamicFrame.fromDF(holiday_final, glueContext, "holiday_transformed")

# S3 sink that also updates the Data Catalog table on write.
sink = glueContext.getSink(
    path=silver_output_path,
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],  # Unpartitioned; could partition by year if needed
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="sink",
)
sink.setCatalogInfo(catalogDatabase=glue_database, catalogTableName="silver_holiday_data")
sink.setFormat("glueparquet")
sink.writeFrame(holiday_dynamic_frame)

print("Successfully written transformed holiday data to:")
print(silver_output_path)

# Commit the job
job.commit()

Successfully written transformed holiday data to:
s3://data-engineering-project2-432801802552/silver_data/holiday_transformed/
