In [4]:
%%configure
{"--job-bookmark-option": "job-bookmark-enable"}

You are already connected to a glueetl session c9e15291-012a-4425-b5a4-9b86901128f3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


The following configurations have been updated: {'--job-bookmark-option': 'job-bookmark-enable'}


In [3]:
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import current_date, current_timestamp, lit, col, concat_ws, sha2, year, month
from pyspark.sql.types import TimestampType
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions

# Bootstrap the Glue job: reuse the running SparkContext (or create one) and
# wrap it in a GlueContext, which provides the catalog readers and S3 writers.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
# Keep a handle on the underlying SparkSession so plain Spark APIs are reachable.
spark = glueContext.spark_session
job = Job(glueContext)
# NOTE(review): getResolvedOptions requires JOB_NAME to be present in sys.argv;
# confirm it is supplied when this notebook runs as an interactive session.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
# job.init() opens the job run; it is paired with job.commit() at the end of the script.
job.init(args['JOB_NAME'], args)

# S3 settings: the bucket plus the folder names used to lay out the data.
# The output path is assembled as <bucket>/<processed_folder>/<db_name>/<table_name>/.
bucket_name = "data-engineering-project-920372994009"
source_folder, processed_folder = "bronze_data", "silver_data"
db_name, table_name = "dev", "Orders"

# Glue Data Catalog settings: database and table the raw orders are read from.
glue_db, glue_table_name = "data-engineering-project-glue-db", "raw_data_orders"

# Read the raw orders table through the Glue Data Catalog as a Spark DataFrame.
# transformation_ctx names this source for Glue's job-bookmark state.
orders_df_from_catalog = glueContext.create_data_frame_from_catalog(
    glue_db,
    glue_table_name,
    additional_options={"useCatalogSchema": True, "useSparkDataSource": True, "header": True},
    transformation_ctx="orders_df_from_catalog",
)

# take(1) stops after the first row, so the emptiness check does not need to
# count every record in the source.
if orders_df_from_catalog.take(1):
    # Normalise the catalog column names to snake_case and drop the "op" column
    # (presumably a change-data-capture operation flag from the loader; confirm).
    renamed_orders_df = (
        orders_df_from_catalog
        .withColumnRenamed("orderid", "order_id")
        .withColumnRenamed("ordercustomerid", "order_customer_id")
        .withColumnRenamed("orderdate", "order_date")
        .withColumnRenamed("paymentmethod", "payment_method")
        .withColumnRenamed("orderplatform", "order_platform")
        .drop("op")
    )

    # TODO: SCD type 2 attributes (record end timestamp, active flag, row hash)
    # are not implemented yet. When adding them, do not assign to the names
    # current_date / current_timestamp: that would shadow the imported Spark
    # functions that the ingestion_date column below relies on.

    # Derive the date parts and stamp the load date. order_year_pk duplicates
    # order_year and is used as the partition key on write (presumably so that
    # order_year is still present as a regular column in the data files; confirm).
    # Rows are sorted newest order first before writing.
    final_orders_df = (
        renamed_orders_df
        .withColumn("order_year", year(col("order_date")))
        .withColumn("order_year_pk", year(col("order_date")))
        .withColumn("order_month", month(col("order_date")))
        .withColumn("ingestion_date", current_date())
        .orderBy(col("order_date").desc())
    )

    # The Glue S3 writer takes a DynamicFrame, so convert the final DataFrame.
    final_orders_dyf = DynamicFrame.fromDF(final_orders_df, glueContext, "final_orders_dyf")

    # Write the result to S3 as Parquet, partitioned by order_year_pk.
    glueContext.write_dynamic_frame.from_options(
        frame=final_orders_dyf,
        connection_type="s3",
        connection_options={
            "path": f"s3://{bucket_name}/{processed_folder}/{db_name}/{table_name}/",
            "partitionKeys": ["order_year_pk"],
        },
        format="parquet",
        transformation_ctx="orders_dyf_to_s3",
    )
else:
    # NOTE(review): with job bookmarks enabled, an empty read may simply mean
    # there is no new data since the last committed run; confirm the wording.
    print("Cannot get the data from glue catalog")

# Close the job run; with bookmarks enabled this is expected to persist the
# bookmark state so the next run skips already-processed input.
job.commit()

{"order_id": "5f7af246-e314-4573-a13e-80a14e2726da", "order_customer_id": "f2a52f68-32fd-4e87-8048-427449889692", "order_date": "2024-12-29", "payment_method": "Credit Card", "order_platform": "Website", "order_year": 2024, "order_year_pk": 2024, "order_month": 12, "ingestion_date": 2025-03-31}
{"order_id": "12b01566-cf79-4f53-b978-112c33aa4269", "order_customer_id": "f2a52f68-32fd-4e87-8048-427449889692", "order_date": "2024-11-16", "payment_method": "Credit Card", "order_platform": "Mobile", "order_year": 2024, "order_year_pk": 2024, "order_month": 11, "ingestion_date": 2025-03-31}
{"order_id": "13272821-905f-42e1-9b67-1d919d5eb8c2", "order_customer_id": "4a4edf67-aef9-4fc5-84d2-244859de1099", "order_date": "2024-09-28", "payment_method": "Credit Card", "order_platform": "Website", "order_year": 2024, "order_year_pk": 2024, "order_month": 9, "ingestion_date": 2025-03-31}
{"order_id": "35da5dbe-9a46-495b-b10f-81e7c2d61e25", "order_customer_id": "f2422541-c9bd-450c-9f70-999e6dba568e", 