In [10]:
%timeout 20

You are already connected to a glueetl session a81013b6-3f1c-4f76-af97-22d35cf07017.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current timeout is 20 minutes.
timeout has been set to 20 minutes.


In [12]:
%%configure
{
    "--job-bookmark-option":"job-bookmark-enable"
}

You are already connected to a glueetl session a81013b6-3f1c-4f76-af97-22d35cf07017.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


The following configurations have been updated: {'--job-bookmark-option': 'job-bookmark-enable'}


In [None]:
# creating the context and reading data from glue catalog
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import current_date
from awsglue.utils import getResolvedOptions

# --- Session / job setup -----------------------------------------------------
# job.init() and job.commit() bracket the run. Together with the
# transformation_ctx values passed below they are what Glue job bookmarks rely
# on (see the AWS Glue job-bookmark documentation).
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session  # keep a handle instead of discarding the value
job = Job(glueContext)
# NOTE(review): getResolvedOptions requires JOB_NAME to be present in sys.argv;
# confirm that holds when this cell is run interactively.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
job.init(args['JOB_NAME'], args)

# S3 variables
bucket_name = "data-engineering-project-920372994009"
source_folder = "bronze_data"
processed_folder = "silver_data"
db_name = "dev"
table_name = "OrderDetails"

# glue variables
glue_db = "data-engineering-project-glue-db"
glue_table_name = "raw_data_orderdetails"

# --- Read ---------------------------------------------------------------------
# Load the raw order-details table from the Glue catalog as a Spark DataFrame,
# using the schema registered in the catalog.
orderDetails_df_from_catalog = glueContext.create_data_frame_from_catalog(
    glue_db,
    glue_table_name,
    additional_options={"useCatalogSchema": True},
    transformation_ctx="orderDetails_df_from_catalog",
)

# take(1) answers "is there at least one row?" without counting every row,
# which is all the original `.count() > 0` check was used for.
has_rows = len(orderDetails_df_from_catalog.take(1)) > 0

if has_rows:
    # --- Transform ------------------------------------------------------------
    # Normalise column names to snake_case and drop the "op" column.
    renamed_orderdetails_df = (
        orderDetails_df_from_catalog
        .withColumnRenamed("orderdetailsid", "order_details_id")
        .withColumnRenamed("orderid", "order_id")
        .withColumnRenamed("productid", "product_id")
        .withColumnRenamed("quantity", "product_quantity")
        .drop("op")
    )

    # Use a distinct name for the Column so the imported `current_date`
    # function is not overwritten for the rest of the notebook session.
    ingestion_date_col = current_date()

    # `ingestion_date` stays in the data; `ingestion_date_pk` is the copy used
    # as the partition key in the write below.
    final_orderdetails_df = (
        renamed_orderdetails_df
        .withColumn("ingestion_date", ingestion_date_col)
        .withColumn("ingestion_date_pk", ingestion_date_col)
    )

    # --- Write ----------------------------------------------------------------
    # Convert to a DynamicFrame and store it in S3 as parquet, partitioned by
    # ingestion date.
    order_details_final_dyf = DynamicFrame.fromDF(
        final_orderdetails_df, glueContext, "orderdetails_final_dyf"
    )

    glueContext.write_dynamic_frame.from_options(
        frame=order_details_final_dyf,
        connection_type="s3",
        connection_options={
            "path": f"s3://{bucket_name}/{processed_folder}/{db_name}/{table_name}/",
            "partitionKeys": ["ingestion_date_pk"],
        },
        format="parquet",
        transformation_ctx="orderdetails_dyf_to_s3",
    )
else:
    print("no data from catalog to be processed")

# Commit on both paths so the job run is always closed out.
job.commit()