In [5]:
%load_ext dotenv
%dotenv

In [14]:
%iam_role arn:aws:iam::684969100054:role/AdminAccessGlueNotebook
%region eu-west-1
%session_id_prefix test-
%glue_version 3.0
%idle_timeout 60
%worker_type G.1X
%number_of_workers 2
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg"
}

Current iam_role is arn:aws:iam::684969100054:role/aws-reserved/sso.amazonaws.com/eu-west-2/AWSReservedSSO_AdministratorAccess_ab408ccf26c25b37
iam_role has been set to arn:aws:iam::684969100054:role/AdminAccessGlueNotebook.
Previous region: eu-west-1
Setting new region to: eu-west-1
Reauthenticating Glue client with new region: eu-west-1
IAM role has been set to arn:aws:iam::684969100054:role/AdminAccessGlueNotebook. Reauthenticating.
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::684969100054:role/AdminAccessGlueNotebook
Authentication done.
Region is set to: eu-west-1
Setting session ID prefix to test-
Setting Glue version to: 3.0
Current idle_timeout is 2880 minutes.
idle_timeout has been set to 60 minutes.
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 2
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensi

In [1]:
import boto3
import datetime
from pyspark.sql import SparkSession, functions as F
from pyspark.sql import SparkSession

# Names and S3 locations shared by every cell below.
catalog_name = "glue_catalog"
bucket_name = "sb-test-bucket-ireland"
bucket_prefix = "sb"
table_name = "datagensb"
warehouse_path = f"s3://{bucket_name}/{bucket_prefix}"
input_prefix = "tpcds_test"
input_path = f"s3://{bucket_name}/{input_prefix}"

# Register `catalog_name` as an Iceberg SparkCatalog that uses the Glue catalog
# implementation and S3FileIO, with its warehouse rooted at `warehouse_path`,
# and enable the Iceberg SQL extensions.
spark = (
    SparkSession.builder
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog")
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", warehouse_path)
    .config(f"spark.sql.catalog.{catalog_name}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config(f"spark.sql.catalog.{catalog_name}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 2
Session ID: a104ef4d-3d12-4ae6-b78e-656f0b803759
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.4
--enable-glue-datacatalog true
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
--datalake-formats iceberg
Waiting for session a104ef4d-3d12-4ae6-b78e-656f0b803759 to get into ready status...
Session a104ef4d-3d12-4ae6-b78e-656f0b803759 has been created.



In [3]:

source_database_name = "tpcds_test"
dest_database_name = "tpcds_test_glue_iceberg"
# Fully qualified Iceberg table identifier: <catalog>.<database>.<table>.
output_directory = f"{catalog_name}.{dest_database_name}.{table_name}"
# Far-future timestamp used as the end_datetime of rows that are still current.
future_end_datetime = datetime.datetime(2250, 1, 1)

## Create the destination Glue database for the Iceberg table if it does not exist yet.
# The client is created outside the try block so that the except clause, which
# needs `glue.exceptions`, never depends on a name bound inside the try.
glue = boto3.client('glue')
try:
    glue.create_database(DatabaseInput={'Name': dest_database_name})
    print(f"New database {dest_database_name} created")
except glue.exceptions.AlreadyExistsException:
    print(f"Database {dest_database_name} already exist")

## Delete files in S3
# S3 object keys are relative to the bucket, so the prefix must NOT contain the
# bucket name; with the bucket name in front the filter matched no objects and
# the delete was a silent no-op.
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix=f"{bucket_prefix}/").delete()

Database tpcds_test_glue_iceberg already exist
[]


In [2]:
# Preview the full-load extract. Parquet files carry their own schema, so the
# CSV-only 'header' reader option is not needed here.
full_load = spark.read.parquet(f"{input_path}/full_load")
full_load.show()

+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+---------------------+-----------------+----------+-------------+-----------+-------------------+-------------+--------------------+----+---+
|ss_sold_date_sk|ss_sold_time_sk|ss_item_sk|ss_customer_sk|ss_cdemo_sk|ss_hdemo_sk|ss_store_sk|ss_addr_sk|ss_promo_sk|ss_ticket_number|ss_quantity|ss_wholesale_cost|ss_list_price|ss_sales_price|ss_ext_discount_amt|ss_ext_sales_price|ss_ext_wholesale_cost|ss_ext_list_price|ss_ext_tax|ss_coupon_amt|ss_net_paid|ss_net_paid_inc_tax|ss_net_profit|extraction_timestamp|  op| pk|
+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+-----------------

In [13]:
## Drop table in Glue Data Catalog
# The client is created outside the try block so that the except clause, which
# needs `glue.exceptions`, never depends on a name bound inside the try.
glue = boto3.client('glue')
try:
    glue.delete_table(DatabaseName=dest_database_name, Name=table_name)
except glue.exceptions.EntityNotFoundException:
    # A missing table is fine here: the table is (re)created by bulk_insert.
    print(f"Table {dest_database_name}.{table_name} does not exist")

def bulk_insert(full_load_path,output_directory,future_end_datetime):
    """Create or replace the destination table from the full-load parquet data.

    Args:
        full_load_path: Location of the parquet data to read.
        output_directory: Identifier of the table to create or replace.
        future_end_datetime: Value written to ``end_datetime`` for every row.

    The rows read are extended before writing:
      * ``start_datetime`` is a copy of ``extraction_timestamp``;
      * ``end_datetime`` is ``future_end_datetime``;
      * ``op`` is overwritten with the literal string "None";
      * ``is_current`` is True.
    """
    initial_rows = (
        spark.read.parquet(full_load_path)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
        .withColumn("end_datetime", F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd'))
        .withColumn("op", F.lit("None"))
        .withColumn("is_current", F.lit(True))
    )
    initial_rows.writeTo(output_directory).createOrReplace()
    
# Build the destination table from the full load, then display its contents.
bulk_insert(
    full_load_path=f"{input_path}/full_load",
    output_directory=output_directory,
    future_end_datetime=future_end_datetime,
)
spark.table(output_directory).show()

+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+---------------------+-----------------+----------+-------------+-----------+-------------------+-------------+--------------------+----+---+-------------------+-------------------+----------+
|ss_sold_date_sk|ss_sold_time_sk|ss_item_sk|ss_customer_sk|ss_cdemo_sk|ss_hdemo_sk|ss_store_sk|ss_addr_sk|ss_promo_sk|ss_ticket_number|ss_quantity|ss_wholesale_cost|ss_list_price|ss_sales_price|ss_ext_discount_amt|ss_ext_sales_price|ss_ext_wholesale_cost|ss_ext_list_price|ss_ext_tax|ss_coupon_amt|ss_net_paid|ss_net_paid_inc_tax|ss_net_profit|extraction_timestamp|  op| pk|     start_datetime|       end_datetime|is_current|
+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-

In [7]:
# Preview the cdc_1 extract. Parquet files carry their own schema, so the
# CSV-only 'header' reader option is not needed here.
spark.read.parquet(f"{input_path}/cdc_1").show()


+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+---------------------+-----------------+----------+-------------+-----------+-------------------+-------------+--------------------+---+---+
|ss_sold_date_sk|ss_sold_time_sk|ss_item_sk|ss_customer_sk|ss_cdemo_sk|ss_hdemo_sk|ss_store_sk|ss_addr_sk|ss_promo_sk|ss_ticket_number|ss_quantity|ss_wholesale_cost|ss_list_price|ss_sales_price|ss_ext_discount_amt|ss_ext_sales_price|ss_ext_wholesale_cost|ss_ext_list_price|ss_ext_tax|ss_coupon_amt|ss_net_paid|ss_net_paid_inc_tax|ss_net_profit|extraction_timestamp| op| pk|
+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+-------------------

In [14]:
def scd2_simple(updates_filepath, output_directory, future_end_datetime, primary_key):
    """Merge one batch of change records into the destination table, keeping history.

    The parquet data at ``updates_filepath`` is read and extended with
    ``start_datetime`` (copy of ``extraction_timestamp``), ``end_datetime``
    (``future_end_datetime``) and ``is_current`` (True), then registered as the
    temp view ``tmp_<table_name>_updates`` (``table_name`` is the notebook-level
    variable). A single MERGE INTO is then executed whose source is the union of:

      1. the incoming rows, and
      2. the rows of the destination table that are flagged ``is_current`` and
         share a key with an incoming row, carrying the incoming row's
         ``start_datetime`` as their new ``end_datetime``.

    Source rows that match a destination row on (key, ``extraction_timestamp``)
    update that row's ``end_datetime`` and set ``is_current`` to false; all other
    source rows are inserted with ``is_current`` true.

    Args:
        updates_filepath: Location of the parquet data holding the new batch.
        output_directory: Identifier of the table to merge into (the same value
            that is passed to ``spark.table``).
        future_end_datetime: Value written to ``end_datetime`` of inserted rows.
        primary_key: Name of the key column used in the join and merge
            conditions. It must be one of the columns listed in the SELECT
            clauses below (the notebook passes "pk").

    Returns:
        None. The MERGE is executed for its side effect on the table.
    """
    updates_view = f"tmp_{table_name}_updates"

    # Read the new updates parquet file (parquet carries its own schema, so no
    # CSV-style 'header' option is required).
    full_load_updates = spark.read.parquet(updates_filepath)
    # Add the three history-tracking columns.
    full_load_updates = full_load_updates.withColumn("start_datetime", F.col("extraction_timestamp"))
    full_load_updates = full_load_updates.withColumn("end_datetime", F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd'))
    full_load_updates = full_load_updates.withColumn("is_current", F.lit(True))
    full_load_updates.createOrReplaceTempView(updates_view)

    simple_merge_sql = f"""
    MERGE INTO {output_directory} dest
        USING (
            SELECT
            ss_sold_date_sk,
            ss_sold_time_sk,
            ss_item_sk,
            ss_customer_sk,
            ss_cdemo_sk,
            ss_hdemo_sk,
            ss_addr_sk,
            ss_store_sk,
            ss_promo_sk,
            ss_ticket_number,
            ss_quantity,
            ss_wholesale_cost,
            ss_list_price,
            ss_sales_price,
            ss_ext_discount_amt,
            ss_ext_sales_price,
            ss_ext_wholesale_cost,
            ss_ext_list_price,
            ss_ext_tax,
            ss_coupon_amt,
            ss_net_paid,
            ss_net_paid_inc_tax,
            ss_net_profit,
            extraction_timestamp,
            op,
            pk,
            start_datetime,
            end_datetime,
            is_current
                FROM {updates_view}
        UNION ALL
            SELECT
            t.ss_sold_date_sk,
            t.ss_sold_time_sk,
            t.ss_item_sk,
            t.ss_customer_sk,
            t.ss_cdemo_sk,
            t.ss_hdemo_sk,
            t.ss_addr_sk,
            t.ss_store_sk,
            t.ss_promo_sk,
            t.ss_ticket_number,
            t.ss_quantity,
            t.ss_wholesale_cost,
            t.ss_list_price,
            t.ss_sales_price,
            t.ss_ext_discount_amt,
            t.ss_ext_sales_price,
            t.ss_ext_wholesale_cost,
            t.ss_ext_list_price,
            t.ss_ext_tax,
            t.ss_coupon_amt,
            t.ss_net_paid,
            t.ss_net_paid_inc_tax,
            t.ss_net_profit,
            t.extraction_timestamp,
            t.op,
            t.pk,
            t.start_datetime,
            u.start_datetime AS end_datetime,
            u.is_current
            FROM {output_directory} as t
            INNER JOIN {updates_view} as u ON t.{primary_key} = u.{primary_key} AND t.is_current = true
        ) AS src
        ON (dest.{primary_key} = src.{primary_key} AND dest.extraction_timestamp=src.extraction_timestamp)
        WHEN MATCHED THEN
            UPDATE SET end_datetime = src.end_datetime,is_current = false
        WHEN NOT MATCHED THEN
            INSERT (
                ss_sold_date_sk, ss_sold_time_sk, ss_item_sk, ss_customer_sk, ss_cdemo_sk, ss_hdemo_sk, ss_addr_sk,
                ss_store_sk, ss_promo_sk, ss_ticket_number, ss_quantity, ss_wholesale_cost, ss_list_price, ss_sales_price,
                ss_ext_discount_amt, ss_ext_sales_price, ss_ext_wholesale_cost, ss_ext_list_price, ss_ext_tax, ss_coupon_amt,
                ss_net_paid, ss_net_paid_inc_tax, ss_net_profit, extraction_timestamp, op, pk, start_datetime, end_datetime, is_current
            )
            VALUES (
                src.ss_sold_date_sk, src.ss_sold_time_sk, src.ss_item_sk, src.ss_customer_sk, src.ss_cdemo_sk, src.ss_hdemo_sk,
                src.ss_addr_sk, src.ss_store_sk, src.ss_promo_sk, src.ss_ticket_number, src.ss_quantity, src.ss_wholesale_cost,
                src.ss_list_price, src.ss_sales_price, src.ss_ext_discount_amt, src.ss_ext_sales_price, src.ss_ext_wholesale_cost,
                src.ss_ext_list_price, src.ss_ext_tax, src.ss_coupon_amt, src.ss_net_paid, src.ss_net_paid_inc_tax, src.ss_net_profit,
                src.extraction_timestamp, src.op, src.pk, src.extraction_timestamp, src.end_datetime, true
            )
            """
    # spark.sql() runs the MERGE itself. The former trailing
    # `.writeTo(output_directory)` only built a writer object that was never
    # executed, so it has been removed.
    spark.sql(simple_merge_sql)


# Merge the cdc_1 batch into the table, then list the rows ordered by key and
# extraction time.
scd2_simple(
    updates_filepath=f"{input_path}/cdc_1",
    output_directory=output_directory,
    future_end_datetime=future_end_datetime,
    primary_key="pk",
)
spark.table(output_directory).sort("pk", "extraction_timestamp").show()

+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+---------------------+-----------------+----------+-------------+-----------+-------------------+-------------+--------------------+----+---+-------------------+-------------------+----------+
|ss_sold_date_sk|ss_sold_time_sk|ss_item_sk|ss_customer_sk|ss_cdemo_sk|ss_hdemo_sk|ss_store_sk|ss_addr_sk|ss_promo_sk|ss_ticket_number|ss_quantity|ss_wholesale_cost|ss_list_price|ss_sales_price|ss_ext_discount_amt|ss_ext_sales_price|ss_ext_wholesale_cost|ss_ext_list_price|ss_ext_tax|ss_coupon_amt|ss_net_paid|ss_net_paid_inc_tax|ss_net_profit|extraction_timestamp|  op| pk|     start_datetime|       end_datetime|is_current|
+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-

In [15]:
# Preview the cdc_2 extract. Parquet files carry their own schema, so the
# CSV-only 'header' reader option is not needed here.
spark.read.parquet(f"{input_path}/cdc_2").show()

+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+---------------------+-----------------+----------+-------------+-----------+-------------------+-------------+--------------------+---+---+
|ss_sold_date_sk|ss_sold_time_sk|ss_item_sk|ss_customer_sk|ss_cdemo_sk|ss_hdemo_sk|ss_store_sk|ss_addr_sk|ss_promo_sk|ss_ticket_number|ss_quantity|ss_wholesale_cost|ss_list_price|ss_sales_price|ss_ext_discount_amt|ss_ext_sales_price|ss_ext_wholesale_cost|ss_ext_list_price|ss_ext_tax|ss_coupon_amt|ss_net_paid|ss_net_paid_inc_tax|ss_net_profit|extraction_timestamp| op| pk|
+---------------+---------------+----------+--------------+-----------+-----------+-----------+----------+-----------+----------------+-----------+-----------------+-------------+--------------+-------------------+------------------+-------------------

In [16]:
# Merge the cdc_2 batch into the table, then list the rows ordered by key and
# extraction time.
scd2_simple(
    updates_filepath=f"{input_path}/cdc_2",
    output_directory=output_directory,
    future_end_datetime=future_end_datetime,
    primary_key="pk",
)
spark.table(output_directory).sort("pk", "extraction_timestamp").show()


AttributeError: 'DataFrame' object has no attribute 'order'


In [None]:
def scd2_complex(updates_filepath, output_directory, future_end_datetime, primary_key):
    """Merge one batch of change records into the destination table.

    This function was a line-for-line copy of ``scd2_simple`` (same reads, same
    temp view, same MERGE statement). Until it gains logic of its own it simply
    delegates to ``scd2_simple`` so the two cannot drift apart.

    Args:
        updates_filepath: Location of the parquet data holding the new batch.
        output_directory: Identifier of the table to merge into.
        future_end_datetime: Value written to ``end_datetime`` of inserted rows.
        primary_key: Name of the key column, forwarded to ``scd2_simple``.

    Returns:
        None.
    """
    # TODO: replace this delegation once the "complex" merge differs from the
    # simple one.
    scd2_simple(updates_filepath, output_directory, future_end_datetime, primary_key)


# NOTE(review): this cell defines scd2_complex above but still calls scd2_simple,
# and on the cdc_1 batch that an earlier cell already merged — presumably a
# copy-paste leftover; confirm which function and which batch were intended.
scd2_simple(f"{input_path}/cdc_1", output_directory,future_end_datetime, "pk")
# Display the table ordered by key and extraction time.
spark.table(output_directory).sort("pk","extraction_timestamp").show()