## Glue + Iceberg evaluation

###### Bulk Insert 
###### SCD2
###### Impute deletions
###### Deduplication


## Initialise SparkSession

## Bulk Insert 


In [None]:
## Initialise SparkSession

In [5]:
%session_id_prefix native-iceberg-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.3 
Setting session ID prefix to native-iceberg-dataframe-
Setting Glue version to: 3.0
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg'}


In [12]:
# Coordinates of the Iceberg table referenced by the cells that follow.
catalog_name, database_name, table_name = "glue_catalog", "n2_iceberg_dataframe", "datagensb"

# S3 location that is handed to the catalog as its warehouse.
bucket_name, bucket_prefix = "sb-test-bucket-ireland", "sb"
warehouse_path = "s3://" + bucket_name + "/" + bucket_prefix




In [13]:
from pyspark.sql import SparkSession
# Build (or reuse) a session with an Iceberg catalog registered under `catalog_name`:
# GlueCatalog as the catalog implementation, S3FileIO for file access, and the
# Iceberg SQL extensions enabled.
spark = (
    SparkSession.builder
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog")
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", warehouse_path)
    .config(f"spark.sql.catalog.{catalog_name}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config(f"spark.sql.catalog.{catalog_name}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)




In [14]:
# Create the database inside the Iceberg catalog unless it already exists.
query = f"""
CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}
"""
spark.sql(query)

DataFrame[]


In [4]:
full_load = spark.read.option('header','true').parquet("s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/full_load.parquet")




In [6]:
#

In [5]:
import pyspark.sql.functions as f




In [6]:

# Add the SCD2 curation columns to the loaded data.
# `future_end_datetime` is only assigned in a later cell of this notebook, so it is
# set here as well (same value) to keep this cell runnable on a fresh kernel.
future_end_datetime = "2050-01-01"

full_load = (
    full_load
    # A record becomes valid at the moment it was extracted.
    .withColumn("start_datetime", f.col("extraction_timestamp"))
    # Open-ended validity is represented by a far-future timestamp.
    .withColumn("end_datetime", f.to_timestamp(f.lit(future_end_datetime), 'yyyy-MM-dd'))
    # Note: this is the literal string "None", not a SQL NULL.
    .withColumn("op", f.lit("None"))
    .withColumn("is_current", f.lit(True))
)





In [None]:
# delete later
# Same curation columns as the preceding cell, with the far-future end date written inline.
full_load = (
    full_load
    .withColumn("start_datetime", f.col("extraction_timestamp"))
    .withColumn("end_datetime", f.to_timestamp(f.lit("2050-01-01"), 'yyyy-MM-dd'))
    .withColumn("op", f.lit("None"))
    .withColumn("is_current", f.lit(True))
)

In [15]:
# Parquet file holding the initial full load.
input_filepath = "s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/full_load.parquet"
# Despite its name this is a table identifier (catalog.database.table), not a path.
output_directory = ".".join([catalog_name, database_name, table_name])
# Date ('yyyy-MM-dd') used as end_datetime for records that are still open.
future_end_datetime = "2050-01-01"




In [6]:
full_load=spark.read.option('header','true').parquet(input_filepath)




In [16]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, Row
import time


def bulk_insert(input_filepath,output_directory,future_end_datetime):
    """Load a parquet file, add the SCD2 curation columns and create the target table.

    Prints the elapsed wall-clock time in seconds; returns nothing.

    Args:
        input_filepath: Location of the parquet data to load.
        output_directory: Identifier of the table to create (passed to ``writeTo``).
        future_end_datetime: 'yyyy-MM-dd' date string stored as ``end_datetime``.
    """
    started_at = time.time()
    curated = (
        spark.read.option('header','true').parquet(input_filepath)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
        .withColumn("end_datetime", F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd'))
        .withColumn("op", F.lit("None"))  # literal string "None", not a SQL NULL
        .withColumn("is_current", F.lit(True))
    )
    # Rows are sorted by product_name within each partition before the table is created.
    curated.sortWithinPartitions("product_name").writeTo(output_directory).create()
    print(time.time()-started_at)




In [17]:
bulk_insert(input_filepath,output_directory,future_end_datetime)

10.97949767112732


In [7]:
#full_load = full_load.withColumn("'end_datetime'",f.col("extraction_timestamp"))

In [8]:
#from pyspark.sql.types import StringType,BooleanType,DateType

In [9]:
#full_load=full_load.withColumn("op",full_load.op.cast(StringType))

In [7]:
full_load.show()

+----------+------------+-----+--------------------+----+
|product_id|product_name|price|extraction_timestamp|  op|
+----------+------------+-----+--------------------+----+
|     00001|      Heater|  250| 2022-01-01 01:01:01|null|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|null|
|     00003|  Television|  600| 2022-01-01 01:01:01|null|
|     00004|     Blender|  100| 2022-01-01 01:01:01|null|
|     00005| USB charger|   50| 2022-01-01 01:01:01|null|
+----------+------------+-----+--------------------+----+


In [8]:
# Create the Iceberg table from full_load, with rows sorted by product_name inside each partition.
(
    full_load
    .sortWithinPartitions("product_name")
    .writeTo(f"{catalog_name}.{database_name}.{table_name}")
    .create()
)




In [9]:
spark.catalog.listTables(database_name)

[Table(name='datagensb', database='n1_iceberg_dataframe', description=None, tableType=None, isTemporary=False)]


In [10]:
spark.table(f"{catalog_name}.{database_name}.{table_name}") \
    .show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00004|     Blender|  100| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00001|      Heater|  250| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00003|  Television|  600| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00005| USB charger|   50| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+


In [11]:
spark.table(f"{catalog_name}.{database_name}.{table_name}.history") \
    .show()

+--------------------+-----------------+---------+-------------------+
|     made_current_at|      snapshot_id|parent_id|is_current_ancestor|
+--------------------+-----------------+---------+-------------------+
|2023-05-25 10:40:...|78222291571215463|     null|               true|
+--------------------+-----------------+---------+-------------------+


## Slowly Changing Dimension Type 2 (SCD2)
The updates are created by setting one column to the same value for every record, which keeps the testing simple. Soft deletes are not taken into account, since they follow a very similar process from a performance perspective.

Steps:

Read updates
Join full load with updates on primary key
Set end_datetime to the extraction_timestamp of the updated records
Close the existing records
Add curation columns to updates
Append updated data to existing data

In [13]:
full_load_updates = spark.read.option('header','true').parquet("s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/updates.parquet")




In [15]:
full_load_updates = full_load_updates.withColumn("start_datetime",f.col("extraction_timestamp"))




In [16]:
full_load_updates = full_load_updates.withColumn("end_datetime", f.to_timestamp(f.lit("2050-01-01"), 'yyyy-MM-dd'))




In [17]:
full_load_updates = full_load_updates.withColumn("is_current",f.lit(True))




In [18]:
full_load_updates.show()

+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp| op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+
|     00001|      Heater| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Thermostat| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00003|  Television| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00004|     Blender| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00005| USB charger| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+


In [42]:
#from pyspark.sql.types import IntegerType

In [43]:
#full_load_updates = full_load_updates.withColumn("op",full_load_updates["op"].cast(IntegerType()))

In [19]:
full_load_updates.schema

StructType(List(StructField(product_id,StringType,true),StructField(product_name,StringType,true),StructField(price,LongType,true),StructField(extraction_timestamp,TimestampType,true),StructField(op,StringType,true),StructField(start_datetime,TimestampType,true),StructField(end_datetime,TimestampType,true),StructField(is_current,BooleanType,false)))


In [6]:
#full_load_updates.schema

In [20]:
full_load.schema

StructType(List(StructField(product_id,StringType,true),StructField(product_name,StringType,true),StructField(price,LongType,true),StructField(extraction_timestamp,TimestampType,true),StructField(op,StringType,false),StructField(start_datetime,TimestampType,true),StructField(end_datetime,TimestampType,true),StructField(is_current,BooleanType,false)))


In [21]:
full_load_updates.show()

+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp| op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+
|     00001|      Heater| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Thermostat| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00003|  Television| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00004|     Blender| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00005| USB charger| 1000| 2023-01-01 01:01:01|  U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+


In [22]:

full_load_updates.createOrReplaceTempView(f"tmp_{table_name}_updates")




In [23]:
# Close the currently open version of every record that has an incoming update:
# its end_datetime becomes the update's extraction_timestamp and is_current turns False.
# The `f.is_current = true` guard leaves already-closed historical versions untouched
# when this MERGE runs again for a later batch (without it their end_datetime would
# be overwritten by every new update for the same product_id).
query = f"""
MERGE INTO {catalog_name}.{database_name}.{table_name} AS f
USING (SELECT * FROM tmp_{table_name}_updates) AS u
ON f.product_id = u.product_id
WHEN MATCHED AND f.is_current = true THEN UPDATE SET f.end_datetime = u.extraction_timestamp, f.is_current = False

"""
spark.sql(query)

DataFrame[]


In [24]:
spark.table(f"{catalog_name}.{database_name}.{table_name}") \
    .show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00004|     Blender|  100| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      Heater|  250| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00003|  Television|  600| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00005| USB charger|   50| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+


In [None]:
# query = f"""
# SELECT tmp_{table_name}_updates.product_id as mergekey,tmp_{table_name}_updates.*
# from tmp_{table_name}_updates

# UNION ALL

# SELECT * FROM {catalog_name}.{database_name}.{table_name}
# ON {catalog_name}.{database_name}.{table_name}.product_id = tmp_{table_name}_updates.product_id
# WHEN MATCHED THEN UPDATE SET f.end_datetime = u.extraction_timestamp, f.is_current = False

# """
# spark.sql(query)

In [113]:
# query = f"""
# SELECT * FROM tmp_{table_name}_updates


# UNION ALL

# SELECT * FROM {catalog_name}.{database_name}.{table_name}


# """
# spark.sql(query)

DataFrame[product_id: string, product_name: string, price: bigint, extraction_timestamp: timestamp, op: string, start_datetime: timestamp, end_datetime: timestamp, is_current: boolean]


In [114]:
# spark.table(f"{catalog_name}.{database_name}.{table_name}") \
#     .show()

+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp| op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+
|     00004|     Blender|  100| 2022-01-01 01:01:01|  U|2022-01-01 01:01:01|2023-01-01 00:00:00|     false|
|     00001|      Heater|  250| 2022-01-01 01:01:01|  U|2022-01-01 01:01:01|2023-01-01 00:00:00|     false|
|     00003|  Television|  600| 2022-01-01 01:01:01|  U|2022-01-01 01:01:01|2023-01-01 00:00:00|     false|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|  U|2022-01-01 01:01:01|2023-01-01 00:00:00|     false|
|     00005| USB charger|   50| 2022-01-01 01:01:01|  U|2022-01-01 01:01:01|2023-01-01 00:00:00|     false|
+----------+------------+-----+--------------------+---+-------------------+-------------------+----------+


In [38]:
# spark.table(f"{catalog_name}.{database_name}.{table_name}") \
#     .show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00001|      Heater| 1000| 2023-01-01 00:00:00|null|2023-01-01 00:00:00|2050-01-01 00:00:00|     false|
|     00002|  Thermostat| 1000| 2023-01-01 00:00:00|null|2023-01-01 00:00:00|2050-01-01 00:00:00|     false|
|     00003|  Television| 1000| 2023-01-01 00:00:00|null|2023-01-01 00:00:00|2050-01-01 00:00:00|     false|
|     00004|     Blender| 1000| 2023-01-01 00:00:00|null|2023-01-01 00:00:00|2050-01-01 00:00:00|     false|
|     00005| USB charger| 1000| 2023-01-01 00:00:00|null|2023-01-01 00:00:00|2050-01-01 00:00:00|     false|
|     00004|     Blender|  100| 2022-01-01 01:01:01|null|2022-01-01 01:01:01|2023-01-01 00:00:00|     false|
|     00001|      H

In [25]:
full_load_updates.writeTo(f"{catalog_name}.{database_name}.{table_name}").append()




In [26]:
spark.table(f"{catalog_name}.{database_name}.{table_name}") \
    .show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00001|      Heater| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Thermostat| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00003|  Television| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00004|     Blender| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00005| USB charger| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00004|     Blender|  100| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      H

In [13]:
#updates.writeTo(f"{catalog_name}.{database_name}.{table_name}").createOrReplace()




In [10]:
import pyspark.sql.functions as f




In [4]:
import time
start = time.time()
print(time.time()-start)

3.4332275390625e-05


In [8]:
round(start,1)

1684516751.7


In [11]:
# End-to-end timing: bulk insert of the full load followed by the simple SCD2 update.
start = time.time()

# --- Bulk insert: load, add curation columns, create the table ---
full_load = spark.read.option('header','true').parquet("s3://sb-test-bucket-ireland/dummy_data/full_load.parquet")
full_load = full_load.withColumn("start_datetime",f.col("extraction_timestamp"))
full_load = full_load.withColumn("end_datetime", f.to_timestamp(f.lit("2050-01-01"), 'yyyy-MM-dd'))
full_load = full_load.withColumn("op",f.lit("None"))
full_load = full_load.withColumn("is_current",f.lit(True))
full_load.sortWithinPartitions("product_name") \
    .writeTo(f"{catalog_name}.{database_name}.{table_name}") \
    .create()
# The result is not used; the call is kept so the timed work stays the same.
spark.catalog.listTables(database_name)

# --- SCD2: load the updates and add their curation columns ---
full_load_updates = spark.read.option('header','true').parquet("s3://sb-test-bucket-ireland/dummy_data/updates.parquet")
full_load_updates = full_load_updates.withColumn("start_datetime",f.col("extraction_timestamp"))
full_load_updates = full_load_updates.withColumn("end_datetime", f.to_timestamp(f.lit("2050-01-01"), 'yyyy-MM-dd'))
full_load_updates = full_load_updates.withColumn("is_current",f.lit(True))

full_load_updates.createOrReplaceTempView(f"tmp_{table_name}_updates")
# Close only the open version of each updated record. The `f.is_current = true` guard
# keeps already-closed historical versions from having their end_datetime overwritten
# when a later batch is merged.
query = f"""
MERGE INTO {catalog_name}.{database_name}.{table_name} AS f
USING (SELECT * FROM tmp_{table_name}_updates) AS u
ON f.product_id = u.product_id
WHEN MATCHED AND f.is_current = true THEN UPDATE SET f.end_datetime = u.extraction_timestamp, f.is_current = False

"""
spark.sql(query)
# Append the updates as the new current versions.
full_load_updates.writeTo(f"{catalog_name}.{database_name}.{table_name}").append()
print(time.time()-start)

45.95531153678894


In [29]:
spark.table(f"{catalog_name}.{database_name}.{table_name}") \
    .show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00004|     Blender|  100| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      Heater|  250| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00003|  Television|  600| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00005| USB charger|   50| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      Heater| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Therm

## Slowly Changing Dimension Type 2 - Complex
This is a more complex SCD2 process which takes into account:
Late arriving records, where an update is processed with an extraction_timestamp that is earlier than the extraction_timestamp of the last processed record
Batches which contain multiple updates to the same primary key
The process can be summarised as follows:

Concat/union updates with the existing data
Sort by primary key and extraction_timestamp
Window by primary key and set the end_datetime to the next record's extraction_timestamp, otherwise set it to a future distant timestamp
The process could be optimised by separating records which have not received any updates, but this is left out to make the logic easier to follow.

In [14]:
late_updates = spark.read.option('header','true').parquet("s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/late_updates.parquet")




Functions

In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, Row
import time


# NOTE(review): this redefines bulk_insert, which an earlier cell of this notebook
# already defines with the same logic; the later definition shadows the earlier one.
def bulk_insert(input_filepath,output_directory,future_end_datetime):
    """Load a parquet file, add the SCD2 curation columns and create the target table.

    Prints the elapsed wall-clock time in seconds; returns nothing.

    Args:
        input_filepath: Location of the parquet data to load.
        output_directory: Identifier of the table to create (passed to ``writeTo``).
        future_end_datetime: 'yyyy-MM-dd' date string stored as ``end_datetime``.
    """
    started_at = time.time()
    curated = (
        spark.read.option('header','true').parquet(input_filepath)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
        .withColumn("end_datetime", F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd'))
        .withColumn("op", F.lit("None"))  # literal string "None", not a SQL NULL
        .withColumn("is_current", F.lit(True))
    )
    # Rows are sorted by product_name within each partition before the table is created.
    curated.sortWithinPartitions("product_name").writeTo(output_directory).create()
    print(time.time()-started_at)

In [24]:
# Parquet file holding the batch of updates.
updates_filepath ="s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/updates.parquet"
# Column on which updates are matched to existing records.
primary_key = "product_id"




In [27]:
def scd2_simple(input_filepath, updates_filepath, output_directory, future_end_datetime, primary_key):
    """Apply one batch of updates to an Iceberg table as SCD2 and print the elapsed seconds.

    The open version of every updated record is closed (``end_datetime`` set to the
    update's ``extraction_timestamp``, ``is_current`` set to False); the updates are
    then appended as the new current versions. Returns nothing.

    Args:
        input_filepath: Not used by this function; kept so existing calls keep working.
        updates_filepath: Location of the parquet file holding the update batch.
        output_directory: Identifier (catalog.database.table) of the target table.
        future_end_datetime: 'yyyy-MM-dd' date string stored as ``end_datetime`` of
            the appended versions.
        primary_key: Name of the column used to match updates to existing records.
    """
    start = time.time()
    updates = (
        spark.read.option('header','true').parquet(updates_filepath)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
        .withColumn("end_datetime", F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd'))
        .withColumn("is_current", F.lit(True))
    )

    # The temp view is named after the target table (last part of its identifier),
    # so the function no longer depends on the notebook-level `table_name`.
    updates_view = f"tmp_{output_directory.split('.')[-1]}_updates"
    updates.createOrReplaceTempView(updates_view)

    # Target table and join column come from the arguments instead of notebook globals
    # and a hard-coded product_id. The `f.is_current = true` guard closes only the open
    # version, so historical versions keep their end_datetime when later batches arrive.
    query = f"""
    MERGE INTO {output_directory} AS f
    USING (SELECT * FROM {updates_view}) AS u
    ON f.{primary_key} = u.{primary_key}
    WHEN MATCHED AND f.is_current = true THEN UPDATE SET f.end_datetime = u.extraction_timestamp, f.is_current = False

    """
    spark.sql(query)
    # Append the updates as the new current versions.
    updates.writeTo(output_directory).append()
    print(time.time()-start)




In [28]:
scd2_simple(input_filepath, updates_filepath, output_directory, future_end_datetime, primary_key)

23.577928066253662


In [None]:
def scd2_complex(input_filepath, updates_filepath, output_directory, future_end_datetime, primary_key):
    # NOTE(review): as far as visible here this is a stub — it only starts a timer and
    # prints the elapsed time; none of the parameters are used yet.
    start = time.time()
    print(time.time()-start)