## Glue + Iceberg evaluation

###### Bulk Insert 
###### SCD2
###### Impute deletions
###### Deduplication


In [5]:
%session_id_prefix native-iceberg-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.3 
Setting session ID prefix to native-iceberg-dataframe-
Setting Glue version to: 3.0
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg'}


In [22]:
# Glue Data Catalog coordinates of the Iceberg table used throughout the notebook.
catalog_name, database_name, table_name = (
    "glue_catalog",
    "sb13_iceberg_dataframe",
    "datagensb",
)

# S3 location that serves as the Iceberg warehouse root.
bucket_name, bucket_prefix = "sb-test-bucket-ireland", "sb"
warehouse_path = f"s3://{bucket_name}/{bucket_prefix}"




In [23]:
from pyspark.sql import SparkSession

# Spark settings that register `catalog_name` as an Iceberg catalog backed by
# the Glue Data Catalog (GlueCatalog) with S3FileIO for storage, plus the
# Iceberg SQL extensions. Insertion order is the order they are applied in.
iceberg_settings = {
    f"spark.sql.catalog.{catalog_name}": "org.apache.iceberg.spark.SparkCatalog",
    f"spark.sql.catalog.{catalog_name}.warehouse": f"{warehouse_path}",
    f"spark.sql.catalog.{catalog_name}.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog",
    f"spark.sql.catalog.{catalog_name}.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
}

session_builder = SparkSession.builder
for setting_key, setting_value in iceberg_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)
spark = session_builder.getOrCreate()




In [24]:
# Make sure the target database exists in the Iceberg catalog before any table is written.
create_database_sql = f"""
CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}
"""
spark.sql(create_database_sql)

DataFrame[]


In [25]:
# --- Source Parquet files on S3 ---
# Initial extract loaded by bulk_insert.
input_filepath = "s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/full_load.parquet"
# Update batch consumed by scd2_simple.
updates_filepath ="s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/updates.parquet"
# Late-arriving update batch consumed by scd2_complex.
late_updates_filepath="s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/late_updates.parquet"

# --- Target table and SCD2 settings ---
# Fully qualified Iceberg table identifier (catalog.database.table).
output_directory = f"{catalog_name}.{database_name}.{table_name}"
# Column that identifies one business entity across its versions.
primary_key = "product_id"
# Date ('yyyy-MM-dd') written as end_datetime on rows that are still open.
future_end_datetime = "2050-01-01"




Functions

In [26]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, Row
import time


def bulk_insert(input_filepath,output_directory,future_end_datetime):
    """Create the SCD2 table from an initial Parquet extract.

    Every input row is written as an open version: ``start_datetime`` is
    copied from ``extraction_timestamp``, ``end_datetime`` is set to
    ``future_end_datetime``, ``op`` is set to the string ``"None"`` and
    ``is_current`` to ``True``.

    Args:
        input_filepath: Path of the Parquet data to load.
        output_directory: Table identifier handed to ``writeTo``; the table is
            written with ``create()``.
        future_end_datetime: Date string in ``yyyy-MM-dd`` format.

    Returns:
        None. The elapsed wall-clock time in seconds is printed.
    """
    started_at = time.time()

    # Sentinel timestamp marking a version that has not been superseded yet.
    open_ended = F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd')

    initial_rows = (
        spark.read.option('header','true').parquet(input_filepath)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
        .withColumn("end_datetime", open_ended)
        .withColumn("op", F.lit("None"))
        .withColumn("is_current", F.lit(True))
    )

    # Sort inside each partition by product name before the table is created.
    table_writer = initial_rows.sortWithinPartitions("product_name").writeTo(output_directory)
    table_writer.create()

    print(time.time()-started_at)




In [27]:
# Create the table from the initial full load; the printed number is the elapsed seconds.
bulk_insert(
    input_filepath=input_filepath,
    output_directory=output_directory,
    future_end_datetime=future_end_datetime,
)

3.3868606090545654


In [28]:
# Inspect the table right after the bulk insert.
table_after_bulk_insert = spark.table(f"{catalog_name}.{database_name}.{table_name}")
table_after_bulk_insert.show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00004|     Blender|  100| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00001|      Heater|  250| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00003|  Television|  600| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00005| USB charger|   50| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2050-01-01 00:00:00|      true|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+


In [7]:
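# NOTE: both values are already assigned in the configuration cell above; kept here for reference only.
# updates_filepath ="s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/updates.parquet"
# primary_key = "product_id"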

In [29]:
def scd2_simple(input_filepath, updates_filepath, output_directory, future_end_datetime, primary_key):
    """Apply one batch of updates to the SCD2 table.

    For every key present in the batch, the currently open row in the target
    table is closed (``end_datetime`` is set to the update's
    ``extraction_timestamp`` and ``is_current`` to ``False``). The batch is
    then appended as the new open rows.

    Args:
        input_filepath: Unused; kept so existing callers keep working.
        updates_filepath: Path of the Parquet data holding the updates.
        output_directory: Fully qualified identifier of the target table. The
            last dot-separated component is used to name the temporary view.
        future_end_datetime: Date string in ``yyyy-MM-dd`` format written as
            ``end_datetime`` on the appended rows.
        primary_key: Name of the column used to match updates to table rows.

    Returns:
        None. The elapsed wall-clock time in seconds is printed.
    """
    start = time.time()

    full_load_updates = (
        spark.read.option('header','true').parquet(updates_filepath)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
        .withColumn("end_datetime", F.to_timestamp(F.lit(future_end_datetime), 'yyyy-MM-dd'))
        .withColumn("is_current", F.lit(True))
    )

    # Derive the view name from the target identifier instead of a notebook
    # global so the function honours its `output_directory` argument.
    updates_view = f"tmp_{output_directory.split('.')[-1]}_updates"
    full_load_updates.createOrReplaceTempView(updates_view)

    # Only the open version of a key may be closed. Without the
    # `is_current` predicate, rows closed by an earlier batch would have
    # their end_datetime overwritten on every subsequent run.
    # NOTE(review): assumes at most one update per key in the batch — confirm
    # against the data; several source rows matching one target row is
    # presumably rejected by MERGE.
    query = f"""
    MERGE INTO {output_directory} AS f
    USING (SELECT * FROM {updates_view}) AS u
    ON f.{primary_key} = u.{primary_key} AND f.is_current = true
    WHEN MATCHED THEN UPDATE SET f.end_datetime = u.extraction_timestamp, f.is_current = False

    """
    spark.sql(query)

    # Add the updates themselves as the new open versions.
    full_load_updates.writeTo(output_directory).append()
    print(time.time()-start)




In [30]:
# Close the superseded versions and append the update batch; prints the elapsed seconds.
scd2_simple(
    input_filepath=input_filepath,
    updates_filepath=updates_filepath,
    output_directory=output_directory,
    future_end_datetime=future_end_datetime,
    primary_key=primary_key,
)

11.610728025436401


In [31]:
# Inspect the table after the simple SCD2 update.
table_after_scd2_simple = spark.table(f"{catalog_name}.{database_name}.{table_name}")
table_after_scd2_simple.show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00004|     Blender|  100| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      Heater|  250| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00003|  Television|  600| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00005| USB charger|   50| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      Heater| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Therm

In [None]:
# NOTE: these values are already assigned in the configuration cell above; kept here for reference only.
# input_filepath = "s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/full_load.parquet"
# output_directory = f"{catalog_name}.{database_name}.{table_name}"
# future_end_datetime = "2050-01-01"
# late_updates_filepath="s3://sb-test-bucket-ireland/data-engineering-use-cases/dummy-data/late_updates.parquet"

In [32]:
# Remove the derived SCD2 columns (end_datetime, is_current) and overwrite the
# table with the result.
table_identifier = f"{catalog_name}.{database_name}.{table_name}"
without_scd2_columns = spark.table(table_identifier).drop("end_datetime","is_current")
without_scd2_columns.writeTo(table_identifier).createOrReplace()




In [33]:
def scd2_complex(input_filepath, late_updates_filepath, output_directory, primary_key,
                 future_end_datetime="2050-01-01 00:00:00"):
    """Merge a batch of (possibly late-arriving) updates and rebuild the SCD2 columns.

    The batch is appended to the table, after which ``end_datetime`` and
    ``is_current`` are recomputed for every row: within each ``primary_key``
    the rows are ordered by ``extraction_timestamp``, ``end_datetime`` becomes
    the next row's ``extraction_timestamp`` (or ``future_end_datetime`` for
    the last row), and ``is_current`` is true exactly where ``end_datetime``
    equals ``future_end_datetime``.

    Args:
        input_filepath: Unused; kept so existing callers keep working.
        late_updates_filepath: Path of the Parquet data holding the updates.
        output_directory: Fully qualified identifier of the target table.
        primary_key: Name of the column that groups the versions of one entity.
        future_end_datetime: Timestamp string used as ``end_datetime`` of the
            open version of each key. Defaults to the value that was
            previously hard-coded in the queries.

    Returns:
        None. The elapsed wall-clock time in seconds is printed.
    """
    start = time.time()

    late_updates = (
        spark.read.option('header','true').parquet(late_updates_filepath)
        .withColumn("start_datetime", F.col("extraction_timestamp"))
    )

    # Drop the derived columns BEFORE appending: the batch carries neither
    # end_datetime nor is_current, so the append should no longer depend on a
    # separate cleanup cell having stripped them from the table beforehand.
    spark.table(output_directory).drop("end_datetime","is_current").writeTo(output_directory).createOrReplace()
    late_updates.writeTo(output_directory).append()

    # Recompute both derived columns in a single pass and a single table
    # rewrite. The CASE expression is kept so that a NULL end_datetime yields
    # False rather than NULL.
    query = f"""
    SELECT *,
    CASE WHEN end_datetime = TO_TIMESTAMP('{future_end_datetime}') THEN True ELSE False END AS is_current

    FROM (
        SELECT *,
        LEAD(extraction_timestamp,1,TO_TIMESTAMP('{future_end_datetime}')) OVER(PARTITION BY {primary_key} ORDER BY extraction_timestamp) AS end_datetime

        FROM {output_directory}
    ) AS versions

    ORDER BY {primary_key}, extraction_timestamp
    """
    spark.sql(query).writeTo(output_directory).createOrReplace()
    print(time.time()-start)




In [34]:
# Append the late-arriving updates and rebuild end_datetime / is_current; prints the elapsed seconds.
scd2_complex(
    input_filepath=input_filepath,
    late_updates_filepath=late_updates_filepath,
    output_directory=output_directory,
    primary_key=primary_key,
)

7.640847444534302


In [35]:
# Inspect the table after the late-arriving updates were merged in.
table_after_scd2_complex = spark.table(f"{catalog_name}.{database_name}.{table_name}")
table_after_scd2_complex.show()

+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|product_id|product_name|price|extraction_timestamp|  op|     start_datetime|       end_datetime|is_current|
+----------+------------+-----+--------------------+----+-------------------+-------------------+----------+
|     00001|      Heater|  250| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2022-06-01 01:01:01|     false|
|     00001|      Heater|  500| 2022-06-01 01:01:01|   U|2022-06-01 01:01:01|2023-01-01 01:01:01|     false|
|     00001|      Heater| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00002|  Thermostat|  400| 2022-01-01 01:01:01|None|2022-01-01 01:01:01|2022-06-01 01:01:01|     false|
|     00002|  Thermostat|  500| 2022-06-01 01:01:01|   U|2022-06-01 01:01:01|2023-01-01 01:01:01|     false|
|     00002|  Thermostat| 1000| 2023-01-01 01:01:01|   U|2023-01-01 01:01:01|2050-01-01 00:00:00|      true|
|     00003|  Telev