# Athena Iceberg - Data Eng Use Cases

In [None]:
import awswrangler as wr
import pydbtools as pydb
import py_aws_vault_auth
import os

In [None]:
# Experiment-wide settings: the comparison being run, the S3 bucket/prefix it
# writes under, and the Athena database its tables are created in.
comparison = "athena_iceberg"
region = "eu-west-1"
bucketname = "sb-test-bucket-ireland"
db_name = "wto_hudi_iceberg"
s3_root_folder = "wo/de_use_cases"

# Root prefix for this comparison. Note: no trailing slash.
s3_base_path = f"s3://{bucketname}/{s3_root_folder}/{comparison}"
# Because s3_base_path has no trailing slash, the separator is added here.
# Without it the prefix collapsed to ".../athena_icebergdatabase/".
# NOTE(review): tables already created under the old slash-less prefix keep
# their original location; only newly created tables use the corrected path.
db_base_path = f"{s3_base_path}/database/"

# Authenticate against the "sso-sandbox" profile and copy the returned
# mapping into this process's environment variables.
# NOTE(review): presumably these are the AWS credentials awswrangler picks
# up for the Athena calls below - confirm against py_aws_vault_auth's docs.
environ_auth = py_aws_vault_auth.authenticate(
    "sso-sandbox",
    prompt="python",
    return_as="environ",
)
os.environ.update(environ_auth)

## Bulk insert and add curation columns 

Set up variables for bulk insert test

In [None]:
# Names and source locations for the bulk-insert scenario.
# NOTE: the spelling `senario` is kept because other cells reference it.
senario = "bulk_insert"
# Plain string literals: these paths contain no placeholders, so the `f`
# prefix was redundant (values are unchanged).
source_fl = "s3://sb-test-bucket-ireland/dummy_data/full_load/"
source_ud = "s3://sb-test-bucket-ireland/dummy_data/updates/"
# Table names are derived from the comparison and scenario names.
temp_table_name = f"{comparison}_{senario}_temp"
dest_table_name = f"{comparison}_{senario}_iceberg"

Create a temporary table from source

In [None]:
# DDL for an external table over the Parquet files at `source_fl`.
temp_table_sql_1 = f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{temp_table_name} (
        product_id string,
        product_name string,
        price int,
        extraction_timestamp timestamp,
        op string
    )
    STORED AS PARQUET
    LOCATION '{source_fl}'
"""
# Previously the DDL above was only defined, never executed (the call was
# commented out and referred to a different variable name), so the
# "check that the table is populated" cell failed on a clean run.
# Drop any leftover definition first: the same table name is later re-pointed
# at the update files, and IF NOT EXISTS alone would silently keep that
# stale location.
wr.athena.read_sql_query(f"DROP TABLE IF EXISTS {temp_table_name}", database=db_name, ctas_approach=False)
wr.athena.read_sql_query(sql=temp_table_sql_1, database=db_name, ctas_approach=False)


Check that the table is populated

In [None]:
# Sanity check: read every row of the temporary table back so the notebook
# displays the result.
wr.athena.read_sql_query(
    sql=f"SELECT * FROM {temp_table_name}",
    database=db_name,
    ctas_approach=False,
)

Create an iceberg table from source table

In [None]:
# CTAS statement: build the Iceberg destination table from the temporary
# table, casting extraction_timestamp to timestamp(6) on the way in.
dest_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {db_name}.{dest_table_name}
        WITH (table_type='ICEBERG',
        location='{db_base_path}{senario}/',
        format='PARQUET',
        is_external=false)
        AS SELECT
            product_id,
            product_name,
            price,
            CAST(extraction_timestamp AS timestamp(6)) AS extraction_timestamp,
            op 
           FROM {db_name}.{temp_table_name};
"""

# Run the CTAS in the 'Athena3' workgroup.
wr.athena.read_sql_query(
    sql=dest_table_sql,
    database=db_name,
    ctas_approach=False,
    workgroup='Athena3',
)

# Show the freshly created table (same workgroup).
wr.athena.read_sql_query(
    f"SELECT * FROM {dest_table_name}",
    database=db_name,
    ctas_approach=False,
    workgroup='Athena3',
)
# Optional clean-up of the temporary table, left disabled:
# wr.athena.delete_table(database=db_name, table=temp_table_name)

## 13 sec

Update destination iceberg table with new columns

In [None]:
# Step 1: add the three curation columns to the Iceberg table.
add_columns_sql = f"""
    ALTER TABLE {db_name}.{dest_table_name}
    ADD COLUMNS (start_datetime TIMESTAMP, end_datetime TIMESTAMP, is_current BOOLEAN)
"""
# Step 2: initialise them for every existing row - each record starts at its
# extraction timestamp, is open-ended (far-future end date) and is current.
update_values_sql = f"""
    UPDATE {db_name}.{dest_table_name}
    SET start_datetime = extraction_timestamp, 
        end_datetime = CAST(TIMESTAMP '2250-01-01' as TIMESTAMP(6)), 
        is_current = true
"""
# Previously neither statement was executed, yet the cell still printed
# "Updated values"; the later MERGE / INSERT cells rely on these columns.
# NOTE(review): this cell is meant to run once per freshly created table.
# ALTER TABLE is expected to fail if the columns already exist, and the
# UPDATE has no WHERE clause, so re-running it after the MERGE would mark
# closed records as current again.
wr.athena.read_sql_query(sql=add_columns_sql, database=db_name, ctas_approach=False, workgroup='Athena3')
wr.athena.read_sql_query(sql=update_values_sql, database=db_name, ctas_approach=False, workgroup='Athena3')
print("Updated values")
wr.athena.read_sql_query(f"SELECT * FROM {dest_table_name}", database=db_name, ctas_approach=False, workgroup='Athena3')
# Optional clean-up of the temporary table, left disabled:
# wr.athena.delete_table(database=db_name, table=temp_table_name)

Delete and recreate the temporary table from the update file

In [None]:
# Re-point the temporary table at the update files: drop the existing
# definition, then create it again over `source_ud`.
wr.athena.read_sql_query(
    f"DROP TABLE IF EXISTS {temp_table_name}",
    database=db_name,
    ctas_approach=False,
)

# Same column layout as the full-load table; only the LOCATION differs.
temp_table_sql = f"""
    CREATE EXTERNAL TABLE {db_name}.{temp_table_name} (
        product_id string,
        product_name string,
        price int,
        extraction_timestamp timestamp,
        op string
    )
    STORED AS PARQUET
    LOCATION '{source_ud}'
"""
wr.athena.read_sql_query(
    sql=temp_table_sql,
    database=db_name,
    ctas_approach=False,
)

# Display the update rows now visible through the temporary table.
wr.athena.read_sql_query(
    f"SELECT * FROM {temp_table_name}",
    database=db_name,
    ctas_approach=False,
)


Update the destination table where the key is in the source (CDC / update) table

In [None]:
# Close the currently open destination record for every product_id that has
# a newer row in the source table: its end_datetime becomes the source row's
# extraction_timestamp and is_current is set to FALSE.
update_dest_sql = f"""
    MERGE INTO {db_name}.{dest_table_name} dest
        USING {db_name}.{temp_table_name} sour
            ON sour.product_id = dest.product_id
    WHEN MATCHED AND dest.is_current = TRUE AND sour.extraction_timestamp > dest.extraction_timestamp
        THEN UPDATE
            SET end_datetime = sour.extraction_timestamp, is_current = FALSE;
"""
wr.athena.read_sql_query(
    sql=update_dest_sql,
    database=db_name,
    ctas_approach=False,
)

# Display the destination table after the merge.
wr.athena.read_sql_query(
    f"SELECT * FROM {dest_table_name}",
    database=db_name,
    ctas_approach=False,
)

Insert all updates from source table

In [None]:
# Append every row of the source (update) table to the Iceberg table as a
# new open record: start_datetime = extraction_timestamp, far-future
# end_datetime, is_current = TRUE. Columns are matched by position, so the
# SELECT order must follow the destination table's column order.
insert_dest_sql = f"""
INSERT INTO {db_name}.{dest_table_name}
    SELECT product_id, product_name, price, CAST(extraction_timestamp AS TIMESTAMP(6)), op, 
      CAST(extraction_timestamp AS TIMESTAMP(6)), CAST(TIMESTAMP '2250-01-01' as TIMESTAMP(6)),TRUE
    FROM {db_name}.{temp_table_name}
"""
# Execute the INSERT defined above. The original call passed
# `update_dest_sql`, which re-ran the MERGE and never inserted anything.
wr.athena.read_sql_query(sql=insert_dest_sql, database=db_name, ctas_approach=False)
wr.athena.read_sql_query(f"SELECT * FROM {dest_table_name}", database=db_name, ctas_approach=False)

In [None]:
# Clean-up: remove the temporary external table.
# The original cell first re-ran the CREATE EXTERNAL TABLE statement, which
# has no IF NOT EXISTS; when the table already exists that query fails and
# the DROP is never reached. Dropping with IF EXISTS is all that is needed
# and keeps the cell safe to re-run.
wr.athena.read_sql_query(f"DROP TABLE IF EXISTS {temp_table_name}", database=db_name, ctas_approach=False)


## Summary of changes

**SETUP**
1. Create a temp table from FL
2. Create iceberg full load via CTAS, adding mojap fields
3. Create CDC temp table 
4. Create a CDC view adding mojap fields (didn't actually do this last night, as a plain insert was quick enough for Sou's criteria)

**PROCESSING**
1. Use merge to close is_current records in iceberg that exist in CDC (there is an issue with the closing date if there are multiple CDC updates)
2. Insert CDC into iceberg

**NEXT STEPS**
1. Run processing as a single step
2. Update the CDC insert to use a view
3. Close multiple CDC updates with previous date

