In [5]:
from utils import get_spark_session
import os

# Obtain the Spark session used by every cell below from the project helper.
# NOTE(review): the string argument is presumably the Spark application name,
# and the helper presumably configures the Iceberg catalog -- confirm in utils.
spark = get_spark_session("iceberg_transactions_SQL")

[Stage 0:>                                                          (0 + 2) / 2]

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)
 |-- last_purchase: timestamp (nullable = true)
 |-- last_purchase_date: string (nullable = true)



                                                                                

## 1. INSERT INTO

Insert data into a table.

# Seed the table with ten rows (ids 1-10) using a single INSERT statement.
# Each tuple supplies five positional values for the target table's columns.
seed_rows_sql = f"""
INSERT INTO {TABLE_NAME_2} VALUES
    (1, 'Alice', 30, 1000, '2021-01-01'),
    (2, 'Bob', 25, 1200, '2021-01-01'),
    (3, 'Charlie', 35, 1300, '2021-01-01'),
    (4, 'David', 40, 1500, '2021-01-01'),
    (5, 'Eve', 45, 2000, '2021-01-01'),
    (6, 'Frank', 50, 2500, '2021-01-01'),
    (7, 'Grace', 55, 3000, '2021-01-01'),
    (8, 'Helen', 60, 3500, '2021-01-01'),
    (9, 'Ivan', 65, 4000, '2021-01-01'),
    (10, 'John', 70, 4500, '2021-01-01')
"""
spark.sql(seed_rows_sql)

# Print the table contents after the insert.
spark.table(TABLE_NAME_2).show()

# Insert three more rows whose values repeat ids 1-3 from the first insert.
# INSERT INTO only appends, so the INSERT OVERWRITE section further down
# has repeated rows to collapse with SELECT DISTINCT.
repeated_rows_sql = f"""
INSERT INTO {TABLE_NAME_2} VALUES
    (1, 'Alice', 30, 1000, '2021-01-01'),
    (2, 'Bob', 25, 1200, '2021-01-01'),
    (3, 'Charlie', 35, 1300, '2021-01-01')
"""
spark.sql(repeated_rows_sql)

# Print the table contents again to inspect the result of the second insert.
spark.table(TABLE_NAME_2).show()


## 3. INSERT OVERWRITE

In [None]:
# De-duplicate the table in place: overwrite it with the distinct rows read
# from the same table.
# DISTINCT is a SELECT modifier, not a function, so it is written as
# `SELECT DISTINCT *` rather than the function-call-looking `DISTINCT(*)`.
# NOTE(review): for a partitioned table, which partitions INSERT OVERWRITE
# replaces depends on the configured overwrite mode -- confirm for this table.
spark.sql(f"""
INSERT OVERWRITE {TABLE_NAME_2}
SELECT DISTINCT * FROM {TABLE_NAME_2}
""")

# Show up to 50 rows so the whole (small) table is visible after the overwrite.
spark.table(TABLE_NAME_2).show(50)

## 2. MERGE INTO

In [None]:
from pyspark.sql.functions import col


# Source rows for the merge, in the column order listed in `columns`.
# Ids 5, 6 and 7 also appear in the rows inserted earlier; id 11 does not.
columns = ["id", "name", "age", "salary", "purchase_date"]
data = [
    (5, 'Marco', 33, 10000, '2024-08-20'),
    (6, 'Kandão', 35, 7000, '2018-08-23'),
    (7, 'Alécio', 33, 15000, '2022-08-22'),
    (11, 'Rosemberg', 33, 20000, '2024-12-11'),
]

# Salaries are written as Python ints above, so cast the column to double
# before exposing the frame to SQL as `temp_view`.
df = spark.createDataFrame(data, columns)
df = df.withColumn("salary", col("salary").cast("double"))
df.createOrReplaceTempView("temp_view")

# Upsert: a source row whose id matches a target row updates name and salary
# only when its salary exceeds 8000; a source row with no matching id is
# inserted. A matched row that fails the salary condition is left untouched.
merge_sql = f"""
MERGE INTO {TABLE_NAME_2} t   -- a target table
USING temp_view s             -- the source updates
ON t.id = s.id                -- condition to find updates for target rows
WHEN MATCHED AND s.salary > 8000 THEN UPDATE SET t.name = s.name , t.salary = s.salary
WHEN NOT MATCHED THEN INSERT * 
"""
spark.sql(merge_sql)

# Show up to 50 rows of the table after the merge.
spark.table(TABLE_NAME_2).show(50)

# Iceberg Catalogs

### What can be used as an Iceberg Catalog

Catalogs keep track of Iceberg tables and provide the locking mechanisms needed for ACID guarantees.
One thing to keep in mind: while many engines support Iceberg tables, they may not support connections to every catalog.

### Project Nessie

- Pros: Git Like functionality, Cloud Managed Service (Arctic);
- Cons: Limited support from engines beyond Spark and Dremio;

### Hive Metastore

- Pros: Can use existing Hive Metastore
- Cons: You have to deploy and maintain a hive metastore;

### AWS Glue

- Pros: Interop with AWS Services;
- Cons: Limited support outside of AWS, Spark and Dremio;


When to use each strategy for row-level updates and deletes:

- **Copy-on-write**: Daily batch jobs where write speed is less of a priority and read time is a high priority;
- **Merge-on-read (Position Deletes)**: Streaming and higher-frequency batch (hourly) jobs where write speed is very important, at a minor cost to read times. Regular compaction should be scheduled;
- **Merge-on-read (Equality Deletes)**: Very write-intensive jobs where position deletes still aren't fast enough. This comes at a much larger cost to reads, so frequent compaction jobs are necessary to keep read times manageable;

In [4]:
# Stop the Spark session at the end of the notebook; cells that use `spark`
# cannot be run after this until the session is created again.
spark.stop()