In [23]:
import os
from pyspark.sql import SparkSession, DataFrame, functions as F, types as T

In [24]:
# Workspace root: the `work` folder under the path held in the HOME environment variable.
WORK_DIR = f'{os.getenv("HOME")}/work'

# Session settings, applied to the builder in the order listed:
#   - fetch the Iceberg Spark runtime package and register its session extensions class
#   - back `spark_catalog` with Iceberg's SparkSessionCatalog (type: hive)
#   - declare a hadoop-type Iceberg catalog named `demo`, with its warehouse under
#     WORK_DIR, and make it the default catalog
iceberg_conf = {
    "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.1.0",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog",
    "spark.sql.catalog.spark_catalog.type": "hive",
    "spark.sql.catalog.demo": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.demo.type": "hadoop",
    "spark.sql.catalog.demo.warehouse": f"{WORK_DIR}/datalake/trusted/iceberg/warehouse",
    "spark.sql.defaultCatalog": "demo",
}

session_builder = SparkSession.builder.appName("Iceberg")
for conf_key, conf_value in iceberg_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

## First Time

### Read the Raw Data

In [6]:
# Data-lake directory layout, all rooted at WORK_DIR.
datalake = f'{WORK_DIR}/datalake'
raw = f'{datalake}/raw'          # source parquet files are read from here
trusted = f'{datalake}/trusted'  # NOTE(review): not referenced by any cell visible here

In [7]:
# First batch: rows of the raw parquet dataset whose year/month/day columns are 2023-02-04.
df = spark.read.parquet(f'{raw}/data').where('year = 2023 and month = 02 and day = 04')

In [8]:
# Display the batch as a pandas DataFrame.
# NOTE(review): toPandas() collects every row to the driver; for a preview,
# df.limit(n).toPandas() would be cheaper.
df.toPandas()

Unnamed: 0,created,id,updated,value,year,month,day
0,2023/02/03,0,2023/02/04,morango,2023,2,4
1,2023/02/03,2,2023/02/04,morango,2023,2,4
2,2023/02/03,3,2023/02/04,limão,2023,2,4
3,2023/02/03,4,2023/02/04,banana,2023,2,4
4,2023/02/03,5,2023/02/04,morango,2023,2,4
...,...,...,...,...,...,...,...
3148,2023/02/03,4990,2023/02/04,abacaxi,2023,2,4
3149,2023/02/03,4991,2023/02/04,maça,2023,2,4
3150,2023/02/03,4992,2023/02/04,banana,2023,2,4
3151,2023/02/03,4993,2023/02/04,limão,2023,2,4


### Write

In [9]:
# Create the Iceberg table `demo.iceberg.fruit` from the first batch.
# create() raises if the table already exists, so this cell is for the first run only.
df.writeTo("demo.iceberg.fruit").create()

## Upsert 

In [10]:
# Second batch: rows whose year/month/day columns are 2023-02-05.
# Note that this rebinds `df`, replacing the 2023-02-04 batch loaded earlier.
df = spark.read.parquet(f"{raw}/data").where("year = 2023 and month = 02 and day = 05")

In [11]:
# Display the second batch as a pandas DataFrame.
# NOTE(review): toPandas() collects every row to the driver; for a preview,
# df.limit(n).toPandas() would be cheaper.
df.toPandas()

Unnamed: 0,created,id,updated,value,year,month,day
0,2023/02/03,0,2023/02/05,banana,2023,2,5
1,2023/02/03,1,2023/02/05,morango,2023,2,5
2,2023/02/03,8,2023/02/05,limão,2023,2,5
3,2023/02/03,9,2023/02/05,maça,2023,2,5
4,2023/02/03,10,2023/02/05,maça,2023,2,5
...,...,...,...,...,...,...,...
3185,2023/02/03,4994,2023/02/05,abacaxi,2023,2,5
3186,2023/02/03,4996,2023/02/05,maça,2023,2,5
3187,2023/02/03,4997,2023/02/05,banana,2023,2,5
3188,2023/02/03,4998,2023/02/05,morango,2023,2,5


In [36]:
# Expose the second batch to Spark SQL as the temp view `fruit_temp`,
# which the MERGE statement uses as its source.
df.createOrReplaceTempView('fruit_temp')

In [41]:
# MERGE statement: rows from the `fruit_temp` view are matched against the
# Iceberg table on `id`, and rows with no match are inserted.
# NOTE(review): there is no WHEN MATCHED clause, so rows whose id already exists
# in the table keep their old values -- this is insert-only, not a full upsert.
# Presumably kept insert-only so every snapshot stays an `append`, which the
# incremental read later in the notebook appears to rely on -- TODO confirm.
script_sql = """
MERGE INTO demo.iceberg.fruit t USING (SELECT * FROM fruit_temp) u ON t.id = u.id
WHEN NOT MATCHED THEN INSERT *
"""

In [42]:
# Execute the MERGE; the statement returns an empty DataFrame.
spark.sql(script_sql)

DataFrame[]

In [52]:
# Read the full contents of the Iceberg table after the merge.
df_destination = spark.sql('select * from demo.iceberg.fruit')

In [53]:
# Display the merged table as a pandas DataFrame.
# NOTE(review): toPandas() collects the whole table to the driver; for a preview,
# df_destination.limit(n).toPandas() would be cheaper.
df_destination.toPandas()

Unnamed: 0,created,id,updated,value,year,month,day
0,2023/02/03,0,2023/02/04,morango,2023,2,4
1,2023/02/03,2,2023/02/04,morango,2023,2,4
2,2023/02/03,3,2023/02/04,limão,2023,2,4
3,2023/02/03,4,2023/02/04,banana,2023,2,4
4,2023/02/03,5,2023/02/04,morango,2023,2,4
...,...,...,...,...,...,...,...
4308,2023/02/03,4994,2023/02/05,abacaxi,2023,2,5
4309,2023/02/03,4996,2023/02/05,maça,2023,2,5
4310,2023/02/03,4997,2023/02/05,banana,2023,2,5
4311,2023/02/03,4998,2023/02/05,morango,2023,2,5


## Comparing Results

In [54]:
# Row count per distinct `updated` value, i.e. how many rows each batch date contributed.
df_destination.groupby('updated').count().toPandas()

Unnamed: 0,updated,count
0,2023/02/04,3153
1,2023/02/05,1160


## Incremental Data

In [55]:
# Query the table's `snapshots` metadata table (one row per committed snapshot).
dfsnap = spark.sql('select * from demo.iceberg.fruit.snapshots')

In [58]:
# Print the snapshot log; show() truncates long cell values.
dfsnap.show()

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2023-02-07 12:51:...|9189953912558619203|               null|   append|/home/jovyan/work...|{spark.app.id -> ...|
|2023-02-07 13:57:...|8833439022224482470|9189953912558619203|   append|/home/jovyan/work...|{spark.app.id -> ...|
|2023-02-07 13:58:...|3492994344799829888|8833439022224482470|   append|/home/jovyan/work...|{spark.app.id -> ...|
|2023-02-07 13:59:...|5263910916207477539|3492994344799829888|   append|/home/jovyan/work...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+



In [59]:
# Baseline snapshot for the incremental read: the earliest commit, i.e. the
# snapshot written by the initial table load.
#
# Snapshot ids are not chronological (in the snapshot listing printed by the
# previous cell the largest id belongs to the oldest commit), so
# max(snapshot_id) only landed on the initial load by coincidence. Ordering by
# `committed_at` selects that same snapshot deterministically.
#
# `start-snapshot-id` is exclusive, so reading from this snapshot returns the
# rows appended after the initial load.
version = (
    dfsnap
    .orderBy(F.col('committed_at').asc())
    .select('snapshot_id')
    .first()[0]
)

In [60]:
# Incremental scan of the Iceberg table, starting from the snapshot id held in `version`.
df_incremental = (
    spark.read
    .format("iceberg")
    .option("start-snapshot-id", version)
    .load("demo.iceberg.fruit")
)

In [61]:
# Display the incrementally-read rows as a pandas DataFrame.
# NOTE(review): toPandas() collects every row to the driver; for a preview,
# df_incremental.limit(n).toPandas() would be cheaper.
df_incremental.toPandas()

Unnamed: 0,created,id,updated,value,year,month,day
0,2023/02/03,1,2023/02/05,morango,2023,2,5
1,2023/02/03,10,2023/02/05,maça,2023,2,5
2,2023/02/03,13,2023/02/05,abacaxi,2023,2,5
3,2023/02/03,14,2023/02/05,maça,2023,2,5
4,2023/02/03,16,2023/02/05,abacaxi,2023,2,5
...,...,...,...,...,...,...,...
1155,2023/02/03,4994,2023/02/05,abacaxi,2023,2,5
1156,2023/02/03,4996,2023/02/05,maça,2023,2,5
1157,2023/02/03,4997,2023/02/05,banana,2023,2,5
1158,2023/02/03,4998,2023/02/05,morango,2023,2,5
