In [1]:
import os
from pyspark.sql import SparkSession, DataFrame, functions as F

In [2]:
# Spark settings required to work with Hudi tables: the Hudi Spark bundle jar,
# Kryo serialization, and Hudi's catalog / SQL session extension.
spark_conf = {
    "spark.jars.packages": "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
}

# Apply every setting to the builder, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("Hudi-App")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

# Working directory under the current user's home directory.
# os.path.expanduser("~") is used instead of os.getenv("HOME") because getenv
# returns None when HOME is unset, which would silently produce "None/work".
WORK_DIR = f'{os.path.expanduser("~")}/work'

## First Time

### Read the Raw Data

In [3]:
# Data lake layout: a `raw` zone (source parquet files) and a `trusted` zone
# (where the Hudi table is written), both under WORK_DIR/datalake.
datalake = WORK_DIR + '/datalake'
raw = datalake + '/raw'
trusted = datalake + '/trusted'

# Settings fed into the Hudi writer options further down.
table_name = 'hudi'          # Hudi table name
operation = 'upsert'         # Hudi write operation
id_col = 'id'                # record key field
partition_col = 'created'    # partition path field
order_col = 'updated'        # precombine field
parallelism = 2              # shuffle parallelism for upsert and insert

In [4]:
# Load the raw parquet data and keep only the rows for 2023-02-04 (first load).
df = (
    spark.read
    .parquet(f'{raw}/data')
    .where('year = 2023 and month = 02 and day = 04')
)

In [5]:
# Convert the filtered Spark DataFrame to pandas so the notebook renders it as a table.
# NOTE(review): toPandas() collects every row to the driver; acceptable for this
# small sample, but consider df.limit(n).toPandas() for larger inputs.
df.toPandas()

Unnamed: 0,created,id,updated,value,year,month,day
0,2023/02/03,0,2023/02/04,morango,2023,2,4
1,2023/02/03,2,2023/02/04,morango,2023,2,4
2,2023/02/03,3,2023/02/04,limão,2023,2,4
3,2023/02/03,4,2023/02/04,banana,2023,2,4
4,2023/02/03,5,2023/02/04,morango,2023,2,4
...,...,...,...,...,...,...,...
3148,2023/02/03,4990,2023/02/04,abacaxi,2023,2,4
3149,2023/02/03,4991,2023/02/04,maça,2023,2,4
3150,2023/02/03,4992,2023/02/04,banana,2023,2,4
3151,2023/02/03,4993,2023/02/04,limão,2023,2,4


The output above shows a sample of the data and the number of rows that will be inserted into the destination table.

### Write

In [6]:
# Hudi writer configuration, assembled from the settings defined earlier:
# table identity, write operation, key / partition / precombine fields, and
# the shuffle parallelism used for both upserts and inserts.
hudi_options = dict([
    ('hoodie.table.name', table_name),
    ('hoodie.datasource.write.table.name', table_name),
    ('hoodie.datasource.write.operation', operation),
    ('hoodie.datasource.write.recordkey.field', id_col),
    ('hoodie.datasource.write.partitionpath.field', partition_col),
    ('hoodie.datasource.write.precombine.field', order_col),
    ('hoodie.upsert.shuffle.parallelism', parallelism),
    ('hoodie.insert.shuffle.parallelism', parallelism),
])

# First write: "overwrite" mode replaces whatever already exists at the target path.
(
    df.write
    .format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save(f'{trusted}/hudi')
)

## Upsert 

In [7]:
# Load the raw parquet data again, this time keeping the rows for 2023-02-05.
# Note: this rebinds `df`, replacing the 2023-02-04 frame loaded earlier.
df = (
    spark.read
    .parquet(f'{raw}/data')
    .where('year = 2023 and month = 02 and day = 05')
)

In [9]:
# Render the 2023-02-05 batch as a pandas table for inspection.
# NOTE(review): toPandas() collects every row to the driver; acceptable for this
# small sample, but consider df.limit(n).toPandas() for larger inputs.
df.toPandas()

Unnamed: 0,created,id,updated,value,year,month,day
0,2023/02/03,0,2023/02/05,banana,2023,2,5
1,2023/02/03,1,2023/02/05,morango,2023,2,5
2,2023/02/03,8,2023/02/05,limão,2023,2,5
3,2023/02/03,9,2023/02/05,maça,2023,2,5
4,2023/02/03,10,2023/02/05,maça,2023,2,5
...,...,...,...,...,...,...,...
3185,2023/02/03,4994,2023/02/05,abacaxi,2023,2,5
3186,2023/02/03,4996,2023/02/05,maça,2023,2,5
3187,2023/02/03,4997,2023/02/05,banana,2023,2,5
3188,2023/02/03,4998,2023/02/05,morango,2023,2,5


In [8]:
# Write the new batch into the existing Hudi table. "append" mode combined with
# the 'upsert' operation set in hudi_options updates or inserts rows by record key.
(
    df.write
    .format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(f'{trusted}/hudi')
)

## Read Trusted with Hudi

In [11]:
# Read the current state of the Hudi table from the trusted zone.
df_destination = (
    spark.read
    .format('hudi')
    .load(f'{trusted}/hudi')
)

In [12]:
# Render the Hudi table contents, including the _hoodie_* metadata columns.
# NOTE(review): toPandas() collects the whole table to the driver; consider
# df_destination.limit(n).toPandas() once the table grows.
df_destination.toPandas()

Unnamed: 0,_hoodie_commit_time,_hoodie_commit_seqno,_hoodie_record_key,_hoodie_partition_path,_hoodie_file_name,id,updated,value,year,month,day,created
0,20230206164949662,20230206164949662_0_0,2574,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2574,2023/02/04,abacaxi,2023,2,4,2023/02/03
1,20230206164949662,20230206164949662_0_1,959,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,959,2023/02/04,morango,2023,2,4,2023/02/03
2,20230206164949662,20230206164949662_0_2,1404,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,1404,2023/02/04,limão,2023,2,4,2023/02/03
3,20230206165632932,20230206165632932_0_3,346,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,346,2023/02/05,banana,2023,2,5,2023/02/03
4,20230206165632932,20230206165632932_0_4,665,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,665,2023/02/05,maça,2023,2,5,2023/02/03
...,...,...,...,...,...,...,...,...,...,...,...,...
4308,20230206165632932,20230206165632932_0_4308,4948,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,4948,2023/02/05,maça,2023,2,5,2023/02/03
4309,20230206165632932,20230206165632932_0_4309,4946,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,4946,2023/02/05,maça,2023,2,5,2023/02/03
4310,20230206165632932,20230206165632932_0_4310,4952,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,4952,2023/02/05,morango,2023,2,5,2023/02/03
4311,20230206165632932,20230206165632932_0_4311,4956,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,4956,2023/02/05,banana,2023,2,5,2023/02/03


## Comparing Results

In [13]:
# Number of rows per `updated` value in the Hudi table.
(
    df_destination
    .groupBy('updated')
    .count()
    .toPandas()
)

Unnamed: 0,updated,count
0,2023/02/04,1123
1,2023/02/05,3190


In [15]:
# Number of rows per Hudi commit time (one group per write to the table).
(
    df_destination
    .groupBy('_hoodie_commit_time')
    .count()
    .toPandas()
)

Unnamed: 0,_hoodie_commit_time,count
0,20230206164949662,1123
1,20230206165632932,3190


## Incremental Data

In [21]:
# Register the Hudi snapshot as a temporary view so it can be queried through
# spark.sql; the commit-time query that follows refers to this view by name.
df_destination.createOrReplaceTempView("hudi_trips_snapshot")

In [22]:
# Fetch the distinct commit times of the table, newest first, capped at 50.
# Sorting in descending order BEFORE applying the limit keeps the most recent
# commits; an ascending sort followed by limit(50) would keep the 50 oldest and
# make the "second-to-last" pick wrong once the table has more than 50 commits.
recent_commit_rows = spark.sql(
    "select distinct(_hoodie_commit_time) as commitTime "
    "from hudi_trips_snapshot order by commitTime desc"
).limit(50).collect()

# Commit times in ascending order (oldest -> newest).
commits = [row[0] for row in reversed(recent_commit_rows)]

# Start the incremental read at the second-to-last commit so only records from
# the latest commit are returned. With fewer than two commits there is no
# previous commit to start from; fall back to "000", which the Hudi quick-start
# guide uses as the begin time meaning "all commits after this".
beginTime = commits[-2] if len(commits) >= 2 else "000"

In [23]:
# Display the commit time that the incremental read will start from.
beginTime

'20230206164949662'

In [24]:
# Reader options for a Hudi incremental query: switch the query type to
# 'incremental' and start reading from the commit time held in beginTime.
incremental_read_options = dict([
    ('hoodie.datasource.query.type', 'incremental'),
    ('hoodie.datasource.read.begin.instanttime', beginTime),
])

In [25]:
# Incremental read of the Hudi table using the options built above.
tripsIncrementalDF = (
    spark.read
    .format("hudi")
    .options(**incremental_read_options)
    .load(f'{trusted}/hudi')
)

In [27]:
# Render the rows returned by the incremental read.
# NOTE(review): toPandas() collects every row to the driver; consider
# tripsIncrementalDF.limit(n).toPandas() for larger increments.
tripsIncrementalDF.toPandas()

Unnamed: 0,_hoodie_commit_time,_hoodie_commit_seqno,_hoodie_record_key,_hoodie_partition_path,_hoodie_file_name,created,id,updated,value,year,month,day
0,20230206165632932,20230206165632932_0_3,346,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,346,2023/02/05,banana,2023,2,5
1,20230206165632932,20230206165632932_0_4,665,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,665,2023/02/05,maça,2023,2,5
2,20230206165632932,20230206165632932_0_5,2237,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,2237,2023/02/05,maça,2023,2,5
3,20230206165632932,20230206165632932_0_7,2645,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,2645,2023/02/05,maça,2023,2,5
4,20230206165632932,20230206165632932_0_9,935,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,935,2023/02/05,morango,2023,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...
3185,20230206165632932,20230206165632932_0_4308,4948,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,4948,2023/02/05,maça,2023,2,5
3186,20230206165632932,20230206165632932_0_4309,4946,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,4946,2023/02/05,maça,2023,2,5
3187,20230206165632932,20230206165632932_0_4310,4952,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,4952,2023/02/05,morango,2023,2,5
3188,20230206165632932,20230206165632932_0_4311,4956,2023/02/03,61f2d27b-dcf8-4f36-8270-541e5bae409b-0_0-62-47...,2023/02/03,4956,2023/02/05,banana,2023,2,5


## References

https://hudi.apache.org/docs/overview

https://hudi.apache.org/docs/quick-start-guide