## Hudi Workshop

### 准备 Hudi 环境

In [2]:
%%sh
# deploy hudi bundle jar
hdfs dfs -copyFromLocal -f /usr/lib/hudi/hudi-spark-bundle.jar /tmp/hudi-spark-bundle.jar
hdfs dfs -ls /tmp/hudi-spark-bundle.jar

-rw-r--r--   1 emr-notebook hdfsadmingroup   61369223 2024-01-24 10:48 /tmp/hudi-spark-bundle.jar


#### 添加环境配置，如果是通过Spark代码操作Hudi表，也需要在代码中添加以下配置

In [3]:
%%configure -f
{
    "conf" : {
        "spark.jars":"hdfs:///tmp/hudi-spark-bundle.jar",            
        "spark.serializer":"org.apache.spark.serializer.KryoSerializer",
        "spark.sql.extensions":"org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
        "spark.sql.catalog.spark_catalog":"org.apache.spark.sql.hudi.catalog.HoodieCatalog"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1706091808767_0002,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1706091808767_0002,pyspark,idle,Link,Link,,✔


### SQL 操作

完成环境设置后，就可以开始操作hudi表。<br>

#### Create Database

In [1]:
%%sql
create database hudi_db;

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1706091808767_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
[SCHEMA_ALREADY_EXISTS] Cannot create schema `hudi_db` because it already exists.
Choose a different name, drop the existing schema, or add the IF NOT EXISTS clause to tolerate pre-existing schema.
Traceback (most recent call last):
  File "/mnt/yarn/usercache/livy/appcache/application_1706091808767_0001/container_1706091808767_0001_01_000001/pyspark.zip/pyspark/sql/session.py", line 1440, in sql
    return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
  File "/mnt/yarn/usercache/livy/appcache/application_1706091808767_0001/container_1706091808767_0001_01_000001/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1323, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/mnt/yarn/usercache/livy/appcache/application_1706091808767_0001/container_1706091808767_0001_01_000001/pyspark.zip/pyspark/errors/exceptions/captured.py", line 175, in deco
    raise converted from None
pyspark.errors.exceptions.captured.AnalysisExceptio

#### Create Table
1. 创建 COW 表，指定主键字段为 id


In [28]:
%%sql
CREATE TABLE hudi_db.sample_cow (
id int,
a string,
b int
)
USING hudi
OPTIONS (
'primaryKey' = 'id',
'preCombineField' = 'id',
'type' = 'cow'
)
LOCATION 's3://myemr-bucket-01/data/hudi/sample_cow'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

#### 插入数据

In [29]:
%%sql
insert into hudi_db.sample_cow values 
(1,'test01',10),
(2,'test02',20),
(3,'test03',30);

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

#### 查询数据

In [30]:
%%sql
select * from hudi_db.sample_cow;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

#### 修改一条记录

由于Hudi是有主键约束的，同样通过 Insert 语法插入一条已经存在的记录时，它会自动的更新列。

In [31]:
%%sql
update hudi_db.sample_cow set a = 'test01_update' where id = 1;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

#### 验证记录修改的情况

In [32]:
%%sql
select * from hudi_db.sample_cow where id = 1;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

#### 查看文件结构

In [33]:
%%sh
aws s3 ls s3://myemr-bucket-01/data/hudi/sample_cow/

                           PRE .hoodie/
2024-01-24 12:52:08          0 .hoodie_$folder$
2024-01-24 12:52:26         96 .hoodie_partition_metadata
2024-01-24 12:52:28     434736 c3ae63d4-2ef3-4ec8-8355-a52cba4c4cd7-0_0-139-174_20240124125215354.parquet
2024-01-24 12:53:27     434739 c3ae63d4-2ef3-4ec8-8355-a52cba4c4cd7-0_0-178-208_20240124125321046.parquet


### MOR 表

In [35]:
%%sql
CREATE TABLE hudi_db.sample_mor (
id int,
a string,
b int
)
USING hudi
OPTIONS (
'primaryKey' = 'id',
'preCombineField' = 'id',
'type' = 'mor'
)
LOCATION 's3://myemr-bucket-01/data/hudi/sample_mor'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

#### 插入数据

In [36]:
%%sql
insert into hudi_db.sample_mor values 
(1,'test01',10),
(2,'test02',20),
(3,'test03',30);

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

#### 修改一条记录

In [38]:
%%sql
update hudi_db.sample_mor set a = 'test01_update' where id = 1;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [37]:
%%sql
select * from hudi_db.sample_mor where id = 1;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

#### 查看文件结构

In [24]:
%%sh
aws s3 ls s3://myemr-bucket-01/data/hudi/sample_mor/

                           PRE .hoodie/
2024-01-24 11:46:32          0 .hoodie_$folder$
2024-01-24 11:47:17         96 .hoodie_partition_metadata
2024-01-24 11:47:18     434728 e5abd50b-ae4b-4df6-8adf-c189d625d0f3-0_0-73-91_20240124114700558.parquet
2024-01-24 11:55:30     434740 e5abd50b-ae4b-4df6-8adf-c189d625d0f3-0_0-95-113_20240124115516414.parquet
