# Initialize the Spark session (Iceberg + Hive metastore)

In [1]:
from pyspark.sql import SparkSession

# Artifact locations collected in one place so they can be changed without
# touching the builder chain below.
HDFS_ROOT = "hdfs://namenode:9000/user/hive"
ICEBERG_VERSION = "1.4.3"

# `spark.jars` is a comma-separated list.  Assemble it with ",".join(...) so
# the value contains no newlines or indentation (a triple-quoted literal would
# embed both in every entry and rely on each consumer trimming them).
ICEBERG_JARS = ",".join([
    f"{HDFS_ROOT}/spark_jars/iceberg-hive-runtime-{ICEBERG_VERSION}.jar",
    f"{HDFS_ROOT}/spark_jars/iceberg-spark-runtime-3.4_2.12-{ICEBERG_VERSION}.jar",
])

# Build (or reuse) the session used by every cell below.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("test_iceberg")
    # Driver / executor sizing.
    .config("spark.driver.memory", "1g")
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "6g")
    .config("spark.executor.instances", 2)
    # Hive metastore endpoint and default warehouse directory.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", f"{HDFS_ROOT}/warehouse/default")
    # Iceberg: runtime jars, SQL extensions, and the session catalog
    # implementation bound to the built-in `spark_catalog` name.
    .config("spark.jars", ICEBERG_JARS)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .enableHiveSupport()
    .getOrCreate()
)

24/07/24 14:53:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Bare last expression: lets the notebook display the session object created above.
spark

# Test a Hive table (ORC)

In [8]:
# Create the `staging` database at an explicit HDFS location.
# `if not exists` keeps the cell re-runnable: without it a second run
# (e.g. Restart & Run All) fails because the database already exists.
# A plain string is used instead of an f-string: nothing is interpolated, and
# an f-string would try to interpret any `{}` that later appears in the SQL.
spark.sql("""
create database if not exists staging
location 'hdfs://namenode:9000/user/hive/warehouse/staging'
""").show()

++
||
++
++



In [9]:
# Switch the session's current database to `staging`.
use_staging_sql = """
use staging
"""
spark.sql(use_staging_sql).show()

++
||
++
++



In [14]:
# Create a two-column table stored as ORC in the `staging` database.
# `if not exists` keeps the cell re-runnable: without it a second run fails
# because the table already exists.
spark.sql("""
create table if not exists staging.test_hive (key int, value string)
stored as orc
""").show()

++
||
++
++



In [15]:
# Load the single smoke-test row.
# `insert overwrite` replaces the table contents, so re-running the cell
# always leaves exactly one row; `insert into` would append a duplicate on
# every run and the select below would no longer match its recorded output.
# The string literal uses single quotes, the standard SQL form.
spark.sql("""
insert overwrite table staging.test_hive
values (1, '2')
""").show()

++
||
++
++



In [16]:
# Read the table back to confirm the inserted row is visible.
select_hive_sql = """
select * from staging.test_hive
"""
hive_rows = spark.sql(select_hive_sql)
hive_rows.show()

[Stage 3:>                                                          (0 + 1) / 1]

+---+-----+
|key|value|
+---+-----+
|  1|    2|
+---+-----+



                                                                                

# Test an Iceberg table

In [21]:
# Create the `rawvault` database at an explicit HDFS location.
# `if not exists` keeps the cell re-runnable: without it a second run
# (e.g. Restart & Run All) fails because the database already exists.
# A plain string is used instead of an f-string because nothing is interpolated.
spark.sql("""
create database if not exists rawvault
location 'hdfs://namenode:9000/user/hive/warehouse/rawvault'
""").show()

++
||
++
++



In [22]:
# Switch the session's current database to `rawvault`.
use_rawvault_sql = """
use rawvault
"""
spark.sql(use_rawvault_sql).show()

++
||
++
++



In [23]:
# Create the Iceberg smoke-test table at an explicit HDFS location.
# `if not exists` keeps the cell re-runnable: without it a second run fails
# because the table already exists.
#
# Table properties set below:
#   - format-version 2, with copy-on-write for delete / update / merge
#   - Parquet vectorized reads disabled
#   - engine.hive.enabled = true
# NOTE(review): 'objcapabilities', 'external.table.purge' and
# 'iceberg.file_format' do not look like core Iceberg table properties --
# presumably they are read by other engines (Hive / Impala); confirm they are
# still required.
# NOTE(review): 'read.parquet.vectorization.batch-size' presumably has no
# effect while 'read.parquet.vectorization.enabled' is 'false' -- confirm.
spark.sql("""
create external table if not exists rawvault.test_iceberg (key int, value string)
using iceberg
location 'hdfs://namenode:9000/user/hive/warehouse/rawvault/test_iceberg'
tblproperties(
    'objcapabilities'='extread,extwrite',
    'engine.hive.enabled'='true',
    'write.delete.mode'='copy-on-write',
    'write.update.mode'='copy-on-write',
    'write.merge.mode'='copy-on-write',
    'external.table.purge'='true',
    'iceberg.file_format'='parquet',
    'format-version'='2',
    'read.parquet.vectorization.batch-size'='10000',
    'read.parquet.vectorization.enabled'='false'
)
""").show()

++
||
++
++



In [24]:
# Load the single smoke-test row.
# `insert overwrite` replaces the table contents, so re-running the cell
# always leaves exactly one row; `insert into` would append a duplicate on
# every run and the select below would no longer match its recorded output.
# The string literal uses single quotes, the standard SQL form.
spark.sql("""
insert overwrite table rawvault.test_iceberg
values (1, '2')
""").show()

                                                                                

++
||
++
++



In [25]:
# Read the Iceberg table back to confirm the inserted row is visible.
select_iceberg_sql = """
select * from rawvault.test_iceberg
"""
iceberg_rows = spark.sql(select_iceberg_sql)
iceberg_rows.show()

+---+-----+
|key|value|
+---+-----+
|  1|    2|
+---+-----+

