# init spark session

In [1]:
# Inject CSS so <pre> output keeps its whitespace (no wrapping) and the
# notebook container spans the full browser width.
# HTML and display are imported from the public IPython.display module;
# IPython.core.display is a deprecated location for them.
from IPython.display import HTML, display

display(HTML("""<style>pre { white-space: pre !important; }.container { width:100% !important; }</style>"""))

In [2]:
from pyspark.sql import SparkSession

# Iceberg runtime jars added to the session through spark.jars.
# Kept as a list and joined with "," so the property value is a clean
# comma-separated list without embedded newlines or indentation.
ICEBERG_JARS = [
    "hdfs://namenode:9000/user/hive/spark_jars/iceberg-hive-runtime-1.4.3.jar",
    "hdfs://namenode:9000/user/hive/spark_jars/iceberg-spark-runtime-3.4_2.12-1.4.3.jar",
]

# Session against the standalone master, with Hive support enabled and the
# Iceberg SQL extensions / session catalog configured.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("staging-to-rawvault-streaming")
    .config("spark.driver.memory", "1g")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse/default")
    .config("spark.jars", ",".join(ICEBERG_JARS))
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "6g")
    .config("spark.executor.instances", 1)
    .enableHiveSupport()
    .getOrCreate()
)

25/06/19 02:19:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/19 02:19:33 WARN TransportClientFactory: DNS resolution succeed for spark-master/172.18.0.16:7077 took 5001 ms
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 48216)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/lib/python3.10/socketserver.py",

# create staging tables

In [7]:
# Preview the parquet files staged under the product1.public.customer topic
# directory. Parquet files carry their own schema, so the CSV-only reader
# options (header / inferSchema) are not passed.
df = (
    spark.read.format('parquet')
    .load("/user/hive/warehouse/staging/topics/product1.public.customer")
)

# Second argument False disables value truncation in the printed table.
df.show(10, False)

+-----------+----------+----------+----------+-----------------+------------+------------------+-----------------+-------------------+----+--------+-----------------------+----------+
|customer_id|first_name|last_name |birth_date|address          |phone_number|email             |job_title        |updated_datetime   |__op|__lsn   |__src_ts_ms            |__ds      |
+-----------+----------+----------+----------+-----------------+------------+------------------+-----------------+-------------------+----+--------+-----------------------+----------+
|1          |Vu        |Tran Trieu|2001-11-17|123 ABC, MNO, XYZ|0865937123  |nitsvutt@gmail.com|Data Engineer    |2025-06-18 23:14:26|u   |27033800|2025-06-18 16:14:26.994|2025-06-18|
|1          |Vu        |Tran Trie |2001-11-17|123 ABC, MNO, XYZ|0865937123  |nitsvutt@gmail.com|Data Engineer    |2025-06-18 23:24:38|u   |27041384|2025-06-18 16:24:38.498|2025-06-18|
|1          |Vu        |Tran Trieu|2001-11-17|123 ABC, MNO, XYZ|0865937123  |nit

In [4]:
import re
from subprocess import Popen, PIPE

topics_path = "/user/hive/warehouse/staging/topics"
# Dots are escaped so only literal "product1.public.<name>" entries match
# (an unescaped "." would match any character).
pattern = re.compile(r"product1\.public\..*")


def list_hdfs_child_names(hdfs_dir):
    """Return the base names of the entries listed by ``hdfs dfs -ls hdfs_dir``.

    The command is run without a shell. Both output pipes are drained and the
    process is waited on, so it cannot block on a full stderr pipe.

    Raises:
        RuntimeError: if the ``hdfs`` command exits with a non-zero status.
    """
    with Popen(["hdfs", "dfs", "-ls", hdfs_dir], stdout=PIPE, stderr=PIPE) as proc:
        stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        message = stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(f"hdfs dfs -ls {hdfs_dir} failed: {message}")

    names = []
    for line in stdout.decode("utf-8").splitlines():
        fields = line.split()
        # Skip blank lines and the "Found N items" summary line.
        if not fields or line.startswith("Found "):
            continue
        # The last field is the full path; keep only its final component.
        names.append(fields[-1].split("/")[-1])
    return names


all_paths = list_hdfs_child_names(topics_path)
all_tables = [path for path in all_paths if pattern.match(path)]

# Register one external parquet table in `staging` per matching topic directory.
for table in all_tables:
    hdfs_path = f"hdfs://namenode:9000{topics_path}/{table}"
    table_name = table.replace(".", "_")

    # Already registered: skip before paying for the parquet schema read.
    if not spark.sql(f"show tables from staging like '{table_name}'").isEmpty():
        continue

    print(table_name)
    # The column list for the DDL is taken from the parquet files themselves.
    df = (
        spark.read.format('parquet')
        .load(hdfs_path)
    )
    columns_ddl = ', '.join([col + ' ' + dtype for col, dtype in df.dtypes])
    spark.sql(f"""
        create external table if not exists staging.{table_name}
        ({columns_ddl})
        partitioned by (__ds)
        stored as parquet
        location "{hdfs_path}"
    """)
    # Run right after creation so the new table's partition metadata is repaired.
    spark.sql(f'repair table staging.{table_name}')

In [5]:
# Show the first 10 rows of the staging customer table without truncating values.
staging_customer_rows = spark.sql("select * from staging.product1_public_customer")
staging_customer_rows.show(n=10, truncate=False)

+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+-------------------+----+--------+-----------------------+----------+
|customer_id|first_name|last_name|birth_date|address          |phone_number|email             |job_title        |updated_datetime   |__op|__lsn   |__src_ts_ms            |__ds      |
+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+-------------------+----+--------+-----------------------+----------+
|6          |Cuong     |Vo       |2001-07-07|12 ABC, MNO, XYZ |012356      |cuongvo2@gmail.com|Data Engineer    |2024-08-17 16:23:58|r   |27032456|2025-06-18 14:37:21.557|2025-06-18|
|1          |Vu        |Tran     |2001-11-17|123 ABC, MNO, XYZ|0865937123  |nitsvutt@gmail.com|Data Engineer    |2024-11-17 17:37:11|r   |27032456|2025-06-18 14:37:21.557|2025-06-18|
|5          |Cuong     |Vo       |2025-01-15|12 ABC, MNO, XYZ |01235       |cuongvo1@

# create rawvault tables

In [3]:
def list_table_names(database, name_pattern):
    """Return the ``tableName`` values of ``show tables from <database> like '<name_pattern>'``."""
    rows = (
        spark.sql(f"show tables from {database} like '{name_pattern}'")
        .select("tableName")
        .collect()
    )
    return [row["tableName"] for row in rows]


staging_table_list = list_table_names("staging", "product1_public_*")
rawvault_table_list = list_table_names("rawvault", "product1_public_*")

# Staging tables with no same-named rawvault table yet.
# Sorted so the result order is stable across runs (set difference has no defined order).
new_tables = sorted(set(staging_table_list) - set(rawvault_table_list))
new_tables

                                                                                

[]

In [7]:
# Root directory under which each rawvault table gets its own sub-directory.
rawvault_root = "hdfs://namenode:9000/user/hive/warehouse/rawvault"

# For every new staging table create three Iceberg tables in `rawvault`:
# <table>_der, <table>_snp and <table>, all with the staging table's columns.
for table in new_tables:
    tables = {"derived": f"{table}_der", "snapshot": f"{table}_snp", "main": table}

    # The column list is identical for all three variants, so read it once.
    df = spark.sql(f"select * from staging.{tables['main']}")
    columns_ddl = ', '.join([col + ' ' + dtype for col, dtype in df.dtypes])

    for table_type, table_name in tables.items():
        print(table_name)
        # Only the main table is partitioned (by day of updated_datetime).
        partition_clause = 'partitioned by (days(updated_datetime))' if table_type == 'main' else ''
        # Each table gets a dedicated location: with a shared directory the
        # tables' metadata/data files would be intermixed, and
        # 'external.table.purge'='true' could remove files of sibling tables.
        spark.sql(f"""
        create external table if not exists rawvault.{table_name}
        ({columns_ddl})
        using iceberg
        {partition_clause}
        location '{rawvault_root}/{table_name}'
        tblproperties(
            'objcapabilities'='extread,extwrite',
            'engine.hive.enabled'='true',
            'write.delete.mode'='copy-on-write',
            'write.update.mode'='copy-on-write',
            'write.merge.mode'='copy-on-write',
            'external.table.purge'='true',
            'iceberg.file_format'='parquet',
            'format-version'='2',
            'read.parquet.vectorization.batch-size'='10000',
            'read.parquet.vectorization.enabled'='false'
        )
        """)

# read and write stream

In [4]:
staging_path = "hdfs://namenode:9000/user/hive/warehouse/staging"

# Start one streaming query per staging table: read new parquet files from the
# table's topic directory and append them to rawvault.<table>_der.
for staging_table in staging_table_list:
    # Staging table names are topic directory names with "." replaced by "_".
    # Only the first two underscores (in the "product1_public_" prefix) are
    # turned back into dots, so underscores inside the table's own name survive.
    # NOTE(review): assumes the part after "product1.public." contains no dots.
    topic_dir = staging_table.replace('_', '.', 2)

    stream_reader = (
        spark.readStream
        # Streaming file sources need an explicit schema; reuse the staging table's.
        .schema(spark.sql(f"select * from staging.{staging_table}").schema)
        .parquet(f"{staging_path}/topics/{topic_dir}")
    )
    stream_query = (
        stream_reader.writeStream
        .outputMode("append").format("iceberg")
        .option("checkpointLocation", f"{staging_path}/checkpoints/{staging_table}")
        .trigger(processingTime="10 seconds")
        .toTable(f"rawvault.{staging_table}_der")
    )

# Block the cell until any one of the started queries terminates.
spark.streams.awaitAnyTermination()

25/06/18 16:39:25 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/06/18 16:39:25 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [46]:
# Preview the parquet files staged under the product1.public.service topic
# directory. Parquet files carry their own schema, so the CSV-only reader
# options (header / inferSchema) are not passed.
df = (
    spark.read.format('parquet')
    .load("/user/hive/warehouse/staging/topics/product1.public.service")
)

# Second argument False disables value truncation in the printed table.
df.show(20, False)

+----------+-------------------+---------+--------------+-----------+-------------+-------------------+----+--------+-----------------------+----------+
|service_id|name               |price    |image         |active_date|inactive_date|updated_datetime   |__op|__lsn   |__src_ts_ms            |__ds      |
+----------+-------------------+---------+--------------+-----------+-------------+-------------------+----+--------+-----------------------+----------+
|12        |Electrocity check 2|123.00000|abc/xyz/ec.png|2025-06-18 |2027-06-18   |2025-06-19 14:47:50|c   |27090704|2025-06-19 07:47:50.115|2025-06-19|
|12        |Electrocity check 2|300.00000|abc/xyz/ec.png|2025-06-18 |2027-06-18   |2025-06-19 14:47:58|u   |27094400|2025-06-19 07:47:58.449|2025-06-19|
|1         |House Keeping      |123.45600|string        |2024-08-17 |9999-01-01   |2024-08-17 14:43:01|r   |27090504|2025-06-19 07:46:48.749|2025-06-19|
|2         |Office for lease   |301.00000|string        |2024-08-17 |9999-01-01   

25/06/19 07:53:52 ERROR TaskSchedulerImpl: Lost executor 2 on 172.18.0.17: worker lost
25/06/19 07:53:52 ERROR TaskSchedulerImpl: Lost executor 3 on 172.18.0.17: worker lost
25/06/19 07:54:02 WARN StandaloneAppClient$ClientEndpoint: Connection to spark-master:7077 failed; waiting for master to reconnect...
25/06/19 07:54:02 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...
25/06/19 07:54:02 WARN StandaloneAppClient$ClientEndpoint: Connection to spark-master:7077 failed; waiting for master to reconnect...


In [19]:
# Show the first 10 rows of the rawvault service table without truncating values.
rawvault_service_rows = spark.sql("select * from rawvault.product1_public_service")
rawvault_service_rows.show(n=10, truncate=False)

+----------+-----------------+----------+---------------+-----------+-------------+-------------------+----+--------+-----------------------+----------+
|service_id|name             |price     |image          |active_date|inactive_date|updated_datetime   |__op|__lsn   |__src_ts_ms            |__ds      |
+----------+-----------------+----------+---------------+-----------+-------------+-------------------+----+--------+-----------------------+----------+
|4         |Electrocity check|123.00000 |abc/xyz/ec.png |2025-06-18 |2027-06-18   |2025-06-18 23:41:08|c   |27051912|2025-06-18 16:41:08.902|2025-06-18|
|4         |Electrocity check|124.00000 |abc/xyz/ec.png |2025-06-18 |2027-06-18   |2025-06-18 23:41:53|u   |27052688|2025-06-18 16:41:53.729|2025-06-18|
|3         |Cleaning         |124.00000 |abc/xyz/mno.png|2025-06-18 |2026-06-18   |2025-06-18 23:44:33|u   |27053280|2025-06-18 16:44:33.207|2025-06-18|
|1         |House Keeping    |123.45600 |string         |2024-08-17 |9999-01-01   