# init spark session

In [2]:
# Notebook display tweak: stop <pre> output from wrapping (so wide
# `DataFrame.show()` tables stay aligned) and let the notebook container use
# the full browser width.
# `IPython.display` is the public API; `display` is imported explicitly so the
# cell does not depend on the name being injected into the notebook namespace.
from IPython.display import HTML, display

display(HTML("""<style>pre { white-space: pre !important; }.container { width:100% !important; }</style>"""))

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by every cell below: standalone
# cluster master, Hive metastore for table metadata, and the Iceberg runtime
# jars/extensions loaded from HDFS.
HDFS_ROOT = "hdfs://namenode:9000"

# Kept as a list and joined with "," so the resulting `spark.jars` value holds
# no newlines or indentation; the previous triple-quoted literal only worked
# where Spark trims each comma-separated entry.
ICEBERG_JARS = [
    f"{HDFS_ROOT}/user/hive/spark_jars/iceberg-hive-runtime-1.4.3.jar",
    f"{HDFS_ROOT}/user/hive/spark_jars/iceberg-spark-runtime-3.4_2.12-1.4.3.jar",
]

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("staging-to-rawvault-streaming")
    .config("spark.driver.memory", "1g")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", f"{HDFS_ROOT}/user/hive/warehouse/default")
    .config("spark.jars", ",".join(ICEBERG_JARS))
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    # Executor sizing: 2 executors x 3 cores x 6g.
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "6g")
    .config("spark.executor.instances", 2)
    .enableHiveSupport()
    .getOrCreate()
)

25/01/16 03:06:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# create staging tables

In [5]:
# Preview the raw Parquet files of one staging topic before registering it as
# a table. Parquet files carry their own schema, so the CSV-only reader
# options `header` / `inferSchema` are not passed.
df = spark.read.parquet("/user/hive/warehouse/staging/topics/product1.public.customer")

# truncate=False prints full column values instead of cutting them at 20 chars.
df.show(10, truncate=False)

[Stage 2:>                                                          (0 + 1) / 1]

+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+-------------------+----+--------+-----------------------+----------+
|customer_id|first_name|last_name|birth_date|address          |phone_number|email             |job_title        |updated_datetime   |__op|__lsn   |__src_ts_ms            |__ds      |
+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+-------------------+----+--------+-----------------------+----------+
|6          |Cuong     |Vo       |2001-07-07|12 ABC, MNO, XYZ |012356      |cuongvo2@gmail.com|Data Engineer    |2024-08-17 16:23:58|r   |27031720|2025-01-16 03:05:40.137|2025-01-16|
|1          |Vu        |Tran     |2001-11-17|123 ABC, MNO, XYZ|0865937123  |nitsvutt@gmail.com|Data Engineer    |2024-11-17 17:37:11|r   |27031720|2025-01-16 03:05:40.137|2025-01-16|
|5          |Cuong     |Vo       |2025-01-15|12 ABC, MNO, XYZ |01235       |cuongvo1@

                                                                                

In [19]:
import re
from subprocess import Popen, PIPE

# Register every "product1.public.*" topic directory under `topics_path` as an
# external, `__ds`-partitioned Parquet table in the `staging` database, then
# run `repair table` so existing partition directories are picked up.
# NOTE(review): assumes the `staging` database already exists — confirm.

topics_path = "/user/hive/warehouse/staging/topics"
# Dots are escaped so they match only the literal "." separators in the
# directory name rather than any character.
pattern = re.compile(r"product1\.public\..*")

# List the topic directories. The command is passed as an argument list (no
# shell), both pipes are drained via communicate(), and a failing `hdfs`
# command raises instead of silently producing an empty table list.
with Popen(["hdfs", "dfs", "-ls", topics_path], stdout=PIPE, stderr=PIPE) as p:
    ls_stdout, ls_stderr = p.communicate()
if p.returncode != 0:
    raise RuntimeError(
        f"hdfs dfs -ls {topics_path} failed with exit code {p.returncode}: "
        f"{ls_stderr.decode('utf-8', errors='replace').strip()}"
    )

# An entry line of `hdfs dfs -ls` has 8 whitespace-separated fields, the last
# one being the full path. The "Found N items" header and blank lines have
# fewer fields and are skipped.
all_paths = [
    fields[-1].split("/")[-1]
    for fields in (line.split() for line in ls_stdout.decode("utf-8").splitlines())
    if len(fields) >= 8
]
all_tables = [path for path in all_paths if pattern.match(path)]

for table in all_tables:
    hdfs_path = f"hdfs://namenode:9000{topics_path}/{table}"
    # Table identifier: topic directory name with "." replaced by "_".
    table_name = f"staging.{table.replace('.', '_')}"
    print(hdfs_path)

    # Read the topic once only to obtain its column names and types; Parquet
    # is self-describing, so no CSV-style reader options are needed.
    df = spark.read.parquet(hdfs_path)
    # Column names are backtick-quoted so names that collide with SQL keywords
    # still produce valid DDL.
    columns_ddl = ", ".join(f"`{col}` {dtype}" for col, dtype in df.dtypes)

    spark.sql(f"""
        create external table if not exists {table_name}
        ({columns_ddl})
        partitioned by (__ds)
        stored as parquet
        location "{hdfs_path}"
    """)
    spark.sql(f"repair table {table_name}")

hdfs://namenode:9000/user/hive/warehouse/staging/topics/product1.public.customer


25/01/15 09:29:23 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


# check staging

In [20]:
# Inputs for the staging check below: the HDFS root of the staging area and
# the name of the staging table to inspect.
staging_path = "hdfs://namenode:9000/user/hive/warehouse/staging"
table = "product1_public_customer"

In [49]:
# Stream the Parquet files of the staging topic to the console as a quick
# visual check. The schema is taken from the registered staging table because
# file-based streaming sources need an explicit schema.
# NOTE(review): the topic directory is derived with table.replace('_', '.'),
# which would also rewrite underscores that belong to the original topic name
# — confirm no such topics exist.
stream_reader = (
    spark.readStream
    .schema(spark.table(f"staging.{table}").schema)
    .parquet(f"{staging_path}/topics/{table.replace('_', '.')}")
)
stream_writer = (
    stream_reader.writeStream
    .outputMode("append").format("console")
    .option("checkpointLocation", f"{staging_path}/checkpoints/{table}")
    .start()
)

# The query runs until the kernel is interrupted. Whatever ends the wait, stop
# the query so it does not keep running on the cluster after the cell returns.
try:
    stream_writer.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end this check.
    pass
finally:
    stream_writer.stop()

25/01/15 09:06:02 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+----------------+----+--------+-------------+----------+
|customer_id|first_name|last_name|birth_date|          address|phone_number|             email|        job_title|updated_datetime|__op|   __lsn|  __src_ts_ms|      __ds|
+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+----------------+----+--------+-------------+----------+
|          6|     Cuong|       Vo|     11510| 12 ABC, MNO, XYZ|      012356|cuongvo2@gmail.com|    Data Engineer|1723911838000000|   r|27023448|1736926016362|2025-01-15|
|          1|        Vu|     Tran|     11643|123 ABC, MNO, XYZ|  0865937123|nitsvutt@gmail.com|    Data Engineer|1731865031000000|   r|27023448|1736926016362|2025-01-15|
|          5|     Cuong|       Vo|     11510| 12 ABC,

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 