In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
def main():
    """Write a small sample DataFrame to an Iceberg table and inspect its metadata.

    Steps, in order:
      1. Build (or reuse) a SparkSession against ``spark://spark-master:7077``.
      2. Create a 4-row sample DataFrame (id, name, age) and show it.
      3. Ensure the ``hive_catalog.default`` namespace and the
         ``hive_catalog.default.users`` Iceberg table exist.
      4. Append the sample rows to the table.
      5. Show the table, its ``history`` metadata table (newest first) and
         its ``snapshots`` metadata table.

    Every section header is printed *before* the table it describes so the
    labels in the cell output line up with their data.

    The session is always stopped on exit, including when a step raises.
    Every call appends the same sample rows again, so re-running this
    function adds duplicate rows to the table.
    """
    table_name = "hive_catalog.default.users"

    # Start the SparkSession (also loads spark-defaults.conf and hive-site.xml).
    # NOTE(review): only `spark_catalog` is configured here, while the code
    # below addresses `hive_catalog`; presumably that catalog is declared in
    # spark-defaults.conf -- confirm.
    # NOTE(review): the recorded run logged "Local jar ... does not exist,
    # skipping" for the spark.jars path and reported Spark 3.4.1, whereas the
    # jar name targets Spark 3.5 -- verify the path and the runtime version.
    spark = (
        SparkSession.builder
        .appName("IcebergHiveCatalogExample")
        .master("spark://spark-master:7077")
        .enableHiveSupport()
        .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
        .config("spark.sql.catalog.spark_catalog.type", "hive")
        .config("spark.sql.catalog.spark_catalog.uri", "thrift://hive-metastore:9083")
        .config("spark.jars", "/home/jovyan/jars/iceberg-spark-runtime-3.5_2.12-1.9.2.jar")
        .getOrCreate()
    )

    try:
        # Build the sample DataFrame.
        schema = StructType([
            StructField("id",   IntegerType(), False),
            StructField("name", StringType(),  False),
            StructField("age",  IntegerType(), True),
        ])
        data = [
            (1, "Nguyễn Văn A", 30),
            (2, "Trần Thị B",   25),
            (3, "Lê Văn C",     28),
            (4, "Nguyễn Lương Hoàng Tùng", 21),
        ]
        df = spark.createDataFrame(data, schema)

        print("=== Dữ liệu mẫu ===")
        df.show(truncate=False)

        # Create the namespace (database) if it does not exist yet.
        spark.sql("CREATE NAMESPACE IF NOT EXISTS hive_catalog.default")

        # Create the Iceberg table if it does not exist yet.
        spark.sql(f"""
          CREATE TABLE IF NOT EXISTS {table_name} (
            id   INT,
            name STRING,
            age  INT
          ) USING iceberg
        """)

        # Append the sample rows to the table.
        df.writeTo(table_name).append()
        print(">>> Đã ghi dữ liệu vào hive_catalog.default.users")

        # Read the table back and display it.
        print("=== Dữ liệu trong bảng Iceberg ===")
        spark.table(table_name).show(truncate=False)

        # Commit history, ordered by made_current_at descending.
        print("=== Lịch sử commit của bảng (mới nhất trước) ===")
        hist_df = spark.table(f"{table_name}.history").orderBy(col("made_current_at").desc())
        hist_df.show(10, truncate=False)

        # List of snapshots.
        print("=== Danh sách snapshot hiện tại ===")
        snap_df = spark.table(f"{table_name}.snapshots")
        snap_df.show(truncate=True)
    finally:
        # Always release the session, even when one of the steps above fails.
        spark.stop()

# Entry-point guard: call main() only when this code's module name is "__main__".
if __name__ == "__main__":
   main()

25/08/06 07:45:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/06 07:45:18 WARN DependencyUtils: Local jar /home/jovyan/jars/iceberg-spark-runtime-3.5_2.12-1.9.2.jar does not exist, skipping.
25/08/06 07:45:18 INFO SparkContext: Running Spark version 3.4.1
25/08/06 07:45:18 INFO ResourceUtils: No custom resources configured for spark.driver.
25/08/06 07:45:18 INFO SparkContext: Submitted application: IcebergHiveCatalogExample
25/08/06 07:45:18 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
25/08/06 07:45:18 INFO ResourceProfile: Limiting resource is cpu
25/08/06 07:45:18 INFO ResourceProfileManager: Added ResourceProfile id: 0
25/08/06 07:45:18 INFO SecurityManager: Changing view acls to: root,spark
25/0

=== Dữ liệu mẫu ===


25/08/06 07:45:23 INFO CodeGenerator: Code generated in 234.179755 ms
25/08/06 07:45:23 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/08/06 07:45:23 INFO DAGScheduler: Got job 0 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/08/06 07:45:23 INFO DAGScheduler: Final stage: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0)
25/08/06 07:45:23 INFO DAGScheduler: Parents of final stage: List()
25/08/06 07:45:23 INFO DAGScheduler: Missing parents: List()
25/08/06 07:45:23 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[6] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/08/06 07:45:23 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 12.5 KiB, free 434.4 MiB)
25/08/06 07:45:23 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 6.7 KiB, free 434.4 MiB)
25/08/06 07:45:23 INFO BlockManagerInfo: Added broadcast_0

+---+-----------------------+---+
|id |name                   |age|
+---+-----------------------+---+
|1  |Nguyễn Văn A           |30 |
|2  |Trần Thị B             |25 |
|3  |Lê Văn C               |28 |
|4  |Nguyễn Lương Hoàng Tùng|21 |
+---+-----------------------+---+



25/08/06 07:45:25 INFO HiveConf: Found configuration file null
25/08/06 07:45:25 INFO metastore: Trying to connect to metastore with URI thrift://hive-metastore:9083
25/08/06 07:45:25 INFO metastore: Opened a connection to metastore, current connections: 1
25/08/06 07:45:25 INFO metastore: Connected to metastore.
25/08/06 07:45:26 INFO BaseMetastoreCatalog: Table properties set at catalog level through catalog properties: {}
25/08/06 07:45:26 INFO BaseMetastoreCatalog: Table properties enforced at catalog level through catalog properties: {}
25/08/06 07:45:26 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/08/06 07:45:26 INFO MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
25/08/06 07:45:26 INFO MetricsSystemImpl: s3a-file-system metrics system started
25/08/06 07:45:26 INFO BlockManagerInfo: Removed broadcast_0_piece0 on 701b21a81e6e:46863 in memory (size: 6.7 KiB, free: 434.4 MiB)
25/

>>> Đã ghi dữ liệu vào hive_catalog.default.users


25/08/06 07:45:33 INFO V2ScanRelationPushDown: 
Output: id#25, name#26, age#27
         
25/08/06 07:45:33 INFO SnapshotScan: Scanning table hive_catalog.default.users snapshot 1847557235031030640 created at 2025-08-06T07:45:33.258+00:00 with filter true
25/08/06 07:45:33 INFO BaseDistributedDataScan: Planning file tasks locally for table hive_catalog.default.users
25/08/06 07:45:33 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 701b21a81e6e:46863 in memory (size: 7.6 KiB, free: 434.4 MiB)
25/08/06 07:45:33 INFO BlockManagerInfo: Removed broadcast_3_piece0 on 172.18.0.8:46295 in memory (size: 7.6 KiB, free: 434.4 MiB)
25/08/06 07:45:33 INFO BlockManagerInfo: Removed broadcast_2_piece0 on 701b21a81e6e:46863 in memory (size: 29.4 KiB, free: 434.4 MiB)
25/08/06 07:45:33 INFO BlockManagerInfo: Removed broadcast_2_piece0 on 172.18.0.8:46295 in memory (size: 29.4 KiB, free: 434.4 MiB)
25/08/06 07:45:33 INFO SparkPartitioningAwareScan: Reporting UnknownPartitioning with 1 partition(s) f

+---+-----------------------+---+
|id |name                   |age|
+---+-----------------------+---+
|1  |Nguyễn Văn A           |30 |
|2  |Trần Thị B             |25 |
|3  |Lê Văn C               |28 |
|4  |Nguyễn Lương Hoàng Tùng|21 |
+---+-----------------------+---+

=== Dữ liệu trong bảng Iceberg ===


25/08/06 07:45:34 INFO BaseMetastoreCatalog: Table loaded by catalog: hive_catalog.default.users.history
25/08/06 07:45:35 INFO V2ScanRelationPushDown: 
Output: made_current_at#52, snapshot_id#53L, parent_id#54L, is_current_ancestor#55
         
25/08/06 07:45:35 INFO SparkPartitioningAwareScan: Reporting UnknownPartitioning with 1 partition(s) for table hive_catalog.default.users.history
25/08/06 07:45:35 INFO MemoryStore: Block broadcast_7 stored as values in memory (estimated size 32.0 KiB, free 434.2 MiB)
25/08/06 07:45:35 INFO MemoryStore: Block broadcast_7_piece0 stored as bytes in memory (estimated size 29.7 KiB, free 434.2 MiB)
25/08/06 07:45:35 INFO BlockManagerInfo: Removed broadcast_6_piece0 on 701b21a81e6e:46863 in memory (size: 5.9 KiB, free: 434.3 MiB)
25/08/06 07:45:35 INFO BlockManagerInfo: Added broadcast_7_piece0 in memory on 701b21a81e6e:46863 (size: 29.7 KiB, free: 434.3 MiB)
25/08/06 07:45:35 INFO SparkContext: Created broadcast 7 from broadcast at SparkBatch.java:

+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2025-08-06 07:45:33.258|1847557235031030640|null     |true               |
+-----------------------+-------------------+---------+-------------------+

=== Lịch sử commit của bảng (mới nhất trước) ===


25/08/06 07:45:35 INFO CodeGenerator: Code generated in 24.149128 ms
25/08/06 07:45:35 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/08/06 07:45:35 INFO DAGScheduler: Got job 5 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/08/06 07:45:35 INFO DAGScheduler: Final stage: ResultStage 5 (showString at NativeMethodAccessorImpl.java:0)
25/08/06 07:45:35 INFO DAGScheduler: Parents of final stage: List()
25/08/06 07:45:35 INFO DAGScheduler: Missing parents: List()
25/08/06 07:45:35 INFO DAGScheduler: Submitting ResultStage 5 (MapPartitionsRDD[20] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/08/06 07:45:35 INFO MemoryStore: Block broadcast_12 stored as values in memory (estimated size 15.3 KiB, free 434.0 MiB)
25/08/06 07:45:35 INFO MemoryStore: Block broadcast_12_piece0 stored as bytes in memory (estimated size 6.3 KiB, free 434.0 MiB)
25/08/06 07:45:35 INFO BlockManagerInfo: Added broadcast

+--------------------+-------------------+---------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+---------+---------+--------------------+--------------------+
|2025-08-06 07:45:...|1847557235031030640|     null|   append|s3a://iceberg-war...|{spark.app.id -> ...|
+--------------------+-------------------+---------+---------+--------------------+--------------------+

=== Danh sách snapshot hiện tại ===


25/08/06 07:45:35 INFO SparkUI: Stopped Spark web UI at http://701b21a81e6e:4040
25/08/06 07:45:35 INFO StandaloneSchedulerBackend: Shutting down all executors
25/08/06 07:45:35 INFO StandaloneSchedulerBackend$StandaloneDriverEndpoint: Asking each executor to shut down
25/08/06 07:45:35 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
25/08/06 07:45:35 INFO MemoryStore: MemoryStore cleared
25/08/06 07:45:35 INFO BlockManager: BlockManager stopped
25/08/06 07:45:35 INFO BlockManagerMaster: BlockManagerMaster stopped
25/08/06 07:45:35 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
25/08/06 07:45:35 INFO SparkContext: Successfully stopped SparkContext


In [10]:
# Display the master URL of the SparkContext behind `spark`.
# NOTE(review): no cell shown here defines a module-level `spark` (both main()
# functions keep it as a local) -- presumably it comes from another cell;
# confirm this still works after Restart Kernel -> Run All.
spark.sparkContext.master

'spark://spark-master:7077'

In [1]:
from pyspark.sql import SparkSession

def main():
    """Demonstrate Iceberg time travel on ``hive_catalog.default.users``.

    Steps, in order:
      1. Build (or reuse) a SparkSession against ``spark://spark-master:7077``.
      2. List the table's snapshots ordered by ``committed_at``.
      3. Read the table at the oldest snapshot, once through the DataFrame
         reader (``snapshot-id`` option) and once through SQL
         (``VERSION AS OF``).
      4. Read the table as of "five minutes ago" through the DataFrame
         reader (``as-of-timestamp`` option, epoch milliseconds).

    If the table has no snapshots, a message is printed and the function
    returns without attempting any time-travel read. The session is always
    stopped on exit, including when a step raises.
    """
    # Function-scope import keeps this cell self-contained; only the standard
    # library is needed (datetime.timezone.utc replaces pytz.UTC).
    import datetime

    catalog_table = "hive_catalog.default.users"

    spark = (
        SparkSession.builder
        .appName("IcebergTimeTravelExample")
        .master("spark://spark-master:7077")
        .enableHiveSupport()
        .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
        .config("spark.sql.catalog.spark_catalog.type", "hive")
        .config("spark.sql.catalog.spark_catalog.uri", "thrift://hive-metastore:9083")
        .getOrCreate()
    )

    try:
        # List the existing snapshots, oldest first. The ordered frame is built
        # once and reused for both the display and the lookup below.
        print("=== Danh sách snapshots ===")
        snapshots_df = spark.sql(f"SELECT snapshot_id, committed_at, summary FROM {catalog_table}.snapshots")
        ordered_snapshots = snapshots_df.orderBy("committed_at")
        ordered_snapshots.show(truncate=True)

        # Pick the first (oldest) snapshot as the time-travel target.
        oldest = ordered_snapshots.first()
        if oldest is None:
            # first() returns None on an empty frame; subscripting it would raise.
            print(f">>> Bảng {catalog_table} chưa có snapshot nào, không thể time-travel")
            return
        first_snapshot_id = oldest["snapshot_id"]
        print(f">>> Sẽ time-travel về snapshot_id = {first_snapshot_id}")

        # Read the data at that snapshot through the DataFrame reader.
        print(f"=== Dữ liệu tại snapshot {first_snapshot_id} ===")
        df_time_travel = (
            spark.read
            .format("iceberg")
            .option("snapshot-id", first_snapshot_id)
            .load(catalog_table)
        )
        df_time_travel.show(truncate=False)

        # Same read expressed with the SQL VERSION AS OF syntax.
        print(f"=== (SQL) Dữ liệu tại snapshot {first_snapshot_id} ===")
        df_sql = spark.sql(f"SELECT * FROM {catalog_table} VERSION AS OF {first_snapshot_id}")
        df_sql.show(truncate=False)

        # Time travel by timestamp (here: five minutes ago).
        # Iceberg's DataFrame read option is `as-of-timestamp` and takes epoch
        # milliseconds; `timestamp-as-of` is not an option it documents.
        # NOTE(review): Iceberg is expected to reject a timestamp that is older
        # than the table's first snapshot -- confirm, and widen/narrow the
        # offset to match when the table was actually written.
        as_of = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=5)
        ts = as_of.strftime("%Y-%m-%d %H:%M:%S")
        as_of_millis = int(as_of.timestamp() * 1000)
        print(f">>> Sẽ time-travel theo timestamp = {ts}")
        print(f"=== Dữ liệu tại timestamp {ts} ===")
        df_ts = (
            spark.read
            .format("iceberg")
            .option("as-of-timestamp", str(as_of_millis))
            .load(catalog_table)
        )
        df_ts.show(truncate=False)
    finally:
        # Always release the session, even when one of the steps above fails.
        spark.stop()

# Entry-point guard: call main() only when this code's module name is "__main__".
if __name__ == "__main__":
   main()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/31 08:34:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/31 08:34:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


NameError: name 'T' is not defined