In [1]:
import os

from pyspark.sql import SparkSession

# Maven coordinates passed to spark.jars.packages: the Iceberg runtime for
# Spark 3.5 / Scala 2.12, the Hadoop S3A connector and the AWS SDK bundle.
packages = ",".join([
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.2",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.530",
])

# Object-store credentials come from the environment when set, so real secrets
# never need to be written into the notebook. The fallback values are meant for
# a local development setup only.
s3_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
      .appName("Iceberg Lakehouse on MinIO")
      .config("spark.jars.packages", packages)
      .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
      .config("spark.sql.defaultCatalog", "lake")

      # Iceberg catalog (HadoopCatalog): table metadata lives under the warehouse path
      .config("spark.sql.catalog.lake", "org.apache.iceberg.spark.SparkCatalog")
      .config("spark.sql.catalog.lake.type", "hadoop")
      .config("spark.sql.catalog.lake.warehouse", "s3a://my-bucket/iceberg-lakehouse")

      # S3A connector settings for the S3-compatible endpoint (plain HTTP, path-style URLs)
      .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
      .config("spark.hadoop.fs.s3a.path.style.access", "true")
      .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
      .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
      .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
      .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

      .getOrCreate()
)

print("Spark:", spark.version)


Spark: 3.5.3


In [3]:
# Work inside the "lake" catalog and make sure the target namespace exists.
spark.catalog.setCurrentCatalog("lake")
spark.sql("CREATE NAMESPACE IF NOT EXISTS nyc")

# CTAS: build the Iceberg table from the two yellow_tripdata parquet files
# (2025-05 and 2025-06). IF NOT EXISTS leaves an existing table untouched.
create_taxis_sql = """
CREATE TABLE IF NOT EXISTS nyc.taxis
USING iceberg
AS
SELECT * FROM parquet.`s3a://my-bucket/iceberg-lakehouse/yellow_tripdata_2025-05.parquet`
UNION ALL
SELECT * FROM parquet.`s3a://my-bucket/iceberg-lakehouse/yellow_tripdata_2025-06.parquet`
"""
spark.sql(create_taxis_sql)


DataFrame[]

In [8]:
# Snapshot log of nyc.taxis from the `history` metadata table, newest entry first.
history_sql = """
SELECT * FROM nyc.taxis.history ORDER BY made_current_at DESC;
"""
spark.sql(history_sql).show(truncate=False)


+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2025-08-08 11:46:26.317|1295814594149623800|NULL     |true               |
+-----------------------+-------------------+---------+-------------------+



In [9]:
# List the table's files via the `files` metadata table: content type, path,
# row count and size in bytes for each file.
files_sql = """
SELECT content, file_path, record_count, file_size_in_bytes
FROM nyc.taxis.files;
"""
files_df = spark.sql(files_sql)
files_df.show(truncate=False)


+-------+--------------------------------------------------------------------------------------------------------------+------------+------------------+
|content|file_path                                                                                                     |record_count|file_size_in_bytes|
+-------+--------------------------------------------------------------------------------------------------------------+------------+------------------+
|0      |s3a://my-bucket/iceberg-lakehouse/nyc/taxis/data/00001-3-900a1090-458d-49a7-aaa0-faf16f9bc521-0-00001.parquet |1048576     |16886663          |
|0      |s3a://my-bucket/iceberg-lakehouse/nyc/taxis/data/00003-5-900a1090-458d-49a7-aaa0-faf16f9bc521-0-00001.parquet |1048576     |16960598          |
|0      |s3a://my-bucket/iceberg-lakehouse/nyc/taxis/data/00006-8-900a1090-458d-49a7-aaa0-faf16f9bc521-0-00001.parquet |1048576     |16872467          |
|0      |s3a://my-bucket/iceberg-lakehouse/nyc/taxis/data/00009-11-900a1090-458d-4

In [11]:
# Simulate another batch arriving: append roughly 10% of the 2025-06 file.
# rand() is given a fixed seed so the sample is repeatable for the same input
# file and partitioning instead of changing on every run.
spark.sql("""-- simulate another batch arriving
INSERT INTO nyc.taxis
SELECT * FROM parquet.`s3a://my-bucket/iceberg-lakehouse/yellow_tripdata_2025-06.parquet` WHERE rand(42) < 0.10;
""")

# The append should appear as a new history entry whose parent_id is the
# previous snapshot.
spark.sql("""-- list history entries, newest first
SELECT * FROM nyc.taxis.history ORDER BY made_current_at DESC;
""").show(truncate=False)


+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2025-08-08 13:31:27.705|2624229959482912117|1295814594149623800|true               |
|2025-08-08 11:46:26.317|1295814594149623800|NULL               |true               |
+-----------------------+-------------------+-------------------+-------------------+



In [16]:

# Time travel: count the rows visible at each snapshot.
# The IDs are copied from the history output (initial load first, then the
# appended batch); they are specific to this warehouse and must be replaced
# if the table is rebuilt.
for snapshot_id in (1295814594149623800, 2624229959482912117):
    spark.sql(
        f"SELECT count(*) FROM nyc.taxis VERSION AS OF {snapshot_id}"
    ).show(truncate=False)


+--------+
|count(1)|
+--------+
|8914805 |
+--------+

+--------+
|count(1)|
+--------+
|9348730 |
+--------+



In [18]:
# Roll the table back so the snapshot with this ID (the first entry in the
# history output) becomes the current state again.
rollback_call = " \nCALL lake.system.rollback_to_snapshot('nyc.taxis', 1295814594149623800)\n"
spark.sql(rollback_call)


DataFrame[previous_snapshot_id: bigint, current_snapshot_id: bigint]

In [19]:

# Confirm the rollback by listing the history table, newest entry first.
history_after_rollback = spark.sql("""-- verify it took effect
SELECT * FROM nyc.taxis.history ORDER BY made_current_at DESC;
""")
history_after_rollback.show(truncate=False)

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2025-08-08 13:42:59.89 |1295814594149623800|NULL               |true               |
|2025-08-08 13:31:27.705|2624229959482912117|1295814594149623800|false              |
|2025-08-08 11:46:26.317|1295814594149623800|NULL               |true               |
+-----------------------+-------------------+-------------------+-------------------+



In [20]:
# Row count of the table's current state (no time-travel clause).
spark.sql("""-- row count of the current table state
SELECT count(*) FROM nyc.taxis;
""").show(truncate=False)

+--------+
|count(1)|
+--------+
|8914805 |
+--------+



In [21]:


# Schema evolution: add the column only when it is missing. ADD COLUMN fails
# if the column already exists, which would break this cell on a second run.
existing_columns = {name.lower() for name in spark.table("nyc.taxis").columns}
if "fare_bucket" not in existing_columns:
    spark.sql("""ALTER TABLE nyc.taxis ADD COLUMN fare_bucket STRING AFTER total_amount;
""")

# Fill the new column from total_amount; rows with a NULL total_amount are
# excluded by the WHERE clause and keep a NULL fare_bucket.
spark.sql("""-- update a derived field to prove reads see the new column
UPDATE nyc.taxis
SET fare_bucket = CASE
  WHEN total_amount < 10 THEN 'low'
  WHEN total_amount < 30 THEN 'mid'
  ELSE 'high'
END
WHERE total_amount IS NOT NULL;
""")

# Distribution of the derived buckets.
spark.sql("""SELECT fare_bucket, COUNT(*) FROM nyc.taxis GROUP BY fare_bucket;
""").show(truncate=False)


+-----------+--------+
|fare_bucket|count(1)|
+-----------+--------+
|low        |765654  |
|mid        |5743528 |
|high       |2405623 |
+-----------+--------+

