In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, avg, sum, to_date, to_timestamp

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("BucketPartition") \
    .master("yarn") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.sql.shuffle.partitions", "4") \
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")\
    .config("spark.sql.catalog.iceberg.type", "hadoop")\
    .config("spark.sql.catalog.iceberg.warehouse", "gs://gks-tpch/iceberg_warehouse")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/07 01:36:53 INFO SparkEnv: Registering MapOutputTracker
25/02/07 01:36:54 INFO SparkEnv: Registering BlockManagerMaster
25/02/07 01:36:54 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/02/07 01:36:54 INFO SparkEnv: Registering OutputCommitCoordinator


In [None]:
"""
Bucket partitioning helps distribute data evenly across partitions based on a hashing function, 
which is useful when dealing with high-cardinality columns like customer_id.
"""

In [2]:
spark.sql("CREATE DATABASE IF NOT EXISTS iceberg.partdemo")

DataFrame[]

In [3]:
# BUCKET(4, customer_id): Customer IDs will be bucketed into 4 buckets, distributing rows evenly

spark.sql ("""
CREATE TABLE iceberg.partdemo.orders_bucket (
    order_id BIGINT,
    customer_id BIGINT,
    order_status STRING,
    total_price DOUBLE,
    order_date DATE,
    region STRING
) USING iceberg
PARTITIONED BY (region, BUCKET(4, customer_id))
""")

DataFrame[]

In [4]:
spark.sql(
    """
    INSERT INTO iceberg.partdemo.orders_bucket VALUES
(1, 101, 'Shipped', 250.75, CAST('2024-01-01' AS DATE), 'North'),
(2, 102, 'Pending', 100.50, CAST('2024-01-02' AS DATE), 'South'),
(3, 103, 'Cancelled', 300.00, CAST('2024-01-03' AS DATE), 'East'),
(4, 104, 'Shipped', 175.20, CAST('2024-01-04' AS DATE), 'West'),
(5, 105, 'Processing', 150.00, CAST('2024-02-01' AS DATE), 'North');
""")

                                                                                

DataFrame[]

In [5]:
spark.sql("""
SELECT * FROM iceberg.partdemo.orders_bucket WHERE region='North'
""").show()

                                                                                

+--------+-----------+------------+-----------+----------+------+
|order_id|customer_id|order_status|total_price|order_date|region|
+--------+-----------+------------+-----------+----------+------+
|       1|        101|     Shipped|     250.75|2024-01-01| North|
|       5|        105|  Processing|      150.0|2024-02-01| North|
+--------+-----------+------------+-----------+----------+------+



In [6]:
spark.sql("""
SELECT partition, file_path 
FROM iceberg.partdemo.orders_bucket.files
""").show()

+----------+--------------------+
| partition|           file_path|
+----------+--------------------+
|{South, 0}|gs://gks-tpch/ice...|
|{North, 0}|gs://gks-tpch/ice...|
| {West, 0}|gs://gks-tpch/ice...|
| {East, 3}|gs://gks-tpch/ice...|
+----------+--------------------+



In [8]:
spark.sql("""
SELECT partition, file_path 
FROM iceberg.partdemo.orders_bucket.files
""").show()

+----------+--------------------+
| partition|           file_path|
+----------+--------------------+
| {West, 0}|gs://gks-tpch/ice...|
|{North, 0}|gs://gks-tpch/ice...|
| {West, 0}|gs://gks-tpch/ice...|
| {East, 3}|gs://gks-tpch/ice...|
+----------+--------------------+



In [9]:
#  Iceberg automatically manages partition metadata, so you can update partition columns without manually handling partitions.
spark.sql(
"""
UPDATE iceberg.partdemo.orders_bucket 
SET region = 'West' 
WHERE region = 'South'
""").show()

++
||
++
++



In [10]:
#  Iceberg does not move files physically, but the data might be redistributed in a new bucket.
spark.sql("""
UPDATE iceberg.partdemo.orders_bucket 
SET customer_id = 110 
WHERE customer_id = 105
""")

DataFrame[]

In [11]:
spark.sql("""
SELECT partition, file_path 
FROM iceberg.partdemo.orders_bucket.files
""").show()

+----------+--------------------+
| partition|           file_path|
+----------+--------------------+
|{North, 0}|gs://gks-tpch/ice...|
|{North, 2}|gs://gks-tpch/ice...|
| {West, 0}|gs://gks-tpch/ice...|
| {West, 0}|gs://gks-tpch/ice...|
| {East, 3}|gs://gks-tpch/ice...|
+----------+--------------------+



In [12]:
# Deletes are metadata-based, so Iceberg does not physically delete files immediately.


spark.sql(
"""
    DELETE FROM iceberg.partdemo.orders_bucket WHERE order_id = 3
""")

DataFrame[]

In [13]:
spark.sql("""
SELECT * FROM iceberg.partdemo.orders_bucket.history
""").show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-02-07 01:38:...|7684064402365346734|               NULL|               true|
|2025-02-07 01:40:...| 766112837030824669|7684064402365346734|               true|
|2025-02-07 01:41:...|6952680204985605665| 766112837030824669|               true|
|2025-02-07 01:41:...| 198893602058254204|6952680204985605665|               true|
|2025-02-07 01:42:...|6750072295676235037| 198893602058254204|               true|
+--------------------+-------------------+-------------------+-------------------+



In [14]:
spark.sql("""
CALL iceberg.system.rollback_to_snapshot('iceberg.partdemo.orders_bucket', 766112837030824669)
""").show()

+--------------------+-------------------+
|previous_snapshot_id|current_snapshot_id|
+--------------------+-------------------+
| 6750072295676235037| 766112837030824669|
+--------------------+-------------------+



In [15]:
spark.sql("""
SELECT * FROM iceberg.partdemo.orders_bucket
""").show()

+--------+-----------+------------+-----------+----------+------+
|order_id|customer_id|order_status|total_price|order_date|region|
+--------+-----------+------------+-----------+----------+------+
|       2|        102|     Pending|      100.5|2024-01-02|  West|
|       1|        101|     Shipped|     250.75|2024-01-01| North|
|       5|        105|  Processing|      150.0|2024-02-01| North|
|       4|        104|     Shipped|      175.2|2024-01-04|  West|
|       3|        103|   Cancelled|      300.0|2024-01-03|  East|
+--------+-----------+------------+-----------+----------+------+



                                                                                

In [16]:
spark.stop()