In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, avg, sum, to_date, to_timestamp

# Build (or reuse) the SparkSession shared by every cell in this notebook.
# All settings live in one mapping so they can be read at a glance.
spark_settings = {
    "spark.submit.deployMode": "client",
    "spark.sql.shuffle.partitions": "4",
    # Register a catalog named "iceberg": Hadoop-type, warehouse under gs://.
    "spark.sql.catalog.iceberg": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.iceberg.type": "hadoop",
    "spark.sql.catalog.iceberg.warehouse": "gs://gks-tpch/iceberg_warehouse",
}

session_builder = SparkSession.builder.appName("TruncatePartition").master("yarn")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/07 01:51:53 INFO SparkEnv: Registering MapOutputTracker
25/02/07 01:51:53 INFO SparkEnv: Registering BlockManagerMaster
25/02/07 01:51:53 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/02/07 01:51:53 INFO SparkEnv: Registering OutputCommitCoordinator


In [2]:
"""
Truncate the column value to its first few characters to form partitions, e.g. for product code, serial number, etc.
"""

'\nTruncate the column value to its first few characters to form partitions, e.g. for product code, serial number, etc.\n'

In [3]:
# DDL for the demo table. Rows are partitioned by TRUNCATE(4, product_code),
# i.e. by a width-4 truncation of the product code.
# NOTE: plain CREATE TABLE -- this cell raises if the table already exists.
create_products_ddl = """
CREATE TABLE iceberg.partdemo.products (
    product_id BIGINT,
    product_code STRING,
    product_name STRING,
    category STRING,
    price DOUBLE
) USING iceberg
PARTITIONED BY (TRUNCATE(4, product_code))
"""
spark.sql(create_products_ddl)

DataFrame[]

In [4]:
"""
 Electronics products (ELEC123, ELEC456) will be stored under partition ELEC.
 Home appliances (HOME789, HOME567) will be stored under partition HOME.
 Fashion (FASH001) gets its own partition (FASH).
"""

# Seed the table with five rows whose codes start with ELEC, HOME and FASH.
insert_products_sql = """
INSERT INTO iceberg.partdemo.products VALUES
(1, 'ELEC123', 'Smartphone', 'Electronics', 499.99),
(2, 'ELEC456', 'Laptop', 'Electronics', 999.99),
(3, 'HOME789', 'Vacuum Cleaner', 'Home Appliances', 199.99),
(4, 'HOME567', 'Air Purifier', 'Home Appliances', 299.99),
(5, 'FASH001', 'Sneakers', 'Fashion', 79.99);
"""
spark.sql(insert_products_sql)

                                                                                

DataFrame[]

In [5]:
# Partition pruning demo.
# The prefix filter matches the 4-character truncation used for partitioning,
# so the scan is expected to touch only the ELEC partition.
elec_products_sql = """
SELECT * FROM iceberg.partdemo.products WHERE product_code LIKE 'ELEC%';
"""
elec_products = spark.sql(elec_products_sql)
elec_products.show()

                                                                                

+----------+------------+------------+-----------+------+
|product_id|product_code|product_name|   category| price|
+----------+------------+------------+-----------+------+
|         1|     ELEC123|  Smartphone|Electronics|499.99|
|         2|     ELEC456|      Laptop|Electronics|999.99|
+----------+------------+------------+-----------+------+



In [6]:
# List each data file together with the partition value it belongs to,
# using the table's `files` metadata table.
files_after_insert_sql = """
SELECT partition, file_path FROM iceberg.partdemo.products.files;
"""
files_after_insert = spark.sql(files_after_insert_sql)
files_after_insert.show()

+---------+--------------------+
|partition|           file_path|
+---------+--------------------+
|   {ELEC}|gs://gks-tpch/ice...|
|   {HOME}|gs://gks-tpch/ice...|
|   {FASH}|gs://gks-tpch/ice...|
+---------+--------------------+



In [7]:
# Change the code of product 1; the new value keeps the same 'ELEC' prefix.
update_product_sql = """
UPDATE iceberg.partdemo.products 
SET product_code = 'ELEC999' 
WHERE product_id = 1;
"""
spark.sql(update_product_sql)

                                                                                

DataFrame[]

In [8]:
# Re-list the data files after the UPDATE to see which partitions hold files.
files_after_update_sql = """
SELECT partition, file_path FROM iceberg.partdemo.products.files;
"""
files_after_update = spark.sql(files_after_update_sql)
files_after_update.show()

+---------+--------------------+
|partition|           file_path|
+---------+--------------------+
|   {ELEC}|gs://gks-tpch/ice...|
|   {HOME}|gs://gks-tpch/ice...|
|   {FASH}|gs://gks-tpch/ice...|
+---------+--------------------+



In [9]:
# Remove a single row, identified by its exact product code.
delete_product_sql = """
DELETE FROM iceberg.partdemo.products WHERE product_code = 'HOME789';
"""
spark.sql(delete_product_sql)

DataFrame[]

In [11]:
# Inspect the `history` metadata table: one row per snapshot that became
# current, with its parent snapshot id.
history_sql = """
SELECT * FROM iceberg.partdemo.products.history;
"""
history_after_delete = spark.sql(history_sql)
history_after_delete.show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-02-07 01:53:...| 917093135482276915|               NULL|               true|
|2025-02-07 01:54:...|6860277881744466835| 917093135482276915|               true|
|2025-02-07 01:54:...|4632508004002034742|6860277881744466835|               true|
+--------------------+-------------------+-------------------+-------------------+



In [13]:
# Roll the table back to the snapshot that preceded the current one (at this
# point in the notebook: the state before the DELETE).
#
# The target id is looked up from the `history` metadata table instead of
# being hard-coded: snapshot ids are assigned by Iceberg at commit time, so a
# literal id copied from one run will not exist once the table is re-created.
#
# NOTE: re-running this cell on its own rolls back one more step each time,
# because it always targets the parent of whatever snapshot is current.
latest_history_entry = spark.sql("""
SELECT snapshot_id, parent_id
FROM iceberg.partdemo.products.history
ORDER BY made_current_at DESC
LIMIT 1
""").first()

# `first()` returns None for an empty history; parent_id is NULL for the very
# first snapshot. In both cases there is nothing to roll back to.
if latest_history_entry is None or latest_history_entry["parent_id"] is None:
    raise ValueError(
        "iceberg.partdemo.products has no earlier snapshot to roll back to"
    )

rollback_snapshot_id = latest_history_entry["parent_id"]

spark.sql(f"""
CALL iceberg.system.rollback_to_snapshot('iceberg.partdemo.products', {rollback_snapshot_id})
""")

DataFrame[previous_snapshot_id: bigint, current_snapshot_id: bigint]

In [14]:
# Show the history again after the rollback; the `is_current_ancestor` column
# tells which snapshots are still ancestors of the current table state.
history_after_rollback_sql = """
SELECT * FROM iceberg.partdemo.products.history;
"""
history_after_rollback = spark.sql(history_after_rollback_sql)
history_after_rollback.show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-02-07 01:53:...| 917093135482276915|               NULL|               true|
|2025-02-07 01:54:...|6860277881744466835| 917093135482276915|               true|
|2025-02-07 01:54:...|4632508004002034742|6860277881744466835|              false|
|2025-02-07 01:55:...|6860277881744466835| 917093135482276915|               true|
+--------------------+-------------------+-------------------+-------------------+



In [15]:
# Read the full table to confirm which rows are visible after the rollback.
all_products_sql = """
SELECT * FROM iceberg.partdemo.products;
"""
products_after_rollback = spark.sql(all_products_sql)
products_after_rollback.show()

+----------+------------+--------------+---------------+------+
|product_id|product_code|  product_name|       category| price|
+----------+------------+--------------+---------------+------+
|         1|     ELEC999|    Smartphone|    Electronics|499.99|
|         2|     ELEC456|        Laptop|    Electronics|999.99|
|         3|     HOME789|Vacuum Cleaner|Home Appliances|199.99|
|         4|     HOME567|  Air Purifier|Home Appliances|299.99|
|         5|     FASH001|      Sneakers|        Fashion| 79.99|
+----------+------------+--------------+---------------+------+



In [16]:
# Stop the Spark session now that the demo is finished; cells above cannot be
# re-run after this without re-creating the session first.
spark.stop()