In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, avg, sum, to_date, to_timestamp

# Build (or reuse) the Spark session. An Iceberg catalog named "iceberg" is
# registered as a Hadoop-type catalog whose warehouse lives in the GCS bucket below.
spark = (
    SparkSession.builder
    .appName("IdenityPartition")
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hadoop")
    .config("spark.sql.catalog.iceberg.warehouse", "gs://gks-tpch/iceberg_warehouse")
    .getOrCreate()
)

25/02/07 01:24:07 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
"""
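Hidden Partitioning: Iceberg tracks each data file's partition values in table metadata rather than exposing them through directory paths, so an UPDATE that changes a partition column’s value needs no manual data movement; Iceberg rewrites the affected rows into files that belong to the new partition.
Partition Evolution Support: The partition spec itself is stored in table metadata, so Iceberg can manage changes to the partition mapping without the user rewriting existing data.
Query Optimization Stays Intact: The query planner still prunes data files by partition value after such an update.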
"""

In [12]:
# Create the demo namespace in the Iceberg catalog if it is not there yet.
# The trailing semicolon suppresses the returned DataFrame's repr in the cell output.
spark.sql(
    "CREATE DATABASE IF NOT EXISTS iceberg.partdemo"
);

In [13]:
# List the namespaces in the Iceberg catalog to confirm `partdemo` exists.
iceberg_namespaces = spark.sql("SHOW DATABASES in iceberg")
iceberg_namespaces.show()

+---------+
|namespace|
+---------+
|     tpch|
| partdemo|
|movielens|
+---------+



In [16]:
# DDL for the demo table: an Iceberg table partitioned directly on the
# `region` column (no transform applied to the partition source column).
create_orders_ddl = """
CREATE TABLE IF NOT EXISTS iceberg.partdemo.orders (
    order_id BIGINT,
    customer_id BIGINT,
    order_status STRING,
    total_price DOUBLE,
    order_date DATE,
    region STRING
) USING iceberg
PARTITIONED BY (region);
"""
spark.sql(create_orders_ddl)

DataFrame[]

In [18]:
# Seed the demo table with one order per region (North, South, East, West),
# i.e. one row for each of four partitions.
#
# INSERT OVERWRITE is used instead of INSERT INTO so that this cell is
# idempotent: the table is created with IF NOT EXISTS and therefore survives
# kernel restarts, so a plain INSERT INTO would append four duplicate rows on
# every re-run and the before/after comparisons further down would no longer
# match. Overwriting replaces the existing rows for these regions (the whole
# table under Spark's default static overwrite mode) with the same four rows,
# which also resets order 2 back to 'South' so the UPDATE demo can be replayed.
spark.sql("""
INSERT OVERWRITE iceberg.partdemo.orders
SELECT 1, 101, 'Shipped', 250.75, CAST('2024-01-01' AS DATE), 'North'
UNION ALL
SELECT 2, 102, 'Pending', 100.50, CAST('2024-01-02' AS DATE), 'South'
UNION ALL
SELECT 3, 103, 'Cancelled', 300.00, CAST('2024-01-03' AS DATE), 'East'
UNION ALL
SELECT 4, 104, 'Shipped', 175.20, CAST('2024-01-04' AS DATE), 'West'
""")

                                                                                

DataFrame[]

In [21]:
# Show the table's columns followed by its partition information section.
orders_description = spark.sql("DESCRIBE TABLE iceberg.partdemo.orders")
orders_description.show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|            order_id|   bigint|   NULL|
|         customer_id|   bigint|   NULL|
|        order_status|   string|   NULL|
|         total_price|   double|   NULL|
|          order_date|     date|   NULL|
|              region|   string|   NULL|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|              region|   string|   NULL|
+--------------------+---------+-------+



In [22]:
# Query the table's `files` metadata table to see the data files behind it;
# truncation is disabled so the full file paths remain visible.
orders_files = spark.sql("SELECT * FROM iceberg.partdemo.orders.files")
orders_files.show(truncate=False)

[Stage 70:>                                                         (0 + 1) / 1]

+-------+---------------------------------------------------------------------------------------------------------------------------------+-----------+-------+---------+------------+------------------+------------------------------------------------------+------------------------------------------------+------------------------------------------------+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|content|file_path                  

                                                                                

In [23]:
# Distinct partition values recorded in the `files` metadata table
# before the partition column is updated.
partitions_before_update = spark.sql(
    "SELECT DISTINCT partition FROM iceberg.partdemo.orders.files"
)
partitions_before_update.show()

+---------+
|partition|
+---------+
|   {West}|
|  {North}|
|   {East}|
|  {South}|
+---------+



In [24]:
# Table contents before the partition column is updated.
orders_before_update = spark.sql("SELECT * FROM iceberg.partdemo.orders")
orders_before_update.show()

[Stage 74:>                                                         (0 + 1) / 1]

+--------+-----------+------------+-----------+----------+------+
|order_id|customer_id|order_status|total_price|order_date|region|
+--------+-----------+------------+-----------+----------+------+
|       4|        104|     Shipped|      175.2|2024-01-04|  West|
|       2|        102|     Pending|      100.5|2024-01-02| South|
|       1|        101|     Shipped|     250.75|2024-01-01| North|
|       3|        103|   Cancelled|      300.0|2024-01-03|  East|
+--------+-----------+------------+-----------+----------+------+



                                                                                

In [25]:
# Change the partition column itself: every order in region 'South' is
# reassigned to region 'West'.
move_south_to_west_sql = """
UPDATE iceberg.partdemo.orders 
SET region = 'West' 
WHERE region = 'South'
"""
spark.sql(move_south_to_west_sql)

DataFrame[]

In [26]:
# Table contents after the UPDATE on the partition column.
orders_after_update = spark.sql("SELECT * FROM iceberg.partdemo.orders")
orders_after_update.show()

+--------+-----------+------------+-----------+----------+------+
|order_id|customer_id|order_status|total_price|order_date|region|
+--------+-----------+------------+-----------+----------+------+
|       2|        102|     Pending|      100.5|2024-01-02|  West|
|       4|        104|     Shipped|      175.2|2024-01-04|  West|
|       1|        101|     Shipped|     250.75|2024-01-01| North|
|       3|        103|   Cancelled|      300.0|2024-01-03|  East|
+--------+-----------+------------+-----------+----------+------+



In [27]:
# Distinct partition values recorded in the `files` metadata table
# after the UPDATE on the partition column.
partitions_after_update = spark.sql(
    "SELECT DISTINCT partition FROM iceberg.partdemo.orders.files"
)
partitions_after_update.show()

+---------+
|partition|
+---------+
|   {West}|
|  {North}|
|   {East}|
+---------+



In [28]:
# Stop the Spark session now that the demo is finished.
spark.stop()