In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, avg, sum, to_date, to_timestamp

# Build (or reuse) the Spark session for this notebook: YARN in client deploy
# mode, 4 shuffle partitions, and a Hadoop-type Iceberg catalog registered under
# the name "iceberg" with its warehouse on GCS.
spark = (
    SparkSession.builder
    .appName("YearMonthDayHourPartition")
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hadoop")
    .config("spark.sql.catalog.iceberg.warehouse", "gs://gks-tpch/iceberg_warehouse")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/07 02:01:30 INFO SparkEnv: Registering MapOutputTracker
25/02/07 02:01:30 INFO SparkEnv: Registering BlockManagerMaster
25/02/07 02:01:30 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/02/07 02:01:30 INFO SparkEnv: Registering OutputCommitCoordinator


In [3]:
# Create the demo sales table, partitioned with two Iceberg transforms:
#   DAY(sale_date)  - one partition value per calendar day
#   HOUR(sale_time) - one partition value per hour of the timestamp
# Queries keep filtering on the plain sale_date / sale_time columns; the
# partition values are derived from them by Iceberg.
# IF NOT EXISTS keeps this cell re-runnable (Restart & Run All) once the table
# already exists in the warehouse.

spark.sql("""
CREATE TABLE IF NOT EXISTS iceberg.partdemo.sales (
    sale_id BIGINT,
    customer_id BIGINT,
    sale_date DATE,
    sale_time TIMESTAMP,
    product STRING,
    amount DOUBLE
) USING iceberg
PARTITIONED BY (DAY(sale_date), HOUR(sale_time));
""")


DataFrame[]

In [4]:
# Seed the table with four sample sales, one per month from January to April
# 2024, each with a different date and hour.
seed_sales_sql = """
INSERT INTO iceberg.partdemo.sales VALUES
(1, 101, CAST('2024-01-10' AS DATE), CAST('2024-01-10 10:30:00' AS TIMESTAMP), 'Laptop', 1200.50),
(2, 102, CAST('2024-02-15' AS DATE), CAST('2024-02-15 14:45:00' AS TIMESTAMP), 'Smartphone', 800.00),
(3, 103, CAST('2024-03-20' AS DATE), CAST('2024-03-20 09:15:00' AS TIMESTAMP), 'Headphones', 150.75),
(4, 104, CAST('2024-04-05' AS DATE), CAST('2024-04-05 20:00:00' AS TIMESTAMP), 'Tablet', 300.99);
"""
spark.sql(seed_sales_sql)


                                                                                

DataFrame[]

In [5]:
# Fetch every sale recorded on 2024-02-15 by filtering on the sale_date column.
sales_on_day = spark.sql("""
SELECT * FROM iceberg.partdemo.sales WHERE sale_date = CAST('2024-02-15' AS DATE);
""")
sales_on_day.show()

                                                                                

+-------+-----------+----------+-------------------+----------+------+
|sale_id|customer_id| sale_date|          sale_time|   product|amount|
+-------+-----------+----------+-------------------+----------+------+
|      2|        102|2024-02-15|2024-02-15 14:45:00|Smartphone| 800.0|
+-------+-----------+----------+-------------------+----------+------+



In [6]:
# Sales whose timestamp has hour-of-day 10 (10:00 - 10:59), on any date.
# NOTE(review): HOUR() in this predicate is the hour-of-day SQL function; it is
# presumably not answered from the HOUR(sale_time) partition values, so this
# query may scan every partition - confirm with EXPLAIN.
ten_oclock_sales_sql = """
SELECT * FROM iceberg.partdemo.sales WHERE HOUR(sale_time) = 10;
"""
spark.sql(ten_oclock_sales_sql).show()

[Stage 4:>                                                          (0 + 1) / 1]

+-------+-----------+----------+-------------------+-------+------+
|sale_id|customer_id| sale_date|          sale_time|product|amount|
+-------+-----------+----------+-------------------+-------+------+
|      1|        101|2024-01-10|2024-01-10 10:30:00| Laptop|1200.5|
+-------+-----------+----------+-------------------+-------+------+



                                                                                

In [8]:
"""
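{2024-01-10, 473578} - the first field is the DAY(sale_date) partition value, the second is the HOUR(sale_time) partition value.
Iceberg's hour transform stores the number of whole hours elapsed since the Unix epoch (1970-01-01 00:00:00 UTC), not seconds since midnight.
For example, 473578 = 19732 days * 24 + 10, i.e. hour 10 of 2024-01-10; likewise 475652 = 19818 days * 24 + 20, i.e. hour 20 of 2024-04-05.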
"""
# List each data file of the table together with the partition tuple it
# belongs to, read from the table's `files` metadata table.
partition_files = spark.sql("""
SELECT partition, file_path FROM iceberg.partdemo.sales.files;
""")
partition_files.show()

+--------------------+--------------------+
|           partition|           file_path|
+--------------------+--------------------+
|{2024-01-10, 473578}|gs://gks-tpch/ice...|
|{2024-04-05, 475652}|gs://gks-tpch/ice...|
|{2024-03-20, 475257}|gs://gks-tpch/ice...|
|{2024-02-15, 474446}|gs://gks-tpch/ice...|
+--------------------+--------------------+



In [9]:
# Move sale 1 to 2024-05-01 08:00:00 by rewriting both its date and timestamp.
move_sale_sql = """
UPDATE iceberg.partdemo.sales 
SET sale_date = CAST('2024-05-01' AS DATE), sale_time = CAST('2024-05-01 08:00:00' AS TIMESTAMP)
WHERE sale_id = 1;
"""
spark.sql(move_sale_sql)

DataFrame[]

In [10]:
# Remove every sale dated 2024-02-15.
delete_day_sql = """
DELETE FROM iceberg.partdemo.sales WHERE sale_date = CAST('2024-02-15' AS DATE);
"""
spark.sql(delete_day_sql)

DataFrame[]

In [12]:
# Show the table's snapshot history: when each snapshot became current, its id,
# its parent id, and whether it is an ancestor of the current snapshot.
snapshot_history = spark.sql("""
SELECT * FROM iceberg.partdemo.sales.history;
""")
snapshot_history.show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-02-07 02:05:...|7121681092154638402|               NULL|               true|
|2025-02-07 02:07:...|3076000011733852005|7121681092154638402|               true|
|2025-02-07 02:07:...|7213889487843157847|3076000011733852005|               true|
+--------------------+-------------------+-------------------+-------------------+



In [13]:
# Roll the table back by one snapshot, i.e. undo the most recent commit.
# Snapshot ids are generated at commit time and differ on every run, so the
# target is read from the `history` metadata table instead of being pasted in
# by hand (in the recorded run it resolved to 3076000011733852005).
latest_history_entry = spark.sql("""
SELECT snapshot_id, parent_id
FROM iceberg.partdemo.sales.history
ORDER BY made_current_at DESC
LIMIT 1
""").first()

# Fail with a clear message instead of sending "None" to the procedure when
# the table has no snapshots yet or the current snapshot has no parent.
if latest_history_entry is None or latest_history_entry["parent_id"] is None:
    raise ValueError("iceberg.partdemo.sales has no parent snapshot to roll back to")

rollback_target_id = latest_history_entry["parent_id"]

spark.sql(f"""
CALL iceberg.system.rollback_to_snapshot('iceberg.partdemo.sales', {rollback_target_id})
""")

DataFrame[previous_snapshot_id: bigint, current_snapshot_id: bigint]

In [14]:
# Read the whole table back to inspect its current contents.
all_sales = spark.sql("""
SELECT * FROM iceberg.partdemo.sales;
""")
all_sales.show()

[Stage 13:>                                                         (0 + 1) / 1]

+-------+-----------+----------+-------------------+----------+------+
|sale_id|customer_id| sale_date|          sale_time|   product|amount|
+-------+-----------+----------+-------------------+----------+------+
|      4|        104|2024-04-05|2024-04-05 20:00:00|    Tablet|300.99|
|      3|        103|2024-03-20|2024-03-20 09:15:00|Headphones|150.75|
|      2|        102|2024-02-15|2024-02-15 14:45:00|Smartphone| 800.0|
|      1|        101|2024-05-01|2024-05-01 08:00:00|    Laptop|1200.5|
+-------+-----------+----------+-------------------+----------+------+



                                                                                