### Import PySpark and Create SparkSession
 - The SparkSession should be automatically configured by PYSPARK_SUBMIT_ARGS
 - defined in docker-compose.yml for the jupyterlab service.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os

In [2]:
import pyspark

In [3]:
pyspark.__version__

'3.5.6'

### Create SparkSession, manually set configs

In [4]:
spark = (SparkSession.builder
    .master("spark://spark-master:7077")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg_jdbc", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg_jdbc.uri", "jdbc:postgresql://postgres:5432/iceberg_catalog")
    .config("spark.sql.catalog.iceberg_jdbc.jdbc.user", "iceberg")
    .config("spark.sql.catalog.iceberg_jdbc.jdbc.password", "icebergpassword")
    .config("spark.sql.catalog.iceberg_jdbc.warehouse", "s3a://iceberg-warehouse/")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.endpoint.region","eu-central-1")
    .config("spark.hadoop.fs.s3a.access.key", "admin")
    .config("spark.hadoop.fs.s3a.secret.key", "password")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    ).getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.postgresql#postgresql added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5935c306-9df2-463d-849d-ef98cbd507a0;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.postgresql#postgresql;42.6.0 in central
	found org.checkerframework#checker-qual;3.31.0 in central
	found org.apache.spark#spark-avro_2.12;3.5.6 in central
	found org.tukaani#xz;1.9 in central
:: resolution report :: resolve 132ms :: artifacts dl 10ms
	:: modules in use:


In [5]:
spark

In [6]:
print("SparkSession created successfully!")
print(f"Spark version: {spark.version}")
# Verify Iceberg catalog configuration - Should show default once catalog is used
spark.sql("SHOW CURRENT NAMESPACE").show()
# Define the catalog name we configured

SparkSession created successfully!
Spark version: 3.5.6
+-------------+---------+
|      catalog|namespace|
+-------------+---------+
|spark_catalog|  default|
+-------------+---------+



In [7]:
iceberg_catalog_name = "iceberg_jdbc" # Must match spark.sql.catalog.iceberg_jdbc in config

Create a Database/Schema in Iceberg using Spark

In [8]:
db_name = "spark_schema"

In [9]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {iceberg_catalog_name}.{db_name}").show()
spark.sql(f"USE {iceberg_catalog_name}.{db_name}")
spark.sql(f"SHOW DATABASES IN {iceberg_catalog_name}").show()
# In Spark, `DATABASE` and `SCHEMA` are often used interchangeably.
# Iceberg uses `NAMESPACE`.

++
||
++
++

+------------+
|   namespace|
+------------+
|spark_schema|
+------------+



 Create an Iceberg Table with Spark SQL

In [10]:
table_name = "spark_orders"
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {iceberg_catalog_name}.{db_name}.{table_name} (
    order_id STRING,
    customer_id STRING,
    order_date DATE,
    amount DECIMAL(10, 2),
    category STRING
)
USING iceberg
PARTITIONED BY (category, days(order_date)) -- Column partitioning and hidden partitioning by day
TBLPROPERTIES (
    'write.format.default'='parquet',
    'format-version'='2'
)
""").show()

25/07/10 10:39:30 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


++
||
++
++



In [11]:
spark.sql(f"SHOW TABLES IN {iceberg_catalog_name}.{db_name}").show()

+------------+------------+-----------+
|   namespace|   tableName|isTemporary|
+------------+------------+-----------+
|spark_schema|spark_orders|      false|
+------------+------------+-----------+



Insert Data using Spark SQL

In [12]:
spark.sql(f"""
INSERT INTO {iceberg_catalog_name}.{db_name}.{table_name} VALUES
('ORD001', 'CUST101', DATE '2023-01-15', 100.50, 'electronics'),
('ORD002', 'CUST102', DATE '2023-01-16', 75.20, 'books'),
('ORD003', 'CUST101', DATE '2023-01-16', 250.00, 'electronics'),
('ORD004', 'CUST103', DATE '2023-01-17', 45.99, 'home'),
('ORD005', 'CUST102', DATE '2023-01-18', 120.00, 'books')
""")
print(f"Data inserted into {db_name}.{table_name}")

                                                                                

Data inserted into spark_schema.spark_orders


In [13]:
print(f"Querying all data from {db_name}.{table_name}:")
spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name} ORDER BY order_date").show()

Querying all data from spark_schema.spark_orders:
+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD001|    CUST101|2023-01-15|100.50|electronics|
|  ORD003|    CUST101|2023-01-16|250.00|electronics|
|  ORD002|    CUST102|2023-01-16| 75.20|      books|
|  ORD004|    CUST103|2023-01-17| 45.99|       home|
|  ORD005|    CUST102|2023-01-18|120.00|      books|
+--------+-----------+----------+------+-----------+



Select Data using Spark SQL

In [14]:
print(f"Querying all data from {db_name}.{table_name}:")
spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name} ORDER BY order_date").show()

Querying all data from spark_schema.spark_orders:
+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD001|    CUST101|2023-01-15|100.50|electronics|
|  ORD003|    CUST101|2023-01-16|250.00|electronics|
|  ORD002|    CUST102|2023-01-16| 75.20|      books|
|  ORD004|    CUST103|2023-01-17| 45.99|       home|
|  ORD005|    CUST102|2023-01-18|120.00|      books|
+--------+-----------+----------+------+-----------+



### Querying electronics orders (demonstrating partition filter pushdown):

In [15]:
spark.sql(f"""
SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}
WHERE category = 'electronics' AND order_date = DATE '2023-01-16'
""").show()

+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD003|    CUST101|2023-01-16|250.00|electronics|
+--------+-----------+----------+------+-----------+



To see the query plan:

In [16]:
print(spark.sql(f"""EXPLAIN SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name} 
          WHERE category = 'electronics' AND order_date = DATE '2023-01-16'""")\
.collect()[0][0])#.show(truncate=False)

== Physical Plan ==
*(1) Filter (order_date#206 = 2023-01-16)
+- *(1) ColumnarToRow
   +- BatchScan iceberg_jdbc.spark_schema.spark_orders[order_id#204, customer_id#205, order_date#206, amount#207, category#208] iceberg_jdbc.spark_schema.spark_orders (branch=null) [filters=category IS NOT NULL, order_date IS NOT NULL, category = 'electronics', order_date = 19373, groupedBy=] RuntimeFilters: []




### DataFrame API for Writing and Reading

In [17]:
from pyspark.sql.functions import col, to_date, lit
from pyspark.sql.types import StructType, StructField, StringType, DateType,FloatType

In [18]:
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("order_date", DateType(), True),
    StructField("amount", FloatType(), True),
    StructField("category", StringType(), True)
])
data = [
    ('ORD006', 'CUST104', '2023-01-19', 300.20, 'electronics'),
    ('ORD007', 'CUST105', '2023-01-19', 22.2, 'books')
]

In [19]:
# Convert string dates to DateType for DataFrame creation
from datetime import datetime
data_typed = [(r[0], r[1], datetime.strptime(r[2], '%Y-%m-%d').date(), r[3], r[4]) for r in data]

In [20]:
data_typed

[('ORD006', 'CUST104', datetime.date(2023, 1, 19), 300.2, 'electronics'),
 ('ORD007', 'CUST105', datetime.date(2023, 1, 19), 22.2, 'books')]

In [21]:
new_orders_df = spark.createDataFrame(data_typed, schema=schema)
print("\nNew orders DataFrame:")
new_orders_df.show()


New orders DataFrame:


[Stage 6:>                                                          (0 + 1) / 1]

+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD006|    CUST104|2023-01-19| 300.2|electronics|
|  ORD007|    CUST105|2023-01-19|  22.2|      books|
+--------+-----------+----------+------+-----------+



                                                                                

In [22]:
# Append data using DataFrameWriter
new_orders_df.writeTo(f"{iceberg_catalog_name}.{db_name}.{table_name}").append()

print(f"\nData after DataFrame append:")
spark.sql(f"""SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}
           ORDER BY order_date, order_id""").show()


Data after DataFrame append:
+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD001|    CUST101|2023-01-15|100.50|electronics|
|  ORD002|    CUST102|2023-01-16| 75.20|      books|
|  ORD003|    CUST101|2023-01-16|250.00|electronics|
|  ORD004|    CUST103|2023-01-17| 45.99|       home|
|  ORD005|    CUST102|2023-01-18|120.00|      books|
|  ORD006|    CUST104|2023-01-19|300.20|electronics|
|  ORD007|    CUST105|2023-01-19| 22.20|      books|
+--------+-----------+----------+------+-----------+



#### Iceberg Metadata - Snapshots, History, Manifests, Files

In [23]:
print(f"\nSnapshots for {db_name}.{table_name}:")
spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}.snapshots").show()


Snapshots for spark_schema.spark_orders:
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2025-07-10 10:39:...|7702743561916277481|               NULL|   append|s3a://iceberg-war...|{spark.app.id -> ...|
|2025-07-10 10:39:...|2531062654959842889|7702743561916277481|   append|s3a://iceberg-war...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+



In [24]:
print(f"\nManifests for {db_name}.{table_name}:")
spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}.manifests").toPandas()


Manifests for spark_schema.spark_orders:


Unnamed: 0,content,path,length,partition_spec_id,added_snapshot_id,added_data_files_count,existing_data_files_count,deleted_data_files_count,added_delete_files_count,existing_delete_files_count,deleted_delete_files_count,partition_summaries
0,0,s3a://iceberg-warehouse/spark_schema/spark_ord...,7828,0,2531062654959842889,2,0,0,0,0,0,"[(False, False, books, electronics), (False, F..."
1,0,s3a://iceberg-warehouse/spark_schema/spark_ord...,7946,0,7702743561916277481,5,0,0,0,0,0,"[(False, False, books, home), (False, False, 2..."


In [25]:
print(f"\nData Files for {db_name}.{table_name}:")
spark.sql(f"SELECT file_path, record_count, partition FROM {iceberg_catalog_name}.{db_name}.{table_name}.files")\
.show(truncate=False)


Data Files for spark_schema.spark_orders:
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------------------+
|file_path                                                                                                                                                          |record_count|partition                |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------------------+
|s3a://iceberg-warehouse/spark_schema/spark_orders/data/category=electronics/order_date_day=2023-01-19/00000-12-3d818398-c4b4-4230-bbf0-4dc4f2e2691c-0-00001.parquet|1           |{electronics, 2023-01-19}|
|s3a://iceberg-warehouse/spark_schema/spark_orders/data/category=books/order_date_day=2023-01-19/00000-12-3d818398-c4b4-4230-bbf0-4dc4f2e

In [26]:
print(f"\nPartitions for {db_name}.{table_name}:")
# This shows how data is partitioned based on `category` and `order_date_day` (hidden transform)
spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}.partitions")\
    .toPandas()


Partitions for spark_schema.spark_orders:


Unnamed: 0,partition,spec_id,record_count,file_count,total_data_file_size_in_bytes,position_delete_record_count,position_delete_file_count,equality_delete_record_count,equality_delete_file_count,last_updated_at,last_updated_snapshot_id
0,"(electronics, 2023-01-19)",0,1,1,1655,0,0,0,0,2025-07-10 10:39:46.212,2531062654959842889
1,"(books, 2023-01-19)",0,1,1,1613,0,0,0,0,2025-07-10 10:39:46.212,2531062654959842889
2,"(books, 2023-01-16)",0,1,1,1563,0,0,0,0,2025-07-10 10:39:41.795,7702743561916277481
3,"(books, 2023-01-18)",0,1,1,1564,0,0,0,0,2025-07-10 10:39:41.795,7702743561916277481
4,"(home, 2023-01-17)",0,1,1,1556,0,0,0,0,2025-07-10 10:39:41.795,7702743561916277481
5,"(electronics, 2023-01-15)",0,1,1,1606,0,0,0,0,2025-07-10 10:39:41.795,7702743561916277481
6,"(electronics, 2023-01-16)",0,1,1,1606,0,0,0,0,2025-07-10 10:39:41.795,7702743561916277481


#### Time Travel Queries

In [27]:
history_df = spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}.history ORDER BY made_current_at")
history_list = history_df.collect()

In [28]:
history_df.show(truncate=False)

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2025-07-10 10:39:41.795|7702743561916277481|NULL               |true               |
|2025-07-10 10:39:46.212|2531062654959842889|7702743561916277481|true               |
+-----------------------+-------------------+-------------------+-------------------+



In [29]:
snapshots_df = spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}.snapshots ORDER BY committed_at")
snapshot_list = snapshots_df.collect()

In [30]:
if len(snapshot_list) > 1:
    # Get the snapshot ID of the first insert operation (before the DataFrame append)
    # Assuming the first operation was the SQL INSERT and second was DataFrame append
    first_op_snapshot_id = history_list[0]["snapshot_id"] # The very first snapshot after table creation might be empty if no data was inserted then.
                                                            # The first data snapshot is what we usually want.
    
    # Find the snapshot *after* the first batch of INSERTs
    # The first row is the earliest snapshot.
    if len(snapshot_list) >= 2 and snapshot_list[1]["operation"] == "append": # Assuming first user data insert is an append
            target_snapshot_id = snapshot_list[1]["snapshot_id"] # This would be after the first SQL INSERT
            print(f"\nQuerying table state AS OF VERSION {target_snapshot_id} (after initial SQL INSERT):")
            spark.read.option("snapshot-id", target_snapshot_id)\
                .table(f"{iceberg_catalog_name}.{db_name}.{table_name}")\
                .orderBy("order_date", "order_id").show()


Querying table state AS OF VERSION 2531062654959842889 (after initial SQL INSERT):
+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD001|    CUST101|2023-01-15|100.50|electronics|
|  ORD002|    CUST102|2023-01-16| 75.20|      books|
|  ORD003|    CUST101|2023-01-16|250.00|electronics|
|  ORD004|    CUST103|2023-01-17| 45.99|       home|
|  ORD005|    CUST102|2023-01-18|120.00|      books|
|  ORD006|    CUST104|2023-01-19|300.20|electronics|
|  ORD007|    CUST105|2023-01-19| 22.20|      books|
+--------+-----------+----------+------+-----------+



In [31]:
latest_snapshot_id = snapshot_list[-1]["snapshot_id"]
print(f"\nQuerying table state AS OF LATEST SNAPSHOT {latest_snapshot_id} (current state):")
spark.read.table(f"{iceberg_catalog_name}.{db_name}.{table_name}").orderBy("order_date", "order_id").toPandas()


Querying table state AS OF LATEST SNAPSHOT 2531062654959842889 (current state):


Unnamed: 0,order_id,customer_id,order_date,amount,category
0,ORD001,CUST101,2023-01-15,100.5,electronics
1,ORD002,CUST102,2023-01-16,75.2,books
2,ORD003,CUST101,2023-01-16,250.0,electronics
3,ORD004,CUST103,2023-01-17,45.99,home
4,ORD005,CUST102,2023-01-18,120.0,books
5,ORD006,CUST104,2023-01-19,300.2,electronics
6,ORD007,CUST105,2023-01-19,22.2,books


In [32]:
# Time travel using timestamp (requires made_current_at timestamp)
if len(snapshot_list) >= 2:
    timestamp_before_last_append = history_list[-2]["made_current_at"] # Timestamp of the snapshot before the last one
    print(f"\nQuerying table state AS OF TIMESTAMP '{timestamp_before_last_append}':")
    spark.read.option("as-of-timestamp", str(int(timestamp_before_last_append.timestamp() * 1000))) \
         .table(f"{iceberg_catalog_name}.{db_name}.{table_name}") \
         .orderBy("order_date", "order_id").show()


Querying table state AS OF TIMESTAMP '2025-07-10 10:39:41.795000':
+--------+-----------+----------+------+-----------+
|order_id|customer_id|order_date|amount|   category|
+--------+-----------+----------+------+-----------+
|  ORD001|    CUST101|2023-01-15|100.50|electronics|
|  ORD002|    CUST102|2023-01-16| 75.20|      books|
|  ORD003|    CUST101|2023-01-16|250.00|electronics|
|  ORD004|    CUST103|2023-01-17| 45.99|       home|
|  ORD005|    CUST102|2023-01-18|120.00|      books|
+--------+-----------+----------+------+-----------+



Schema Evolution (Example: Add a new column)

In [33]:
print(f"\nSchema before evolution:")
spark.sql(f"DESCRIBE {iceberg_catalog_name}.{db_name}.{table_name}").show()


Schema before evolution:
+--------------+----------------+-------+
|      col_name|       data_type|comment|
+--------------+----------------+-------+
|      order_id|          string|   NULL|
|   customer_id|          string|   NULL|
|    order_date|            date|   NULL|
|        amount|   decimal(10,2)|   NULL|
|      category|          string|   NULL|
|              |                |       |
|# Partitioning|                |       |
|        Part 0|        category|       |
|        Part 1|days(order_date)|       |
+--------------+----------------+-------+



In [34]:
# Add a new column 'is_returned'
spark.sql(f"ALTER TABLE {iceberg_catalog_name}.{db_name}.{table_name} ADD COLUMN is_returned BOOLEAN")
print(f"\nSchema after adding 'is_returned' column:")
spark.sql(f"DESCRIBE {iceberg_catalog_name}.{db_name}.{table_name}").show()


Schema after adding 'is_returned' column:
+--------------+----------------+-------+
|      col_name|       data_type|comment|
+--------------+----------------+-------+
|      order_id|          string|   NULL|
|   customer_id|          string|   NULL|
|    order_date|            date|   NULL|
|        amount|   decimal(10,2)|   NULL|
|      category|          string|   NULL|
|   is_returned|         boolean|   NULL|
|              |                |       |
|# Partitioning|                |       |
|        Part 0|        category|       |
|        Part 1|days(order_date)|       |
+--------------+----------------+-------+



In [35]:
# Insert data with the new column
spark.sql(f"""
INSERT INTO {iceberg_catalog_name}.{db_name}.{table_name}
VALUES ('ORD008', 'CUST101', DATE '2023-01-20', 55.00, 'home', true)
""")

print(f"\nData after inserting with new column (old rows will have null for 'is_returned'):")
spark.sql(f"""SELECT order_id, category, order_date, amount, is_returned 
          FROM {iceberg_catalog_name}.{db_name}.{table_name} 
          ORDER BY order_date, order_id""")\
        .show()


Data after inserting with new column (old rows will have null for 'is_returned'):
+--------+-----------+----------+------+-----------+
|order_id|   category|order_date|amount|is_returned|
+--------+-----------+----------+------+-----------+
|  ORD001|electronics|2023-01-15|100.50|       NULL|
|  ORD002|      books|2023-01-16| 75.20|       NULL|
|  ORD003|electronics|2023-01-16|250.00|       NULL|
|  ORD004|       home|2023-01-17| 45.99|       NULL|
|  ORD005|      books|2023-01-18|120.00|       NULL|
|  ORD006|electronics|2023-01-19|300.20|       NULL|
|  ORD007|      books|2023-01-19| 22.20|       NULL|
|  ORD008|       home|2023-01-20| 55.00|       true|
+--------+-----------+----------+------+-----------+




Data Compaction (Rewrite Data Files - Small File Compaction)
- Iceberg procedures are called using CALL
- Insert some more data to potentially create small files

In [36]:
spark.sql(f"""INSERT INTO {iceberg_catalog_name}.{db_name}.{table_name} 
          VALUES ('ORD009', 'CUST106', DATE '2023-01-21', 10.00, 'books', false)""")
spark.sql(f"""INSERT INTO {iceberg_catalog_name}.{db_name}.{table_name}
          VALUES ('ORD010', 'CUST106', DATE '2023-01-21', 12.00, 'books', false)""")

print(f"\nData Files before compaction for {db_name}.{table_name} (partition: category=books):")
spark.sql(f"""
    SELECT file_path, record_count, file_size_in_bytes
    FROM {iceberg_catalog_name}.{db_name}.{table_name}.files
    WHERE partition.category = 'books'
""").show(truncate=False)


Data Files before compaction for spark_schema.spark_orders (partition: category=books):
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------+
|file_path                                                                                                                                                    |record_count|file_size_in_bytes|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------+
|s3a://iceberg-warehouse/spark_schema/spark_orders/data/category=books/order_date_day=2023-01-21/00000-42-f367cde7-290f-479d-8e38-721a67e2ae05-0-00001.parquet|1           |1799              |
|s3a://iceberg-warehouse/spark_schema/spark_orders/data/category=books/order_date_day=2023-01-21/00000-40-f8195a6d-d91f-4f8c-a657-2c96046dabc3-

Execute rewrite_data_files procedure for compaction
- This will compact small files into larger ones, default strategy is 'sort' which also sorts data within files
- For very small tables, the effect might be limited or group all into one file per partition.

In [37]:
print(f"\nRunning data compaction (rewrite_data_files) for table {db_name}.{table_name}...")
# You can specify sorting options or a where clause to limit compaction scope
result_df = spark.sql(f"""CALL {iceberg_catalog_name}.system.rewrite_data_files(
                      table => '{db_name}.{table_name}', 
                      strategy => 'sort', sort_order => 'order_date ASC, amount DESC', 
                      options => map('min-input-files','1'))""")
# options => map('min-input-files','1') to force compaction even with few files for demo
result_df.show()
print("Compaction procedure finished.")


Running data compaction (rewrite_data_files) for table spark_schema.spark_orders...
+--------------------------+----------------------+---------------------+-----------------------+
|rewritten_data_files_count|added_data_files_count|rewritten_bytes_count|failed_data_files_count|
+--------------------------+----------------------+---------------------+-----------------------+
|                         2|                     1|                 3597|                      0|
+--------------------------+----------------------+---------------------+-----------------------+

Compaction procedure finished.


In [38]:
print(f"\nData Files after compaction for {db_name}.{table_name} (partition: category=books):")
spark.sql(f"""
    SELECT file_path, record_count, file_size_in_bytes
    FROM {iceberg_catalog_name}.{db_name}.{table_name}.files
    WHERE partition.category = 'books'
""").show(truncate=False)


Data Files after compaction for spark_schema.spark_orders (partition: category=books):
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------+
|file_path                                                                                                                                                    |record_count|file_size_in_bytes|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------+
|s3a://iceberg-warehouse/spark_schema/spark_orders/data/category=books/order_date_day=2023-01-21/00000-45-c6668fad-6c6d-4093-ac5d-36cba9abc5fd-0-00001.parquet|2           |1974              |
|s3a://iceberg-warehouse/spark_schema/spark_orders/data/category=books/order_date_day=2023-01-19/00000-12-3d818398-c4b4-4230-bbf0-4dc4f2e2691c-0

In [39]:
print(f"\nSnapshots after compaction:")
spark.sql(f"SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}.snapshots ORDER BY committed_at DESC").show()


Snapshots after compaction:
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2025-07-10 10:39:...|8755651545572138546|5272369974064174615|  replace|s3a://iceberg-war...|{added-data-files...|
|2025-07-10 10:39:...|5272369974064174615|8777658452431709262|   append|s3a://iceberg-war...|{spark.app.id -> ...|
|2025-07-10 10:39:...|8777658452431709262|3887949544037912632|   append|s3a://iceberg-war...|{spark.app.id -> ...|
|2025-07-10 10:39:...|3887949544037912632|2531062654959842889|   append|s3a://iceberg-war...|{spark.app.id -> ...|
|2025-07-10 10:39:...|2531062654959842889|7702743561916277481|   append|s3a://iceberg-war...|{spark.app.id -> ...|
|2025-07-10 10:39:...|7702743561916277481|         

Filter Pushdown Demonstration
- The table is partitioned by (category, days(order_date))
- Spark should be able to push down filters on `category` and `order_date`.

In [40]:
query_with_filters = f"""
SELECT * FROM {iceberg_catalog_name}.{db_name}.{table_name}
WHERE category = 'electronics' AND order_date >= DATE '2023-01-15' AND order_date < DATE '2023-01-20'
"""
print("\nQuery with partition column filters:")
spark.sql(query_with_filters).show()


Query with partition column filters:
+--------+-----------+----------+------+-----------+-----------+
|order_id|customer_id|order_date|amount|   category|is_returned|
+--------+-----------+----------+------+-----------+-----------+
|  ORD001|    CUST101|2023-01-15|100.50|electronics|       NULL|
|  ORD003|    CUST101|2023-01-16|250.00|electronics|       NULL|
|  ORD006|    CUST104|2023-01-19|300.20|electronics|       NULL|
+--------+-----------+----------+------+-----------+-----------+



In [41]:
print("\nEXPLAIN plan for the query (look for PushedFilters in ParquetScan or IcebergScan):")
print(spark.sql(f"EXPLAIN {query_with_filters}").collect()[0][0])


EXPLAIN plan for the query (look for PushedFilters in ParquetScan or IcebergScan):
== Physical Plan ==
*(1) ColumnarToRow
+- BatchScan iceberg_jdbc.spark_schema.spark_orders[order_id#1083, customer_id#1084, order_date#1085, amount#1086, category#1087, is_returned#1088] iceberg_jdbc.spark_schema.spark_orders (branch=null) [filters=category IS NOT NULL, order_date IS NOT NULL, category = 'electronics', order_date >= 19372, order_date < 19377, groupedBy=] RuntimeFilters: []




Interoperability Check - Read data created by Trino (if trino_schema.employees exists)
 - Ensure the Trino notebook (01-trino-iceberg-getting-started.ipynb) has been run to create this table.

In [42]:
spark.conf.set("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog")
spark.conf.set("spark.sql.catalog.iceberg_catalog.catalog-impl", "org.apache.iceberg.jdbc.JdbcCatalog") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.uri", "jdbc:postgresql://postgres:5432/iceberg_catalog")
spark.conf.set("spark.sql.catalog.iceberg_catalog.jdbc.user", "iceberg") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.jdbc.password", "icebergpassword") 
spark.conf.set("spark.sql.catalog.iceberg_catalog.warehouse", "s3a://iceberg-warehouse/")

In [43]:
spark.catalog.setCurrentCatalog('iceberg_catalog')

In [44]:
spark.catalog.listCatalogs()

[CatalogMetadata(name='default_cache_iceberg', description=None),
 CatalogMetadata(name='iceberg_catalog', description=None),
 CatalogMetadata(name='iceberg_jdbc', description=None),
 CatalogMetadata(name='spark_catalog', description=None)]

In [45]:
iceberg_trino_catalog_name='iceberg_catalog'
trino_table_name = "trino_schema.employees" # From Trino notebook

In [46]:
print(f"\nAttempting to read Trino-created table: {iceberg_trino_catalog_name}.{trino_table_name}")
# Spark needs to know the schema. If Trino created it, Spark should discover it via the JDBC catalog.
spark.sql(f"DESCRIBE {iceberg_trino_catalog_name}.{trino_table_name}").show()
spark.sql(f"SELECT * FROM {iceberg_trino_catalog_name}.{trino_table_name} LIMIT 5").show()
print("Successfully read Trino-created table with Spark.")


Attempting to read Trino-created table: iceberg_catalog.trino_schema.employees
+--------------------+-------------+-------+
|            col_name|    data_type|comment|
+--------------------+-------------+-------+
|                  id|          int|   NULL|
|                name|       string|   NULL|
|          department|       string|   NULL|
|              salary|decimal(10,2)|   NULL|
|           hire_date|         date|   NULL|
|# Partition Infor...|             |       |
|          # col_name|    data_type|comment|
|          department|       string|   NULL|
+--------------------+-------------+-------+

+---+-------------+----------+---------+----------+
| id|         name|department|   salary| hire_date|
+---+-------------+----------+---------+----------+
|  4|  Diana Green|     Sales| 95000.00|2018-05-22|
|  5| Edward Black|     Sales|105000.00|2017-11-30|
|  7|George Yellow|        HR| 65000.00|2023-03-15|
|  7|George Yellow|        HR| 65000.00|2023-03-15|
|  3|Charlie Br

In [47]:
# spark.stop() # Commented out to allow re-running cells easily
print("\nSpark Iceberg Datalakehouse Demo (Phase 2) completed.")


Spark Iceberg Datalakehouse Demo (Phase 2) completed.


In [48]:
# spark.stop() # Commented out to allow re-running cells easily
print("\nSpark Iceberg Datalakehouse Demo (Phase 2) completed.")


Spark Iceberg Datalakehouse Demo (Phase 2) completed.
