In [34]:
from pyspark.sql.functions import col, length, upper, rand, expr

In [26]:
# DDL for the demo sales table: an Iceberg table partitioned by `region`.
# IF NOT EXISTS means re-running this cell leaves an existing table untouched.
create_sales_table_ddl = """
    CREATE TABLE IF NOT EXISTS local.db.sales_data (
        id INT,
        region STRING,
        product STRING,
        amount DOUBLE,
        sales_date DATE
    )
    USING iceberg
    PARTITIONED BY (region)
"""
spark.sql(create_sales_table_ddl)

DataFrame[]

In [27]:
# Load the five sample rows (regions North, South, East and West).
# INSERT OVERWRITE is used instead of INSERT INTO so the cell is idempotent:
# INSERT INTO appends, so every re-run of this cell would add another copy of
# the rows and the filter results further down would no longer match.
# All four region partitions receive rows here, so each one is rewritten on
# every run.
spark.sql("""
    INSERT OVERWRITE local.db.sales_data VALUES
    (1, 'North', 'Widget', 100.0, DATE('2024-01-01')),
    (2, 'South', 'Widget', 200.0, DATE('2024-01-02')),
    (3, 'East',  'Gadget', 150.0, DATE('2024-02-01')),
    (4, 'West',  'Gadget', 300.0, DATE('2024-03-01')),
    (5, 'North', 'Gizmo',  250.0, DATE('2024-03-15'))
""")

DataFrame[]

In [30]:
# Read the Iceberg table and keep only the rows whose `region` (the table's
# partition column) equals 'North', then display them.
print("Filtered Data (region = 'North'):")
df_filtered = (
    spark.read.format("iceberg")
    .load("local.db.sales_data")
    .where(col("region") == "North")
)
df_filtered.show()

Filtered Data (region = 'North'):
+---+------+-------+------+----------+
| id|region|product|amount|sales_date|
+---+------+-------+------+----------+
|  1| North| Widget| 100.0|2024-01-01|
|  5| North|  Gizmo| 250.0|2024-03-15|
+---+------+-------+------+----------+



In [31]:

# Print every plan stage (parsed, analyzed, optimized, physical) for the
# filtered read. Pushdown is visible when the predicate appears in the
# `filters=` list of the BatchScan node instead of in a separate Filter node.
print("Query Execution Plan:")
df_filtered.explain(mode="extended")

Query Execution Plan:
== Parsed Logical Plan ==
'Filter ('region = North)
+- RelationV2[id#540, region#541, product#542, amount#543, sales_date#544] local.db.sales_data local.db.sales_data

== Analyzed Logical Plan ==
id: int, region: string, product: string, amount: double, sales_date: date
Filter (region#541 = North)
+- RelationV2[id#540, region#541, product#542, amount#543, sales_date#544] local.db.sales_data local.db.sales_data

== Optimized Logical Plan ==
RelationV2[id#540, region#541, product#542, amount#543, sales_date#544] local.db.sales_data

== Physical Plan ==
*(1) ColumnarToRow
+- BatchScan local.db.sales_data[id#540, region#541, product#542, amount#543, sales_date#544] local.db.sales_data (branch=null) [filters=region IS NOT NULL, region = 'North', groupedBy=] RuntimeFilters: []



In [32]:
# Filter on a derived expression (not eligible for pushdown).
# `col` and `length` both come from the import cell at the top of the notebook,
# so no per-cell import is needed here.
# The predicate is on length(region), not on the stored column value, so Spark
# has to evaluate it itself: look for a Filter node above the scan in the plan.
# None of the regions inserted into sales_data is longer than 5 characters, so
# an empty result is expected.
df_derived = (
    spark.read.format("iceberg")
    .load("local.db.sales_data")
    .filter(length(col("region")) > 5)
)

# Show results
print("Filtered Data (length(region) > 5):")
df_derived.show()

# Show the query execution plan
print("Execution Plan for Non-Pushdown Filter:")
df_derived.explain(extended=True)


Filtered Data (length(region) > 5):
+---+------+-------+------+----------+
| id|region|product|amount|sales_date|
+---+------+-------+------+----------+
+---+------+-------+------+----------+

Execution Plan for Non-Pushdown Filter:
== Parsed Logical Plan ==
'Filter (length('region) > 5)
+- RelationV2[id#586, region#587, product#588, amount#589, sales_date#590] local.db.sales_data local.db.sales_data

== Analyzed Logical Plan ==
id: int, region: string, product: string, amount: double, sales_date: date
Filter (length(region#587) > 5)
+- RelationV2[id#586, region#587, product#588, amount#589, sales_date#590] local.db.sales_data local.db.sales_data

== Optimized Logical Plan ==
Filter (length(region#587) > 5)
+- RelationV2[id#586, region#587, product#588, amount#589, sales_date#590] local.db.sales_data

== Physical Plan ==
*(1) Filter (length(region#587) > 5)
+- *(1) ColumnarToRow
   +- BatchScan local.db.sales_data[id#586, region#587, product#588, amount#589, sales_date#590] local.db.sa

In [36]:

# Create the test table: an unpartitioned Iceberg table used to compare how
# differently shaped filters appear in the query plan.
spark.sql("""
    CREATE TABLE IF NOT EXISTS local.db.filter_test_data (
        id INT,
        region STRING,
        product STRING,
        amount DOUBLE,
        city STRING
    )
    USING iceberg
""")

# Insert sample data.
# INSERT OVERWRITE (rather than INSERT INTO) keeps the cell idempotent:
# re-running it replaces the rows instead of appending a duplicate set.
# Row 3 has a NULL city and row 5 a NULL region so that NULL handling in the
# filters below is exercised.
spark.sql("""
    INSERT OVERWRITE local.db.filter_test_data VALUES
    (1, 'North', 'Widget', 120.0, 'Paris'),
    (2, 'South', 'Gadget', 80.0, 'London'),
    (3, 'West', 'Gizmo', 200.0, NULL),
    (4, 'East', 'Widget', 300.0, 'Berlin'),
    (5, NULL, 'Gadget', 50.0, 'Paris')
""")

# Load the table once and derive every scenario from this base DataFrame.
# Each .filter() call returns a new DataFrame, so the scenarios stay
# independent of one another.
filter_test_df = spark.read.format("iceberg").load("local.db.filter_test_data")

# List of filter scenarios (description, DataFrame with filter).
# NOTE(review): the "(pushdown)" / "(no pushdown)" tags state the expected
# outcome, not a measured one. Confirm each against the BatchScan `filters=`
# list printed by explain() below. Scenarios 6, 7 and 8 in particular look
# like predicates the optimizer may be able to simplify or translate - verify.
# Scenario 5 uses an unseeded rand(), so the rows it shows differ between runs.
scenarios = [
    ("1. Simple filter (pushdown)", filter_test_df.filter(col("region") == "North")),
    ("2. Derived expression (no pushdown)", filter_test_df.filter(length(col("region")) > 5)),
    ("3. Arithmetic expression (no pushdown)", filter_test_df.filter(col("amount") + 10 > 100)),
    ("4. Function on column (no pushdown)", filter_test_df.filter(upper(col("product")) == "WIDGET")),
    ("5. Non-deterministic function (no pushdown)", filter_test_df.filter(rand() > 0.5)),
    ("6. Compound NULL logic (no pushdown)", filter_test_df.filter((col("region").isNull()) | (col("region") == "North"))),
    ("7. CASE WHEN expression (no pushdown)", filter_test_df.filter(expr("CASE WHEN amount > 100 THEN 1 ELSE 0 END = 1"))),
    ("8. Constant on left (no pushdown)", filter_test_df.filter(expr("'North' = region"))),
    ("9. OR expression (no pushdown)", filter_test_df.filter((col("region") == "North") | (length(col("region")) > 5))),
]

# Evaluate and show plans: for each scenario print its label, the matching
# rows, and the full (parsed / analyzed / optimized / physical) plan.
for desc, df in scenarios:
    print(f"\n--- {desc} ---")
    df.show()
    df.explain(extended=True)


--- 1. Simple filter (pushdown) ---
+---+------+-------+------+-----+
| id|region|product|amount| city|
+---+------+-------+------+-----+
|  1| North| Widget| 120.0|Paris|
+---+------+-------+------+-----+

== Parsed Logical Plan ==
'Filter ('region = North)
+- RelationV2[id#984, region#985, product#986, amount#987, city#988] local.db.filter_test_data local.db.filter_test_data

== Analyzed Logical Plan ==
id: int, region: string, product: string, amount: double, city: string
Filter (region#985 = North)
+- RelationV2[id#984, region#985, product#986, amount#987, city#988] local.db.filter_test_data local.db.filter_test_data

== Optimized Logical Plan ==
Filter (isnotnull(region#985) AND (region#985 = North))
+- RelationV2[id#984, region#985, product#986, amount#987, city#988] local.db.filter_test_data

== Physical Plan ==
*(1) Filter (isnotnull(region#985) AND (region#985 = North))
+- *(1) ColumnarToRow
   +- BatchScan local.db.filter_test_data[id#984, region#985, product#986, amount#987