In [6]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
from datetime import datetime

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("PartitionPruning_PredicatePushdown") \
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic") \
    .getOrCreate()

# ============================================================
# STEP 1: Read raw data from S3
# ============================================================
raw_path = '/Users/pavanhalde/Downloads/refined/customer/date_partition=1999-12-23/'

df_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .parquet(raw_path)

print("Raw Data Schema:")
df_raw.printSchema()
df_raw.show(10)


Raw Data Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- OrderName: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)

+-------+---------+--------+----------+----+----+----+
|OrderID|OrderName|Customer|      Date| _c4| _c5| _c6|
+-------+---------+--------+----------+----+----+----+
|      9|  Order_I|    John|23-12-1999|NULL|NULL|NULL|
|     10|  Order_J| Michael|23-12-1999|NULL|NULL|NULL|
|     11|  Order_K|    Anna|23-12-1999|NULL|NULL|NULL|
|     12|  Order_L|   Chris|23-12-1999|NULL|NULL|NULL|
|     13|  Order_M|    John|23-12-1999|NULL|NULL|NULL|
|     25|  Order_Y| Michael|23-12-1999|NULL|NULL|NULL|
|     30| Order_AD|    John|23-12-1999|NULL|NULL|NULL|
|     35| Order_AI| Michael|23-12-1999|NULL|NULL|NULL|
|     40| Order_AN|    Mike|23-12-1999|NULL|NULL|NULL|
|     45| Order_AS|   Sarah|23-12-1999|NULL|NULL|NULL|


In [8]:

raw_path = '/Users/pavanhalde/Downloads/refined/customer/'

df_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .parquet(raw_path)


df_filtered_partition = df_raw.filter(col("date_partition") == "1999-12-23")

print("Raw Data Schema:")
df_filtered_partition.printSchema()
df_filtered_partition.show(10)

df_filtered_partition.explain(True)


Raw Data Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- OrderName: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- date_partition: date (nullable = true)

+-------+---------+--------+----------+----+----+----+--------------+
|OrderID|OrderName|Customer|      Date| _c4| _c5| _c6|date_partition|
+-------+---------+--------+----------+----+----+----+--------------+
|      9|  Order_I|    John|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     10|  Order_J| Michael|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     11|  Order_K|    Anna|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     12|  Order_L|   Chris|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     13|  Order_M|    John|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     25|  Order_Y| Michael|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     30| Order_AD|    John|23-12-1999|NU

In [9]:

raw_path = '/Users/pavanhalde/Downloads/refined/customer/'

df_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .parquet(raw_path)


df_filtered_data = df_raw.filter(col("Customer") == "John")

print("Raw Data Schema:")
df_filtered_partition.printSchema()
df_filtered_partition.show(10)

df_filtered_partition.explain(True)


Raw Data Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- OrderName: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- date_partition: date (nullable = true)

+-------+---------+--------+----------+----+----+----+--------------+
|OrderID|OrderName|Customer|      Date| _c4| _c5| _c6|date_partition|
+-------+---------+--------+----------+----+----+----+--------------+
|      9|  Order_I|    John|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     10|  Order_J| Michael|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     11|  Order_K|    Anna|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     12|  Order_L|   Chris|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     13|  Order_M|    John|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     25|  Order_Y| Michael|23-12-1999|NULL|NULL|NULL|    1999-12-23|
|     30| Order_AD|    John|23-12-1999|NU

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

# Initialize Spark
spark = SparkSession.builder \
    .appName("PredicatePushdownTest") \
    .master("local[*]") \
    .getOrCreate()

# Create sample data
data = [
    (1, "Order_A", "John", "21-12-1999"),
    (2, "Order_B", "Jane", "21-12-1999"),
    (3, "Order_C", "Mike", "21-12-1999"),
    (4, "Order_D", "Sarah", "21-12-1999"),
    (5, "Order_E", "John", "22-12-1999"),
    (9, "Order_I", "John", "23-12-1999"),
    (10, "Order_J", "Michael", "23-12-1999"),
    (13, "Order_M", "John", "23-12-1999"),
    (18, "Order_R", "John", "25-12-1999"),
    (22, "Order_V", "John", "25-12-1999"),
]

df = spark.createDataFrame(data, ["OrderID", "OrderName", "Customer", "Date"])
df = df.withColumn("date_column", to_date(col("Date"), "dd-MM-yyyy"))

print("="*80)
print("SETUP: Writing data in two formats")
print("="*80)

# ============================================================
# 1. Write NON-PARTITIONED data (to see predicate pushdown)
# ============================================================
non_partitioned_path = "/tmp/customer_no_partition"
df.coalesce(1).write.mode("overwrite").parquet(non_partitioned_path)
print(f"\nâœ… Written NON-partitioned data to: {non_partitioned_path}")

# ============================================================
# 2. Write PARTITIONED data (to see partition pruning)
# ============================================================
partitioned_path = "/tmp/customer_partitioned"
df.write.mode("overwrite").partitionBy("date_column").parquet(partitioned_path)
print(f"âœ… Written PARTITIONED data to: {partitioned_path}")

print("\n" + "="*80)
print("TEST 1: PREDICATE PUSHDOWN (Non-partitioned data)")
print("="*80)
print("Query: SELECT * WHERE Customer = 'John'")
print("\nðŸ‘‡ Look for 'PushedFilters:' in the Physical Plan below:")

df_non_part = spark.read.parquet(non_partitioned_path)
df_filtered = df_non_part.filter(col("Customer") == "John")

df_filtered.explain()
print("\nâœ… Results:")
df_filtered.show()

print("\n" + "="*80)
print("TEST 2: PARTITION PRUNING (Partitioned data)")
print("="*80)
print("Query: SELECT * WHERE date_column = '1999-12-23'")
print("\nðŸ‘‡ Look for 'PartitionFilters:' in the Physical Plan below:")

df_part = spark.read.parquet(partitioned_path)
df_partition_filter = df_part.filter(col("date_column") == "1999-12-23")

df_partition_filter.explain()
print("\nâœ… Results:")
df_partition_filter.show()

print("\n" + "="*80)
print("TEST 3: BOTH OPTIMIZATIONS (Partitioned data + data filter)")
print("="*80)
print("Query: SELECT * WHERE date_column = '1999-12-23' AND Customer = 'John'")
print("\nðŸ‘‡ Look for BOTH 'PartitionFilters:' AND 'PushedFilters:' below:")

df_combined = df_part.filter(
    (col("date_column") == "1999-12-23") & 
    (col("Customer") == "John")
)

df_combined.explain()
print("\nâœ… Results:")
df_combined.show()

print("\n" + "="*80)
print("ðŸ“š SUMMARY")
print("="*80)
print("""
1. PREDICATE PUSHDOWN (Test 1):
   âœ“ PushedFilters: [IsNotNull(Customer), EqualTo(Customer,John)]
   - Filter pushed to Parquet reader
   - Skips rows at file read level
   
2. PARTITION PRUNING (Test 2):
   âœ“ PartitionFilters: [isnotnull(date_column#X), (date_column#X = 1999-12-23)]
   - Only reads specific partition folder
   - Skips entire partitions
   
3. COMBINED (Test 3):
   âœ“ PartitionFilters: [date filter]
   âœ“ PushedFilters: [Customer filter]
   - Best of both worlds!
   
WHY YOU DIDN'T SEE PREDICATE PUSHDOWN:
- Your data was already partitioned
- Small files without row group statistics
- Need larger, non-partitioned files to see it clearly
""")

spark.stop()

25/11/15 15:12:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


SETUP: Writing data in two formats

âœ… Written NON-partitioned data to: /tmp/customer_no_partition
âœ… Written PARTITIONED data to: /tmp/customer_partitioned

TEST 1: PREDICATE PUSHDOWN (Non-partitioned data)
Query: SELECT * WHERE Customer = 'John'

ðŸ‘‡ Look for 'PushedFilters:' in the Physical Plan below:
== Physical Plan ==
*(1) Filter (isnotnull(Customer#302) AND (Customer#302 = John))
+- *(1) ColumnarToRow
   +- FileScan parquet [OrderID#300L,OrderName#301,Customer#302,Date#303,date_column#304] Batched: true, DataFilters: [isnotnull(Customer#302), (Customer#302 = John)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/tmp/customer_no_partition], PartitionFilters: [], PushedFilters: [IsNotNull(Customer), EqualTo(Customer,John)], ReadSchema: struct<OrderID:bigint,OrderName:string,Customer:string,Date:string,date_column:date>



âœ… Results:
+-------+---------+--------+----------+-----------+
|OrderID|OrderName|Customer|      Date|date_column|
+-------+---------+--------