In [1]:
from pyspark.sql import SparkSession

In [2]:
from db_config import DATABASE_CONFIG
# Build (or reuse) the Spark session for the SQL Server example. The jar at
# DATABASE_CONFIG["jar_path"] is added to the driver classpath -- presumably
# the JDBC driver jar; confirm against db_config.
spark = (
    SparkSession.builder
    .appName("SQLServerConnection")
    .config("spark.driver.extraClassPath", DATABASE_CONFIG["jar_path"])
    .getOrCreate()
)

In [4]:
# Read the "employees" table over JDBC using the connection settings from
# DATABASE_CONFIG, then keep only the rows where age is greater than 30.
df = (
    spark.read
    .jdbc(
        url=DATABASE_CONFIG["url"],
        table="employees",
        properties={
            "user": DATABASE_CONFIG["user"],
            "password": DATABASE_CONFIG["password"],
            "driver": DATABASE_CONFIG["driver"],
        },
    )
    .filter("age > 30")
)

df.show()

+-----+---+----------+
| Name|Age|Department|
+-----+---+----------+
|Alice| 35| Marketing|
+-----+---+----------+



In [5]:
from pyspark.sql import SparkSession
# getOrCreate() hands back the already-active session when there is one (for
# example the session built earlier in this notebook); only on a fresh kernel
# does it start a new session named "CatalystExample".
spark = (
    SparkSession.builder
    .appName("CatalystExample")
    .getOrCreate()
)

# Small in-memory sample: one tuple per row, in (id, name, age) order.
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 30),
    (2, "Bob", 40),
    (3, "Charlie", 50),
]

df = spark.createDataFrame(data, schema=columns)
# Register the DataFrame as the temp view "people" so SQL can query it by name.
df.createOrReplaceTempView("people")
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 40|
|  3|Charlie| 50|
+---+-------+---+



In [6]:
# Query the "people" temp view through Spark SQL, then print the resulting
# physical plan in "formatted" mode (a plan outline followed by per-node details).
optimized_df = spark.sql(
    "SELECT * FROM people WHERE age > 30"
)
optimized_df.explain(mode="formatted")

== Physical Plan ==
* Filter (2)
+- * Scan ExistingRDD (1)


(1) Scan ExistingRDD [codegen id : 1]
Output [3]: [id#25L, name#26, age#27L]
Arguments: [id#25L, name#26, age#27L], MapPartitionsRDD[7] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)

(2) Filter [codegen id : 1]
Input [3]: [id#25L, name#26, age#27L]
Condition : (isnotnull(age#27L) AND (age#27L > 30))


