In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() returns the already
# active session if there is one, otherwise it starts a new session
# registered under the application name below.
spark = (
    SparkSession.builder
    .appName("PySparkSQLAdvanced")
    .getOrCreate()
)
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000229A2250B80>


In [3]:
# Load the dataset.
# - header=true: the first CSV row supplies the column names.
# - inferSchema=true: without it Spark's CSV reader types every column as a
#   string, so numeric columns such as salary and bonus would be compared and
#   sorted as text, and aggregated only through implicit casts. Inference
#   costs one extra pass over the file.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Employee.csv")
)

# Show the first 5 rows
df.show(5)

+---+-------+----------+------+-----+
| id|   name|department|salary|bonus|
+---+-------+----------+------+-----+
|  1|  Alice|     Sales|  5000|  200|
|  2|    Bob|   Finance|  6000|  300|
|  3|Charlie|     Sales|  5500|  250|
|  4|  David|   Finance|  7000|  400|
|  5|    Eva|        HR|  4500|  150|
+---+-------+----------+------+-----+
only showing top 5 rows



In [4]:
# Expose the DataFrame to Spark SQL under the name "employees" so that
# spark.sql(...) queries can reference it; an existing temp view with the
# same name is replaced.
df.createOrReplaceTempView(name="employees")

In [5]:
# Keep only the rows of the employees view whose salary exceeds 5000.
high_salary_sql = "SELECT * FROM employees WHERE salary > 5000"
result = spark.sql(high_salary_sql)
result.show()

+---+-------+----------+------+-----+
| id|   name|department|salary|bonus|
+---+-------+----------+------+-----+
|  2|    Bob|   Finance|  6000|  300|
|  3|Charlie|     Sales|  5500|  250|
|  4|  David|   Finance|  7000|  400|
|  7|  Grace|     Sales|  5200|  220|
|  8|   Hank|   Finance|  6500|  350|
+---+-------+----------+------+-----+



In [7]:
# Average salary for each distinct Name value in the employees view.
# NOTE(review): if names are unique, every group holds a single row and the
# "average" is just that employee's own salary -- grouping by department may
# have been the intent; confirm before relying on this query.
avg_salary_by_name_sql = """
    SELECT Name, AVG(salary) AS avg_salary
    FROM employees
    GROUP BY Name
"""
result = spark.sql(avg_salary_by_name_sql)
result.show()

+-------+----------+
|   Name|avg_salary|
+-------+----------+
|  Grace|    5200.0|
|    Eva|    4500.0|
|   Hank|    6500.0|
|Charlie|    5500.0|
|    Bob|    6000.0|
|  Alice|    5000.0|
|  David|    7000.0|
|  Frank|    4800.0|
+-------+----------+



In [8]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

# Cast salary to a numeric type once and reuse it for both ordering and
# summing. Spark's CSV reader yields string columns unless a schema is
# supplied or inferred, and ordering a string column compares text: "10000"
# would sort before "5000", so the running total would accumulate in the
# wrong order. The explicit cast makes the result independent of how the
# column was typed at load time.
salary_numeric = df["salary"].cast("double")

# Define a window specification: one partition per department, rows ordered
# by ascending numeric salary.
# Note: with an ordering and no explicit frame, Spark's default frame is
# RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so rows with equal
# salary inside a department receive the same cumulative value.
window_spec = Window.partitionBy("department").orderBy(salary_numeric)

# Add a cumulative salary column (running total of salary within each
# department). `sum` is pyspark.sql.functions.sum, not the Python builtin.
df_with_cumulative = df.withColumn(
    "cumulative_salary",
    sum(salary_numeric).over(window_spec),
)

# Show the result
df_with_cumulative.show()

+---+-------+----------+------+-----+-----------------+
| id|   name|department|salary|bonus|cumulative_salary|
+---+-------+----------+------+-----+-----------------+
|  2|    Bob|   Finance|  6000|  300|           6000.0|
|  8|   Hank|   Finance|  6500|  350|          12500.0|
|  4|  David|   Finance|  7000|  400|          19500.0|
|  5|    Eva|        HR|  4500|  150|           4500.0|
|  6|  Frank|        HR|  4800| NULL|           9300.0|
|  1|  Alice|     Sales|  5000|  200|           5000.0|
|  7|  Grace|     Sales|  5200|  220|          10200.0|
|  3|Charlie|     Sales|  5500|  250|          15700.0|
+---+-------+----------+------+-----+-----------------+



In [9]:
# Employees whose salary is above the overall average: the scalar subquery
# computes AVG(salary) across the whole employees view and the outer query
# keeps the rows that exceed it.
above_average_sql = """
    SELECT Id, salary
    FROM employees
    WHERE salary > (SELECT AVG(salary) FROM employees)
"""
result = spark.sql(above_average_sql)
result.show()

+---+------+
| Id|salary|
+---+------+
|  2|  6000|
|  4|  7000|
|  8|  6500|
+---+------+

