In [None]:
% pip install duckdb

In [None]:
from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql.functions import col, avg, when

# Start (or reuse) a session through the Spark-compatible builder interface.
spark = (
    SparkSession.builder
    .appName("Interview PySpark Code")
    .getOrCreate()
)

# Sample employee records, one (Name, Department, Salary) tuple per row.
# Note that "Alice" appears twice and that Frank has no department (None).
columns = ["Name", "Department", "Salary"]
data = [
    ("Alice", "HR", 3000),
    ("Bob", "Finance", 4000),
    ("Charlie", "HR", 3500),
    ("David@gmail.com", "IT", 4500),
    ("Eve", "Finance", 3800),
    ("Alice", "HR", 5000),
    ("Frank", None, 2900),
]

df = spark.createDataFrame(data, schema=columns)

# Salary bands: below 3500 -> "Low"; from 3500 up to (but excluding) 4000 ->
# "Medium"; every remaining row -> "High".
salary_level = (
    when(col("Salary") < 3500, "Low")
    .when((col("Salary") >= 3500) & (col("Salary") < 4000), "Medium")
    .otherwise("High")
)
df_with_levels = df.withColumn("Salary_Level", salary_level)

# Mean salary per department, exposed under the column name "Average_Salary".
average_salary = (
    df_with_levels
    .groupBy("Department")
    .agg(avg("Salary").alias("Average_Salary"))
)



In [None]:
# Display the input rows exactly as loaded, before any derived columns exist.
print("Original Data:")
df.show()

In [None]:
# Display every row together with its derived "Salary_Level" label.
print("Data with Salary Levels:")
df_with_levels.show()

In [None]:
# Display the per-department mean salary.
# NOTE(review): the row with Department=None presumably shows up as its own
# NULL group here — confirm against the rendered output.
print("Average Salary by Department:")
average_salary.show()

In [None]:
# Shut the session down once all results above have been displayed.
spark.stop()