In [None]:
# Section 1: Setup and Data Preparation

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import avg, count, col, when, rank
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window

# Start the SparkSession used throughout this notebook
# (getOrCreate reuses a session that is already running, if there is one).
spark = (
    SparkSession.builder
    .appName("employee_work_data_analysis")
    .getOrCreate()
)

# Simulated employee records: one dict per employee, all with the same six fields.
employee_records = [
    {"emp_id": 101, "name": "Ravi", "department": "engineering", "project": "ai engine", "salary": 95000, "hours_per_week": 42},
    {"emp_id": 102, "name": "Sneha", "department": "engineering", "project": "data platform", "salary": 87000, "hours_per_week": 45},
    {"emp_id": 103, "name": "Kabir", "department": "marketing", "project": "product launch", "salary": 65000, "hours_per_week": 40},
    {"emp_id": 104, "name": "Anita", "department": "sales", "project": "client outreach", "salary": 70000, "hours_per_week": 38},
    {"emp_id": 105, "name": "Divya", "department": "engineering", "project": "ai engine", "salary": 99000, "hours_per_week": 48},
    {"emp_id": 106, "name": "Amit", "department": "marketing", "project": "social media", "salary": 62000, "hours_per_week": 35},
    {"emp_id": 107, "name": "Priya", "department": "hr", "project": "policy revamp", "salary": 58000, "hours_per_week": 37},
    {"emp_id": 108, "name": "Manav", "department": "sales", "project": "lead gen", "salary": 73000, "hours_per_week": 41},
    {"emp_id": 109, "name": "Neha", "department": "engineering", "project": "security suite", "salary": 91000, "hours_per_week": 46},
    {"emp_id": 110, "name": "Farah", "department": "hr", "project": "onboarding", "salary": 60000, "hours_per_week": 36},
]

# Row(**record) is the same as spelling the fields out as keyword arguments.
data = [Row(**record) for record in employee_records]

# Build the DataFrame from the rows and print it without truncating cell values.
df = spark.createDataFrame(data)
df.show(truncate=False)

In [None]:
# Section 2: Creating Views
# Register the employee DataFrame under two names so it can be queried with SQL.

# Local temp view: later queries refer to it by its bare name, `employees_local`.
df.createOrReplaceTempView("employees_local")

# Global temp view: later queries refer to it as `global_temp.employees_global`.
df.createOrReplaceGlobalTempView("employees_global")

In [None]:
# Part A: Queries on Local View
# Each query result is bound to a named DataFrame and then displayed.

# 1. Everyone assigned to the "ai engine" project.
ai_engine_staff = spark.sql("""
    SELECT * 
    FROM employees_local 
    WHERE project = 'ai engine'
""")
ai_engine_staff.show()

# 2. Marketing employees earning more than 60000.
marketing_over_60k = spark.sql("""
    SELECT * 
    FROM employees_local 
    WHERE department = 'marketing' AND salary > 60000
""")
marketing_over_60k.show()

# 3. Mean salary for each department.
avg_salary_by_department = spark.sql("""
    SELECT department, AVG(salary) AS avg_salary 
    FROM employees_local 
    GROUP BY department
""")
avg_salary_by_department.show()

# 4. The three employees with the highest salaries.
top_three_earners = spark.sql("""
    SELECT * 
    FROM employees_local 
    ORDER BY salary DESC 
    LIMIT 3
""")
top_three_earners.show()

# 5. Employees working more than 40 hours per week.
over_40_hours = spark.sql("""
    SELECT * 
    FROM employees_local 
    WHERE hours_per_week > 40
""")
over_40_hours.show()

# 6. Headcount for each project.
headcount_by_project = spark.sql("""
    SELECT project, COUNT(*) AS employee_count 
    FROM employees_local 
    GROUP BY project
""")
headcount_by_project.show()

# 7. Drop the local view, then query it again to show that it is gone.
spark.catalog.dropTempView("employees_local")
try:
    spark.sql("SELECT * FROM employees_local").show()
except AnalysisException as missing_view_error:
    # Expected outcome: the view was just dropped, so Spark cannot resolve the
    # name. Catching the error keeps "Run All" going while still displaying it.
    print(f"employees_local is no longer queryable: {missing_view_error}")

In [None]:
# Part B: Queries on Global View
# The global view is always addressed through the `global_temp` database.

# 1. HR employees working fewer than 38 hours per week.
hr_under_38_hours = spark.sql("""
    SELECT * 
    FROM global_temp.employees_global 
    WHERE department = 'hr' AND hours_per_week < 38
""")
hr_under_38_hours.show()

# 2. Salary total for each department.
total_salary_by_department = spark.sql("""
    SELECT department, SUM(salary) AS total_salary 
    FROM global_temp.employees_global 
    GROUP BY department
""")
total_salary_by_department.show()

# 3. Add a `status` column: 'overworked' above 45 hours per week, otherwise 'normal'.
df_with_status = spark.sql("""
    SELECT *, 
        CASE 
            WHEN hours_per_week > 45 THEN 'overworked' 
            ELSE 'normal' 
        END AS status 
    FROM global_temp.employees_global
""")
df_with_status.show()

# 4. Headcount for each project.
total_employees_by_project = spark.sql("""
    SELECT project, COUNT(*) AS total_employees 
    FROM global_temp.employees_global 
    GROUP BY project
""")
total_employees_by_project.show()

# 5. Employees paid more than the average salary of their own department.
df_global = spark.table("global_temp.employees_global")

# One row per department, carrying that department's average salary.
dept_avg = df_global.groupBy("department").agg(
    avg("salary").alias("dept_avg_salary")
)

# Attach the department average to every employee, keep those paid above it,
# and leave the helper column out of the final result.
above_avg = (
    df_global.join(dept_avg, on="department")
    .where(col("salary") > col("dept_avg_salary"))
    .select("emp_id", "name", "department", "salary")
)
above_avg.show()

# 6. Open a new session and query the global view from it.
# SparkSession.builder.getOrCreate() would hand back the session that is
# already running, so it cannot demonstrate cross-session visibility.
# newSession() returns a separate session that shares the same SparkContext.
new_spark = spark.newSession()
new_spark.sql("SELECT * FROM global_temp.employees_global").show()

In [None]:
# Bonus Challenges

# 1. Rank employees by salary inside each department, highest salary first.
highest_salary_first = col("salary").desc()
window_spec = Window.partitionBy("department").orderBy(highest_salary_first)
df_ranked = df_global.withColumn("salary_rank", rank().over(window_spec))
ranked_columns = ["emp_id", "name", "department", "salary", "salary_rank"]
df_ranked.select(*ranked_columns).show()

# 2. Temp view restricted to the engineering department.
df_engineering = df_global.where(col("department") == "engineering")
df_engineering.createOrReplaceTempView("engineering_employees")
spark.sql("SELECT * FROM engineering_employees").show()

# 3. Temp view of "active" employees, i.e. those working at least 38 hours per week.
df_active = df_global.where(col("hours_per_week") >= 38)
df_active.createOrReplaceTempView("active_employees")
spark.sql("SELECT * FROM active_employees").show()