In [0]:
# Problem: find the employees who earn more than their managers

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Sample rows in the order (id, name, salary, managerId).
# managerId is None for the rows that have no manager recorded.
data = [
    (1, "John", 6000, 4),
    (2, "Kevin", 11000, 4),
    (3, "Bob", 8000, 5),
    (4, "Laura", 9000, None),
    (5, "Sarah", 10000, None),
]

# Explicit schema; the third StructField argument says whether the column is nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("salary", IntegerType(), True),
        StructField("managerId", IntegerType(), True),
    ]
)

# Build the DataFrame from the in-memory rows using the schema above.
df = spark.createDataFrame(data, schema=schema)

df.display()

id,name,salary,managerId
1,John,6000,4.0
2,Kevin,11000,4.0
3,Bob,8000,5.0
4,Laura,9000,
5,Sarah,10000,


Self-join: match each employee with their manager

In [0]:
# Self-join: pair every employee row with the row of their manager.
# The inner join drops employees whose managerId is null, because a null
# managerId never equals any id.
join_df = df.alias("employee").join(
    df.alias("manager"),
    on=col("employee.managerId") == col("manager.id"),
    how="inner",
)

join_df.display()

id,name,salary,managerId,id.1,name.1,salary.1,managerId.1
1,John,6000,4,4,Laura,9000,
2,Kevin,11000,4,4,Laura,9000,
3,Bob,8000,5,5,Sarah,10000,


Filter to the rows where the employee's salary is greater than the manager's salary

In [0]:
# Keep only the employee/manager pairs in which the manager earns strictly
# less than the employee (i.e. the employee out-earns the manager).
filter_df = join_df.filter(
    col("manager.salary") < col("employee.salary")
)

filter_df.display()

id,name,salary,managerId,id.1,name.1,salary.1,managerId.1
2,Kevin,11000,4,4,Laura,9000,


Final output: names of the employees who earn more than their manager

In [0]:
# Project only the employee's name from the filtered pairs.
result_df = filter_df.select(col("employee.name"))

result_df.display()

name
Kevin


### Using Spark SQL

In [0]:
# Register df as a temporary view named "employees" so the %sql cells can query it.
df.createOrReplaceTempView("employees")

Approach 1: self-join, then filter with a WHERE clause

In [0]:
%sql
-- Pair each employee (e) with their manager (m) through a self-join on
-- managerId, then keep the pairs where the employee earns strictly more.
SELECT
    e.id     AS employee_id,
    e.name   AS employee_name,
    e.salary AS employee_salary,
    m.id     AS manager_id,
    m.name   AS manager_name,
    m.salary AS manager_salary
FROM employees AS e
INNER JOIN employees AS m
    ON e.managerId = m.id
WHERE e.salary > m.salary

employee_id,employee_name,employee_salary,manager_id,manager_name,manager_salary
2,Kevin,11000,4,Laura,9000


Approach 2: put the salary comparison inside the join condition

In [0]:
%sql
-- Same self-join, but the salary comparison is part of the join condition,
-- and only the employee's name is returned.
SELECT e.name AS employee_name
FROM employees AS e
INNER JOIN employees AS m
    ON m.id = e.managerId
   AND m.salary < e.salary

employee_name
Kevin
