In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("SelfJoinExample").getOrCreate()

# Sample data for the employees
data = [
    (1, "John", 1000, 2),
    (2, "Doe", 1200, 3),
    (3, "Jane", 1500, None),
    (4, "Dave", 900, 2),
    (5, "Alice", 1100, 1)
]

# Create DataFrame from sample data
columns = ["emp_id", "emp_name", "sal", "manager_id"]
emp_df = spark.createDataFrame(data, columns)

# Perform a self join to compare employee salary with manager salary
self_join_df = emp_df.alias("emp").join(
    emp_df.alias("manager"),
    col("emp.manager_id") == col("manager.emp_id"),
    "inner"
)

# Filter rows where employee salary is greater than manager salary
result_df = self_join_df.filter(col("emp.sal") > col("manager.sal"))

# Show the result
result_df.select("emp.emp_id", "emp.emp_name", "emp.sal", "manager.emp_name", "manager.sal").show()




+------+--------+----+--------+----+
|emp_id|emp_name| sal|emp_name| sal|
+------+--------+----+--------+----+
|     5|   Alice|1100|    John|1000|
+------+--------+----+--------+----+



In [3]:
result_df.columns

['emp_id',
 'emp_name',
 'sal',
 'manager_id',
 'emp_id',
 'emp_name',
 'sal',
 'manager_id']

In [4]:
result_df.first

<bound method DataFrame.first of DataFrame[emp_id: bigint, emp_name: string, sal: bigint, manager_id: bigint, emp_id: bigint, emp_name: string, sal: bigint, manager_id: bigint]>

In [5]:
self_join_df

DataFrame[emp_id: bigint, emp_name: string, sal: bigint, manager_id: bigint, emp_id: bigint, emp_name: string, sal: bigint, manager_id: bigint]

In [6]:
self_join_df.printSchema()

root
 |-- emp_id: long (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- sal: long (nullable = true)
 |-- manager_id: long (nullable = true)
 |-- emp_id: long (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- sal: long (nullable = true)
 |-- manager_id: long (nullable = true)



In [7]:
data = [
    (1, "John", 1000, 2),
    (2, "Doe", 1200, 3),
    (3, "Jane", 1500, None),
    (4, "Dave", 900, 2),
    (5, "Alice", 1100, 1)
]

# Create DataFrame from sample data
columns = ["emp_id", "emp_name", "sal", "manager_id"]
emp_df = spark.createDataFrame(data, columns)

In [8]:
emp_df.show()

+------+--------+----+----------+
|emp_id|emp_name| sal|manager_id|
+------+--------+----+----------+
|     1|    John|1000|         2|
|     2|     Doe|1200|         3|
|     3|    Jane|1500|      null|
|     4|    Dave| 900|         2|
|     5|   Alice|1100|         1|
+------+--------+----+----------+



In [11]:
self_join_df = emp_df.alias("emp").join(
    emp_df.alias("manager"),
    col("emp.manager_id") == col("manager.emp_id"),
    "inner"
)

# self join used in pyspark

emp_df = emp_df.alias("emp1").join(emp_df.alias("emp2"), col("emp1.emp_id") == col("emp2.manager_id"), "inner")
emp_df.show()

+------+--------+----+----------+------+--------+----+----------+
|emp_id|emp_name| sal|manager_id|emp_id|emp_name| sal|manager_id|
+------+--------+----+----------+------+--------+----+----------+
|     1|    John|1000|         2|     5|   Alice|1100|         1|
|     2|     Doe|1200|         3|     1|    John|1000|         2|
|     2|     Doe|1200|         3|     4|    Dave| 900|         2|
|     3|    Jane|1500|      null|     2|     Doe|1200|         3|
+------+--------+----+----------+------+--------+----+----------+



In [1]:
spark.stop()

NameError: name 'spark' is not defined