In [0]:
# Sample employee records; each tuple is (id, name, salary, dept, gender).
emp_rows = [
    (1, 'manish', 50000, 'IT', 'm'),
    (2, 'vikash', 60000, 'sales', 'm'),
    (3, 'raushan', 70000, 'marketing', 'm'),
    (4, 'mukesh', 80000, 'IT', 'm'),
    (5, 'priti', 90000, 'sales', 'f'),
    (6, 'nikita', 45000, 'marketing', 'f'),
    (7, 'ragini', 55000, 'marketing', 'f'),
    (8, 'rashi', 100000, 'IT', 'f'),
    (9, 'aditya', 65000, 'IT', 'm'),
    (10, 'rahul', 50000, 'marketing', 'm'),
    (11, 'rakhi', 50000, 'IT', 'f'),
    (12, 'akhilesh', 90000, 'sales', 'm'),
]
emp_columns = ["id", "name", "salary", "dept", "gender"]

# Only column names are supplied, so Spark infers each column's type from the data.
# `spark` is not defined in this notebook -- presumably the SparkSession injected
# by the notebook runtime (TODO confirm).
emp_df = spark.createDataFrame(emp_rows, schema=emp_columns)

emp_df.show()

+---+--------+------+---------+------+
| id|    name|salary|     dept|gender|
+---+--------+------+---------+------+
|  1|  manish| 50000|       IT|     m|
|  2|  vikash| 60000|    sales|     m|
|  3| raushan| 70000|marketing|     m|
|  4|  mukesh| 80000|       IT|     m|
|  5|   priti| 90000|    sales|     f|
|  6|  nikita| 45000|marketing|     f|
|  7|  ragini| 55000|marketing|     f|
|  8|   rashi|100000|       IT|     f|
|  9|  aditya| 65000|       IT|     m|
| 10|   rahul| 50000|marketing|     m|
| 11|   rakhi| 50000|       IT|     f|
| 12|akhilesh| 90000|    sales|     m|
+---+--------+------+---------+------+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *

# Compare row_number(), rank() and dense_rank() per department, lowest salary first.
#
# NOTE: the name `window` shadows `pyspark.sql.functions.window`, which the
# wildcard import above brings into scope. The name is kept unchanged in case
# later cells rely on it.
window = Window.partitionBy("dept").orderBy("salary")

# Several employees share a salary (e.g. 50000 in IT, 90000 in sales). With ties
# in the ordering, row_number() may number the tied rows differently from run to
# run. Adding the unique `id` as a tie-breaker makes the numbering reproducible.
# rank() and dense_rank() deliberately keep the salary-only window so that tied
# salaries still receive equal ranks.
row_number_window = Window.partitionBy("dept").orderBy("salary", "id")

(
    emp_df
    .withColumn("rn", row_number().over(row_number_window))
    .withColumn("rnk", rank().over(window))
    .withColumn("dr", dense_rank().over(window))
    .show(truncate=False)
)

+---+--------+------+---------+------+---+---+---+
|id |name    |salary|dept     |gender|rn |rnk|dr |
+---+--------+------+---------+------+---+---+---+
|1  |manish  |50000 |IT       |m     |1  |1  |1  |
|11 |rakhi   |50000 |IT       |f     |2  |1  |1  |
|9  |aditya  |65000 |IT       |m     |3  |3  |2  |
|4  |mukesh  |80000 |IT       |m     |4  |4  |3  |
|8  |rashi   |100000|IT       |f     |5  |5  |4  |
|6  |nikita  |45000 |marketing|f     |1  |1  |1  |
|10 |rahul   |50000 |marketing|m     |2  |2  |2  |
|7  |ragini  |55000 |marketing|f     |3  |3  |3  |
|3  |raushan |70000 |marketing|m     |4  |4  |4  |
|2  |vikash  |60000 |sales    |m     |1  |1  |1  |
|5  |priti   |90000 |sales    |f     |2  |2  |2  |
|12 |akhilesh|90000 |sales    |m     |3  |2  |2  |
+---+--------+------+---------+------+---+---+---+



In [0]:
"""
Find the top 2 earners in each dept.
dense_rank() gives tied salaries the same rank, so in this particular example 3 rows are returned for the sales dept.
"""
# Rank salaries from highest to lowest within each department.
salary_desc_window = Window.partitionBy("dept").orderBy(col("salary").desc())

# Keep rows whose dense rank is 1 or 2, then drop the helper column before display.
(
    emp_df
    .withColumn("dr", dense_rank().over(salary_desc_window))
    .where(col("dr") <= 2)
    .drop("dr")
    .show()
)

+---+--------+------+---------+------+
| id|    name|salary|     dept|gender|
+---+--------+------+---------+------+
|  8|   rashi|100000|       IT|     f|
|  4|  mukesh| 80000|       IT|     m|
|  3| raushan| 70000|marketing|     m|
|  7|  ragini| 55000|marketing|     f|
|  5|   priti| 90000|    sales|     f|
| 12|akhilesh| 90000|    sales|     m|
|  2|  vikash| 60000|    sales|     m|
+---+--------+------+---------+------+

