In [4]:
# Window Function
# rank()
# dense_rank()
# row_number()
# percent_rank()
# ntile(n)
# lag(col,offset,default)
# lead(col,offset,default)
# cume_dist()
# nth_value(col,offset,ignoreNulls)
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import *

# Spark session running against the "local" master.
spark = SparkSession.builder.master("local").getOrCreate()

# One row per (student, subject). The data contains tied scores
# (Art: 60/60, Science: 10/10), which is what makes the ranking
# functions in the following cells produce different results.
score_columns = ["NAME", "CLASS", "SUBJECT", "SCORE"]
score_rows = [
    ["Student A", 1, "Science", 10],
    ["Student A", 1, "Art", 20],
    ["Student B", 1, "Science", 10],
    ["Student B", 1, "Art", 40],
    ["Student C", 2, "Science", 50],
    ["Student C", 2, "Art", 60],
    ["Student D", 2, "Science", 70],
    ["Student D", 2, "Art", 60],
]
df = spark.createDataFrame(score_rows, score_columns)

# Rank every student's score within their subject, highest score first.
# rank(): tied scores share a rank and the following rank numbers are skipped.
spec = Window.partitionBy(df["SUBJECT"]).orderBy(df["SCORE"].desc())
ranked_scores = df.withColumn("RANK", rank().over(spec))
ranked_scores.show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|   1|
|Student D|    2|    Art|   60|   1|
|Student B|    1|    Art|   40|   3|
|Student A|    1|    Art|   20|   4|
|Student D|    2|Science|   70|   1|
|Student C|    2|Science|   50|   2|
|Student A|    1|Science|   10|   3|
|Student B|    1|Science|   10|   3|
+---------+-----+-------+-----+----+



In [5]:
# dense_rank(): tied scores share a rank and the next distinct score gets
# the very next rank number (no gaps), per subject, highest score first.
spec = Window.partitionBy(df["SUBJECT"]).orderBy(df["SCORE"].desc())
dense_ranked_scores = df.withColumn("RANK", dense_rank().over(spec))
dense_ranked_scores.show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|   1|
|Student D|    2|    Art|   60|   1|
|Student B|    1|    Art|   40|   2|
|Student A|    1|    Art|   20|   3|
|Student D|    2|Science|   70|   1|
|Student C|    2|Science|   50|   2|
|Student A|    1|Science|   10|   3|
|Student B|    1|Science|   10|   3|
+---------+-----+-------+-----+----+



In [6]:
# row_number(): consecutive numbering 1..N per subject, highest score first.
# The data has tied scores, and row_number() must still hand the tied rows
# different numbers; ordering by SCORE alone leaves that choice to Spark.
# NAME is added as a tie-breaker so the numbering is the same on every run.
spec = Window.partitionBy(df.SUBJECT).orderBy(df.SCORE.desc(), df.NAME)
df.withColumn("RANK", row_number().over(spec)).show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|   1|
|Student D|    2|    Art|   60|   2|
|Student B|    1|    Art|   40|   3|
|Student A|    1|    Art|   20|   4|
|Student D|    2|Science|   70|   1|
|Student C|    2|Science|   50|   2|
|Student A|    1|Science|   10|   3|
|Student B|    1|Science|   10|   4|
+---------+-----+-------+-----+----+



In [7]:
# percent_rank(): relative rank of each score within its subject, scaled
# to the range 0.0 (top score) .. 1.0 (last score); tied scores get the
# same value.
spec = Window.partitionBy(df["SUBJECT"]).orderBy(df["SCORE"].desc())
percent_ranked_scores = df.withColumn("RANK", percent_rank().over(spec))
percent_ranked_scores.show()

+---------+-----+-------+-----+------------------+
|     NAME|CLASS|SUBJECT|SCORE|              RANK|
+---------+-----+-------+-----+------------------+
|Student C|    2|    Art|   60|               0.0|
|Student D|    2|    Art|   60|               0.0|
|Student B|    1|    Art|   40|0.6666666666666666|
|Student A|    1|    Art|   20|               1.0|
|Student D|    2|Science|   70|               0.0|
|Student C|    2|Science|   50|0.3333333333333333|
|Student A|    1|Science|   10|0.6666666666666666|
|Student B|    1|Science|   10|0.6666666666666666|
+---------+-----+-------+-----+------------------+



In [8]:
# ntile(3): split each subject's rows, highest score first, into 3 buckets
# numbered 1..3. Bucket membership depends on row position, so tied scores
# can land in different buckets; NAME is added as a tie-breaker so the
# assignment does not depend on the arbitrary order of tied rows.
spec = Window.partitionBy(df.SUBJECT).orderBy(df.SCORE.desc(), df.NAME)
df.withColumn("RANK", ntile(3).over(spec)).show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|   1|
|Student D|    2|    Art|   60|   1|
|Student B|    1|    Art|   40|   2|
|Student A|    1|    Art|   20|   3|
|Student D|    2|Science|   70|   1|
|Student C|    2|Science|   50|   1|
|Student A|    1|Science|   10|   2|
|Student B|    1|Science|   10|   3|
+---------+-----+-------+-----+----+



In [9]:
# lag(SCORE, 1, None): the score of the previous row in the same subject
# (highest score first); the first row of each subject has no predecessor
# and receives the default, None (shown as null).
# "Previous row" is only well defined with a total order, so NAME is added
# as a tie-breaker for students with equal scores.
spec = Window.partitionBy(df.SUBJECT).orderBy(df.SCORE.desc(), df.NAME)
df.withColumn("RANK", lag(df.SCORE, 1, None).over(spec)).show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|null|
|Student D|    2|    Art|   60|  60|
|Student B|    1|    Art|   40|  60|
|Student A|    1|    Art|   20|  40|
|Student D|    2|Science|   70|null|
|Student C|    2|Science|   50|  70|
|Student A|    1|Science|   10|  50|
|Student B|    1|Science|   10|  10|
+---------+-----+-------+-----+----+



In [10]:
# lead(SCORE, 1, None): the score of the next row in the same subject
# (highest score first); the last row of each subject has no successor
# and receives the default, None (shown as null).
# "Next row" is only well defined with a total order, so NAME is added
# as a tie-breaker for students with equal scores.
spec = Window.partitionBy(df.SUBJECT).orderBy(df.SCORE.desc(), df.NAME)
df.withColumn("RANK", lead(df.SCORE, 1, None).over(spec)).show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|  60|
|Student D|    2|    Art|   60|  40|
|Student B|    1|    Art|   40|  20|
|Student A|    1|    Art|   20|null|
|Student D|    2|Science|   70|  50|
|Student C|    2|Science|   50|  10|
|Student A|    1|Science|   10|  10|
|Student B|    1|Science|   10|null|
+---------+-----+-------+-----+----+



In [11]:
# cume_dist(): for each row, the fraction of rows in its subject that sort
# at or before it (highest score first); tied scores get the same value
# and the last row of each subject is always 1.0.
spec = Window.partitionBy(df["SUBJECT"]).orderBy(df["SCORE"].desc())
cumulative_scores = df.withColumn("RANK", cume_dist().over(spec))
cumulative_scores.show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60| 0.5|
|Student D|    2|    Art|   60| 0.5|
|Student B|    1|    Art|   40|0.75|
|Student A|    1|    Art|   20| 1.0|
|Student D|    2|Science|   70|0.25|
|Student C|    2|Science|   50| 0.5|
|Student A|    1|Science|   10| 1.0|
|Student B|    1|Science|   10| 1.0|
+---------+-----+-------+-----+----+



In [12]:
# nth_value(SCORE, 1, False): the 1st value of the window for each row,
# without skipping nulls. With offset 1 and scores sorted highest first,
# every row shows the top score of its subject.
# NOTE(review): no explicit frame is set on this window; for offsets > 1
# the result depends on the frame in effect - check the Spark docs before
# reusing this spec with a larger offset.
spec = Window.partitionBy(df["SUBJECT"]).orderBy(df["SCORE"].desc())
first_scores = df.withColumn("RANK", nth_value(df["SCORE"], 1, False).over(spec))
first_scores.show()

+---------+-----+-------+-----+----+
|     NAME|CLASS|SUBJECT|SCORE|RANK|
+---------+-----+-------+-----+----+
|Student C|    2|    Art|   60|  60|
|Student D|    2|    Art|   60|  60|
|Student B|    1|    Art|   40|  60|
|Student A|    1|    Art|   20|  60|
|Student D|    2|Science|   70|  70|
|Student C|    2|Science|   50|  70|
|Student A|    1|Science|   10|  70|
|Student B|    1|Science|   10|  70|
+---------+-----+-------+-----+----+

