# Window functions

In [1]:
marks=[
    "김하나, English, 100",
    "김하나, Math, 80",
    "임하나, English, 70",
    "임하나, Math, 100",
    "김갑돌, English, 82.3",
    "김갑돌, Math, 98.5"
]

In [3]:
import findspark
findspark.init()
import pyspark
myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .getOrCreate()

In [4]:
_marksRdd=spark.sparkContext.parallelize(marks).map(lambda x:x.split(','))

In [5]:

_marksDf=spark.createDataFrame(_marksRdd, schema=["name", "subject", "mark"])

In [6]:
_marksDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- mark: string (nullable = true)



In [7]:
_marksDf.show()

+------+--------+-----+
|  name| subject| mark|
+------+--------+-----+
|김하나| English|  100|
|김하나|    Math|   80|
|임하나| English|   70|
|임하나|    Math|  100|
|김갑돌| English| 82.3|
|김갑돌|    Math| 98.5|
+------+--------+-----+



In [8]:
_marksDf.groupBy('subject').count().show()

+--------+-----+
| subject|count|
+--------+-----+
|    Math|    3|
| English|    3|
+--------+-----+



### partitionBy, orderBy, frame

In [9]:
from pyspark.sql.window import Window

win = Window.partitionBy("subject").orderBy("mark")

In [10]:
from pyspark.sql.functions import row_number

_marksDf.withColumn("row_number", row_number().over(win)).show()

+------+--------+-----+----------+
|  name| subject| mark|row_number|
+------+--------+-----+----------+
|임하나|    Math|  100|         1|
|김하나|    Math|   80|         2|
|김갑돌|    Math| 98.5|         3|
|김하나| English|  100|         1|
|임하나| English|   70|         2|
|김갑돌| English| 82.3|         3|
+------+--------+-----+----------+



In [14]:
from pyspark.sql.functions import rank

_marksDf.withColumn("rank", rank().over(win)).show()

+------+--------+-----+----+
|  name| subject| mark|rank|
+------+--------+-----+----+
|임하나|    Math|  100|   1|
|김하나|    Math|   80|   2|
|김갑돌|    Math| 98.5|   3|
|김하나| English|  100|   1|
|임하나| English|   70|   2|
|김갑돌| English| 82.3|   3|
+------+--------+-----+----+



In [15]:
from pyspark.sql.types import FloatType

_marksDf = _marksDf.withColumn('markF', _marksDf['mark'].cast(FloatType()))

In [16]:
from pyspark.sql import functions as F
winF = Window.partitionBy("subject").orderBy(F.col("markF").desc()) #

In [17]:
from pyspark.sql.functions import row_number

_marksDf.withColumn("row_number", row_number().over(winF)).show()

+------+--------+-----+-----+----------+
|  name| subject| mark|markF|row_number|
+------+--------+-----+-----+----------+
|임하나|    Math|  100|100.0|         1|
|김갑돌|    Math| 98.5| 98.5|         2|
|김하나|    Math|   80| 80.0|         3|
|김하나| English|  100|100.0|         1|
|김갑돌| English| 82.3| 82.3|         2|
|임하나| English|   70| 70.0|         3|
+------+--------+-----+-----+----------+



In [18]:
from pyspark.sql.functions import rank

_marksDf.withColumn("rank", rank().over(winF)).show()

+------+--------+-----+-----+----+
|  name| subject| mark|markF|rank|
+------+--------+-----+-----+----+
|임하나|    Math|  100|100.0|   1|
|김갑돌|    Math| 98.5| 98.5|   2|
|김하나|    Math|   80| 80.0|   3|
|김하나| English|  100|100.0|   1|
|김갑돌| English| 82.3| 82.3|   2|
|임하나| English|   70| 70.0|   3|
+------+--------+-----+-----+----+



In [19]:
from pyspark.sql.functions import cume_dist

_marksDf.withColumn("cume_dist", cume_dist().over(winF)).show()

+------+--------+-----+-----+------------------+
|  name| subject| mark|markF|         cume_dist|
+------+--------+-----+-----+------------------+
|임하나|    Math|  100|100.0|0.3333333333333333|
|김갑돌|    Math| 98.5| 98.5|0.6666666666666666|
|김하나|    Math|   80| 80.0|               1.0|
|김하나| English|  100|100.0|0.3333333333333333|
|김갑돌| English| 82.3| 82.3|0.6666666666666666|
|임하나| English|   70| 70.0|               1.0|
+------+--------+-----+-----+------------------+



In [20]:
from pyspark.sql.functions import lag

_marksDf.withColumn("lag", lag('mark', 1).over(winF)).show()

+------+--------+-----+-----+-----+
|  name| subject| mark|markF|  lag|
+------+--------+-----+-----+-----+
|임하나|    Math|  100|100.0| null|
|김갑돌|    Math| 98.5| 98.5|  100|
|김하나|    Math|   80| 80.0| 98.5|
|김하나| English|  100|100.0| null|
|김갑돌| English| 82.3| 82.3|  100|
|임하나| English|   70| 70.0| 82.3|
+------+--------+-----+-----+-----+



In [21]:
from pyspark.sql.functions import lead

_marksDf.withColumn("lag", lead('mark', 1).over(winF)).show()

+------+--------+-----+-----+-----+
|  name| subject| mark|markF|  lag|
+------+--------+-----+-----+-----+
|임하나|    Math|  100|100.0| 98.5|
|김갑돌|    Math| 98.5| 98.5|   80|
|김하나|    Math|   80| 80.0| null|
|김하나| English|  100|100.0| 82.3|
|김갑돌| English| 82.3| 82.3|   70|
|임하나| English|   70| 70.0| null|
+------+--------+-----+-----+-----+



## Aggregate Functions

In [22]:
winAgg  = Window.partitionBy("subject")

In [23]:
from pyspark.sql import functions as F
_marksDf.withColumn("avg", F.avg(F.col("markF")).over(winAgg))\
    .withColumn("sum", F.sum(F.col("markF")).over(winAgg))\
    .withColumn("min", F.min(F.col("markF")).over(winAgg))\
    .withColumn("max", F.max(F.col("markF")).over(winAgg))\
    .show()

+------+--------+-----+-----+-----------------+-----------------+----+-----+
|  name| subject| mark|markF|              avg|              sum| min|  max|
+------+--------+-----+-----+-----------------+-----------------+----+-----+
|김하나|    Math|   80| 80.0|92.83333333333333|            278.5|80.0|100.0|
|임하나|    Math|  100|100.0|92.83333333333333|            278.5|80.0|100.0|
|김갑돌|    Math| 98.5| 98.5|92.83333333333333|            278.5|80.0|100.0|
|김하나| English|  100|100.0|84.10000101725261|252.3000030517578|70.0|100.0|
|임하나| English|   70| 70.0|84.10000101725261|252.3000030517578|70.0|100.0|
|김갑돌| English| 82.3| 82.3|84.10000101725261|252.3000030517578|70.0|100.0|
+------+--------+-----+-----+-----------------+-----------------+----+-----+

