In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

spark = SparkSession \
    .builder \
    .getOrCreate()

Create Dataframe

In [5]:
data = (("James","Sales","NY",9000,34),
("Alicia","Sales","NY",8600,56),
("Robert","Sales","CA",8100,30),
("John","Sales","AZ",8600,30),
("Ross","Sales","AZ",8100,33),
("Khaty","Sales","AZ",1000,39),
("Lisa","Finance","CA",9000,24),
("Deja","Finance","CA",9900,40),
("Sugie","Finance","NY",8300,36),
("Ram","Finance","NY",7900,53),
("Sugie","Finance","NY",8300,36),
("Kyle","Marketing","CA",8000,25),
("Reid","Marketing","NY",9100,50))

schema=("empname", "dept", "state","salary","age")
df = spark.createDataFrame (data,schema =schema)
df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 30|
|   Ross|    Sales|   AZ|  8100| 33|
|  Khaty|    Sales|   AZ|  1000| 39|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|  Sugie|  Finance|   NY|  8300| 36|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



Ranking Window Functions
- row_number() - Ranking sequencial.
- rank() - Em caso de empate, deixa um gap na sequencia.
- dense_rank() - Em caso de empate, não deixa gap na sequencia.
- percent_rank()
- ntile() - Ranking em grupos
- cume_dist() - Distribuição acumulativa

row_number()

In [6]:
spec = Window.partitionBy(col('dept')).orderBy(col('salary').desc())

df.select('dept', 'salary')\
    .withColumn("row_number_rank", row_number().over(spec))\
    .withColumn("rank", rank().over(spec))\
    .withColumn("dense_rank", dense_rank().over(spec))\
    .withColumn("percent_rank", percent_rank().over(spec))\
    .withColumn("ntile_rank", ntile(3).over(spec))\
    .withColumn("cume_dist_rank", cume_dist().over(spec))\
    .show()

+---------+------+---------------+----+----------+------------+----------+-------------------+
|     dept|salary|row_number_rank|rank|dense_rank|percent_rank|ntile_rank|     cume_dist_rank|
+---------+------+---------------+----+----------+------------+----------+-------------------+
|  Finance|  9900|              1|   1|         1|         0.0|         1|                0.2|
|  Finance|  9000|              2|   2|         2|        0.25|         1|                0.4|
|  Finance|  8300|              3|   3|         3|         0.5|         2|                0.8|
|  Finance|  8300|              4|   3|         3|         0.5|         2|                0.8|
|  Finance|  7900|              5|   5|         4|         1.0|         3|                1.0|
|Marketing|  9100|              1|   1|         1|         0.0|         1|                0.5|
|Marketing|  8000|              2|   2|         2|         1.0|         2|                1.0|
|    Sales|  9000|              1|   1|         1|

Function lead(), lag()

In [17]:
spec = Window.partitionBy(col('dept')).orderBy(col('salary'))
df.select("dept", "salary")\
    .withColumn("lag_prev_salary", lag("salary", 1, 0).over(spec))\
    .withColumn("lead_prev_salary", lead("salary", 2, 9999).over(spec))\
    .show()

+---------+------+---------------+----------------+
|     dept|salary|lag_prev_salary|lead_prev_salary|
+---------+------+---------------+----------------+
|  Finance|  7900|              0|            8300|
|  Finance|  8300|           7900|            9000|
|  Finance|  8300|           8300|            9900|
|  Finance|  9000|           8300|            9999|
|  Finance|  9900|           9000|            9999|
|Marketing|  8000|              0|            9999|
|Marketing|  9100|           8000|            9999|
|    Sales|  1000|              0|            8100|
|    Sales|  8100|           1000|            8600|
|    Sales|  8100|           8100|            8600|
|    Sales|  8600|           8100|            9000|
|    Sales|  8600|           8600|            9999|
|    Sales|  9000|           8600|            9999|
+---------+------+---------------+----------------+



Aggregate Window Functions: </br>
- avg
- sum()
- min()
- max()
- count()
- first()
- last()

sum()

In [32]:
spec = Window.partitionBy("dept")

df.select("dept", "salary")\
    .withColumn("sum_sal_per_dept", sum("salary").over(spec))\
    .withColumn("max_sal_per_dept", max("salary").over(spec))\
    .withColumn("min_sal_per_dept", min("salary").over(spec))\
    .withColumn("count_sal_per_dept", count("salary").over(spec))\
    .show()

+---------+------+----------------+----------------+----------------+------------------+
|     dept|salary|sum_sal_per_dept|max_sal_per_dept|min_sal_per_dept|count_sal_per_dept|
+---------+------+----------------+----------------+----------------+------------------+
|  Finance|  9000|           43400|            9900|            7900|                 5|
|  Finance|  9900|           43400|            9900|            7900|                 5|
|  Finance|  8300|           43400|            9900|            7900|                 5|
|  Finance|  7900|           43400|            9900|            7900|                 5|
|  Finance|  8300|           43400|            9900|            7900|                 5|
|Marketing|  8000|           17100|            9100|            8000|                 2|
|Marketing|  9100|           17100|            9100|            8000|                 2|
|    Sales|  9000|           43400|            9000|            1000|                 6|
|    Sales|  8600|   

first()

In [27]:
spec = Window.partitionBy("dept").orderBy(col("salary").desc())

df.select("dept", "salary")\
    .withColumn("high_salary",first("salary").over(spec))\
    .withColumn("low_salary",last("salary").over(spec))\
    .withColumn("avg_salary", avg("salary").over(spec))\
    .show()

+---------+------+-----------+----------+-----------------+
|     dept|salary|high_salary|low_salary|       avg_salary|
+---------+------+-----------+----------+-----------------+
|  Finance|  9900|       9900|      9900|           9900.0|
|  Finance|  9000|       9900|      9000|           9450.0|
|  Finance|  8300|       9900|      8300|           8875.0|
|  Finance|  8300|       9900|      8300|           8875.0|
|  Finance|  7900|       9900|      7900|           8680.0|
|Marketing|  9100|       9100|      9100|           9100.0|
|Marketing|  8000|       9100|      8000|           8550.0|
|    Sales|  9000|       9000|      9000|           9000.0|
|    Sales|  8600|       9000|      8600|8733.333333333334|
|    Sales|  8600|       9000|      8600|8733.333333333334|
|    Sales|  8100|       9000|      8100|           8480.0|
|    Sales|  8100|       9000|      8100|           8480.0|
|    Sales|  1000|       9000|      1000|7233.333333333333|
+---------+------+-----------+----------