In [33]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the session used by every cell below (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("Aggregations")
    .getOrCreate()
)

In [34]:
# Read every retail CSV (first row is the header, column types are inferred)
# and reduce the result to five partitions.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/data/SparkDefinitive/data/retail-data/all/*.csv")
    .coalesce(5)
)

# Mark the frame for caching and register it so SQL cells can query `dfTable`.
df.cache()
df.createOrReplaceTempView("dfTable")

Window Functions

In [35]:
from pyspark.sql.functions import col, to_date

# Derive a `date` column from the raw `InvoiceDate` string.
# The month is written as a single `M`, which accepts one- or two-digit
# months; `MM` demands exactly two digits under Spark 3's default datetime
# parser, while `M` parses under both the default and the legacy parser.
# NOTE(review): assumes InvoiceDate looks like "1/18/2011 10:01" -- confirm
# against the raw CSV.
dfWithDate = df.withColumn(
    "date",
    to_date(col("InvoiceDate"), "M/d/yyyy H:mm"),
)

# Register the enriched frame so SQL cells can query `dfWithDate`.
dfWithDate.createOrReplaceTempView("dfWithDate")

In [36]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
# One window per (customer, day), largest quantities first; the frame runs
# from the start of the partition up to and including the current row.
windowSpec = (
    Window
    .partitionBy("CustomerId", "date")
    .orderBy(desc("Quantity"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [37]:
from pyspark.sql.functions import max
# Running maximum of Quantity over `windowSpec`.
# NOTE: `max` here is the Spark aggregate imported in this cell, which shadows
# Python's builtin `max` for the rest of the notebook.
maxPurchaseQuantity = max("Quantity").over(windowSpec)

In [38]:
from pyspark.sql.functions import dense_rank, rank
# Position of each row within its `windowSpec` partition (ordered by Quantity,
# descending). `rank` leaves gaps after ties; `dense_rank` does not.
purchaseRank = rank().over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)

In [47]:
# Switch this session's datetime parsing to the LEGACY policy.
# NOTE(review): presumably required so the `to_date` call that builds
# `dfWithDate` can parse `InvoiceDate` -- confirm, and if so run this before
# that cell on a fresh kernel.
spark.sql(
    "set spark.sql.legacy.timeParserPolicy = LEGACY"
)

DataFrame[key: string, value: string]

In [54]:
from pyspark.sql.functions import col
# Show, for customers with a known id, each purchase's rank, dense rank and
# running maximum quantity within its (customer, day) window.
#
# The sort is applied AFTER the window columns are computed: evaluating a
# window redistributes rows by its partition keys, so an `orderBy` placed
# before the `select` is not guaranteed to survive into the displayed rows.
# `date` and `quantityRank` are added as tie-breakers so the preview is stable.
(
    dfWithDate
    .where("CustomerId IS NOT NULL")
    .select(
        col("CustomerId"),
        col("date"),
        col("Quantity"),
        purchaseRank.alias("quantityRank"),
        purchaseDenseRank.alias("quantityDenseRank"),
        maxPurchaseQuantity.alias("maxPurchaseQuantity"),
    )
    .orderBy("CustomerId", "date", "quantityRank")
    .show(2)
)

+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|     12346|2011-01-18|   74215|           1|                1|              74215|
|     12346|2011-01-18|  -74215|           2|                2|              74215|
+----------+----------+--------+------------+-----------------+-------------------+
only showing top 2 rows



In [55]:
# SQL equivalent of the DataFrame window query above.
# `rank()` and `dense_rank()` are called without arguments: their ordering
# comes from the window's ORDER BY, and the argument slot is documented by
# Spark as an internal parameter filled in by the analyser.
# No trailing `;` is needed because `spark.sql` receives a single statement.
spark.sql('''
    SELECT CustomerId, date, Quantity,
        rank() OVER w as rank,
        dense_rank() OVER w as dRank,
        max(Quantity) OVER w as maxPurchase
    FROM dfWithDate WHERE CustomerId IS NOT NULL ORDER BY CustomerId
    WINDOW w AS (PARTITION BY CustomerId, date
            ORDER BY Quantity DESC NULLS LAST
            ROWS BETWEEN
            UNBOUNDED PRECEDING AND CURRENT ROW)
''').show(3)

+----------+----------+--------+----+-----+-----------+
|CustomerId|      date|Quantity|rank|dRank|maxPurchase|
+----------+----------+--------+----+-----+-----------+
|     12346|2011-01-18|   74215|   1|    1|      74215|
|     12346|2011-01-18|  -74215|   2|    2|      74215|
|     12347|2010-12-07|      36|   1|    1|         36|
+----------+----------+--------+----+-----+-----------+
only showing top 3 rows

