### Window Aggregation


In [1]:
# Bootstrap step: findspark is initialised here, before pyspark is imported
# in the next cell.
import findspark
findspark.init()
# Last expression of the cell, so its return value (the Spark installation
# path shown in the output below) is displayed.
findspark.find()

'C:\\Program Files\\spark\\spark-3.4.1-bin-hadoop3'

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Get (or reuse) a session named 'Window Aggregation' with master 'local[2]'.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('Window Aggregation')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [3]:
from pyspark.sql.types import StructField,StructType,StringType,IntegerType,DateType,FloatType

# Explicit schema for the transactions CSV, one entry per column in file order.
# Every field keeps the default nullable=True.
schema = (
    StructType()
    .add('txn_id', IntegerType())
    .add('txn_dt', DateType())
    .add('cid', IntegerType())
    .add('amt', FloatType())
    .add('prod_cat', StringType())
    .add('prod', StringType())
    .add('city', StringType())
    .add('state', StringType())
    .add('mode', StringType())
)

In [4]:
# Load the transactions CSV with the explicit schema defined above.
#   header=True        -> the first line of the file holds column names
#   mode='permissive'  -> reader parse mode, set explicitly
#   dateFormat='M-d-y' -> pattern used to parse the txn_dt column
txn_df = (
    spark.read
    .schema(schema)
    .options(header=True, mode='permissive', dateFormat='M-d-y')
    .csv('../data/txns_with_header.csv')
)

In [5]:
# Peek at the first two rows without truncating long string values.
txn_df.show(n=2, truncate=False)

+------+----------+-------+------+------------------+--------------------------+-----------+----------+------+
|txn_id|txn_dt    |cid    |amt   |prod_cat          |prod                      |city       |state     |mode  |
+------+----------+-------+------+------------------+--------------------------+-----------+----------+------+
|0     |2011-06-26|4007024|null  |Exercise & Fitness|Cardio Machine Accessories|Clarksville|Tennessee |credit|
|1     |2011-05-26|4006742|198.44|Exercise & Fitness|Weightlifting Gloves      |Long Beach |California|credit|
+------+----------+-------+------+------------------+--------------------------+-----------+----------+------+
only showing top 2 rows



In [6]:
# Print the DataFrame's column names, types and nullability so they can be
# checked against the schema supplied when reading the CSV.
txn_df.printSchema()

root
 |-- txn_id: integer (nullable = true)
 |-- txn_dt: date (nullable = true)
 |-- cid: integer (nullable = true)
 |-- amt: float (nullable = true)
 |-- prod_cat: string (nullable = true)
 |-- prod: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- mode: string (nullable = true)



In [7]:
from pyspark.sql.functions import sum as fsum
# Total sale amount per (state, city), sorted by state then city.
txn_state_city_sum = (
    txn_df
    .groupBy("state", "city")
    .agg(fsum("amt").alias("totalSale"))
    .orderBy("state", "city")
)
# Display up to 50 of the aggregated rows.
txn_state_city_sum.show(n=50)

+--------------------+----------------+------------------+
|               state|            city|         totalSale|
+--------------------+----------------+------------------+
|             Alabama|      Birmingham|20612.269999027252|
|             Alabama|      Huntsville| 46623.00000619888|
|             Alabama|      Montgomery| 48780.06999254227|
|             Arizona|         Gilbert|40408.460063934326|
|             Arizona|         Phoenix| 45919.95989751816|
|             Arizona|      Scottsdale|44962.910140514374|
|          California|         Anaheim| 45263.18995189667|
|          California|        Berkeley| 43734.15998840332|
|          California|         Fremont| 47498.90001678467|
|          California|          Irvine| 48039.33996915817|
|          California|      Long Beach|52686.700026512146|
|          California|     Los Angeles| 45167.45998287201|
|          California|         Oakland| 49764.80998420715|
|          California|          Orange|  47215.350028991

In [8]:
from pyspark.sql import Window
# Window for a per-state running total: rows are partitioned by state,
# ordered by city, and the frame spans from the first row of the partition
# up to and including the current row.
running_total_window = (
    Window
    .partitionBy('state')
    .orderBy("city")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [9]:
# Add a cumulative sum of totalSale within each state (city order) and
# display the result; the new column is not stored in a variable.
(
    txn_state_city_sum
    .withColumn('RunningTotal', fsum('totalSale').over(running_total_window))
    .show()
)

+----------+-------------+------------------+------------------+
|     state|         city|         totalSale|      RunningTotal|
+----------+-------------+------------------+------------------+
|   Alabama|   Birmingham|20612.269999027252|20612.269999027252|
|   Alabama|   Huntsville| 46623.00000619888| 67235.27000522614|
|   Alabama|   Montgomery| 48780.06999254227| 116015.3399977684|
|   Arizona|      Gilbert|40408.460063934326|40408.460063934326|
|   Arizona|      Phoenix| 45919.95989751816| 86328.41996145248|
|   Arizona|   Scottsdale|44962.910140514374|131291.33010196686|
|California|      Anaheim| 45263.18995189667| 45263.18995189667|
|California|     Berkeley| 43734.15998840332| 88997.34994029999|
|California|      Fremont| 47498.90001678467|136496.24995708466|
|California|       Irvine| 48039.33996915817|184535.58992624283|
|California|   Long Beach|52686.700026512146|237222.28995275497|
|California|  Los Angeles| 45167.45998287201|  282389.749935627|
|California|      Oakland

In [10]:
# Persist the per-state/city totals as Parquet, replacing any existing
# output at the target path.
txn_state_city_sum.write.parquet('../data/txn_state_city_sum', mode='overwrite')

In [11]:
# Persist the same totals as JSON. The target path is identical to the one
# used for the Parquet write, and the save mode is 'overwrite', so the
# previously created Parquet output is replaced by this JSON output.
txn_state_city_sum.write.json('../data/txn_state_city_sum', mode='overwrite')