%md

Here are the common aggregate functions that are available as part of `pyspark.sql.functions`
* `count`
* `sum`
* `min`
* `max`
* `avg`

Spark also supports some statistical functions such as stddev.

In [0]:
## aggregations functions 
orders = spark.read.json('/FileStore/tables/orders-4.json')

In [0]:
%fs ls /FileStore/tables/orders-4.json


path,name,size,modificationTime
dbfs:/FileStore/tables/orders-4.json,orders-4.json,7477339,1670835162000


In [0]:
orders.printSchema()

root
 |-- order_customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [0]:
order_items = spark.read.json('/FileStore/tables/ordersItems.json')

In [0]:
order_items.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [0]:
from pyspark.sql.functions import count
orders.select(count("*")).show()
## note : above will count the no of rows 
orders.groupBy('order_status').agg(count("*")).show()
## this will do  first aggregation groupBy on order_status  then count on orders_status


+--------+
|count(1)|
+--------+
|   68883|
+--------+

+---------------+--------+
|   order_status|count(1)|
+---------------+--------+
|PENDING_PAYMENT|   15030|
|       COMPLETE|   22899|
|        ON_HOLD|    3798|
| PAYMENT_REVIEW|     729|
|     PROCESSING|    8275|
|         CLOSED|    7556|
|SUSPECTED_FRAUD|    1558|
|        PENDING|    7610|
|       CANCELED|    1428|
+---------------+--------+



In [0]:
# Function count on data frame is action. It will trigger execution.
order_items.count()
from pyspark.sql.functions import count
order_items.select(count("*")).show()

+--------+
|count(1)|
+--------+
|  172198|
+--------+



In [0]:
## group by function overview 
# groupBy(*cols) method of pyspark.sql.dataframe.DataFrame instance
#     Groups the :class:`DataFrame` using the specified columns,
#     so we can run aggregation on them.

In [0]:
order_items.count()

Out[13]: 172198

In [0]:
order_items.show(2,False)
order_items.describe().show()

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|1            |1                  |957                  |299.98                  |1                  |299.98             |
|2            |2                  |1073                 |199.99                  |1                  |199.99             |
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
only showing top 2 rows

+-------+-----------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|summary|    order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_it

In [0]:
order_items.groupBy().min().show()
#  note : this will do grouping on all column and give min value of each columns
## min is aggregate function used over groupBy

+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|min(order_item_id)|min(order_item_order_id)|min(order_item_product_id)|min(order_item_product_price)|min(order_item_quantity)|min(order_item_subtotal)|
+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|                 1|                       1|                        19|                         9.99|                       1|                    9.99|
+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+



In [0]:
order_items_grouped = order_items.groupBy()

type(order_items_grouped)
## grouped data is of type GroupedData
## note Imp: createing groupedData Type

Out[30]: pyspark.sql.group.GroupedData

In [0]:
## we can perform direct aggregation of GroupedData type 
order_items_grouped. \
    count(). \
    withColumnRenamed('count', 'order_count'). \
    show()

+-----------+
|order_count|
+-----------+
|     172198|
+-----------+



In [0]:
# Get sum of all numeric fields
order_items_grouped. \
    sum(). \
    show()

+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|sum(order_item_id)|sum(order_item_order_id)|sum(order_item_product_id)|sum(order_item_product_price)|sum(order_item_quantity)|sum(order_item_subtotal)|
+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|       14826161701|              5930941122|                 113734664|         2.3033043690006454E7|                  375758|     3.432261993000947E7|
+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+



In [0]:
order_items_grouped = order_items. \
    select('order_item_order_id', 'order_item_quantity', 'order_item_subtotal'). \
    groupBy('order_item_order_id')
help(order_items_grouped.sum)

Help on method sum in module pyspark.sql.group:

sum(*cols) method of pyspark.sql.group.GroupedData instance
    Computes the sum for each numeric columns for each group.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str
        column names. Non-numeric columns are ignored.
    
    Examples
    --------
    >>> df.groupBy().sum('age').collect()
    [Row(sum(age)=7)]
    >>> df3.groupBy().sum('age', 'height').collect()
    [Row(sum(age)=7, sum(height)=165)]



In [0]:
# Gets sum on order_item_order_id as well
# It is not relevant and better to discard aggregation on key fields such as order_item_order_id
order_items_grouped. \
    sum(). \
    show()

+-------------------+------------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_order_id)|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+------------------------+
|                 29|                     145|                       9|                 1109.85|
|                474|                    2370|                      13|       774.8199999999999|
|                964|                    3856|                      11|       739.8800000000001|
|               1677|                    8385|                      14|       649.9200000000001|
|               1806|                    5418|                       8|                  789.94|
|               1950|                    9750|                      12|      1015.8700000000001|
|               2214|                    4428|                       5|                  449.96|
|               2250|         

In [0]:
## doing aggregation on spectfic data 
# Consider only order_item_quantity and order_item_subtotal
order_items_grouped. \
    sum('order_item_quantity', 'order_item_subtotal'). \
    show(4,False)

+-------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+
|29                 |9                       |1109.85                 |
|474                |13                      |774.8199999999999       |
|964                |11                      |739.8800000000001       |
|1677               |14                      |649.9200000000001       |
+-------------------+------------------------+------------------------+
only showing top 4 rows



In [0]:
order_items_grouped. \
    sum('order_item_quantity', 'order_item_subtotal'). \
    toDF('order_item_order_id', 'order_quantity', 'order_revenue'). \
    printSchema()
## grouping data and renaming the columns 
order_items_grouped. \
    sum('order_item_quantity', 'order_item_subtotal'). \
    toDF('order_item_order_id', 'order_quantity', 'order_revenue').show(2)
## note: above sum/aggregation  is only happning on  two columns 

root
 |-- order_item_order_id: long (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- order_revenue: double (nullable = true)

+-------------------+--------------+-----------------+
|order_item_order_id|order_quantity|    order_revenue|
+-------------------+--------------+-----------------+
|                 29|             9|          1109.85|
|                474|            13|774.8199999999999|
+-------------------+--------------+-----------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import round
# We can specify custom names to derived fields using toDF
# We can specify custom names to derived fields using toDF
# withColumn can be used to apply functions such as round on aggregated results
order_items_grouped. \
    sum('order_item_quantity', 'order_item_subtotal'). \
    toDF('order_item_order_id', 'order_quantity', 'order_revenue'). \
    withColumn('order_revenue', round('order_revenue', 2)). \
    show(2)

+-------------------+--------------+-------------+
|order_item_order_id|order_quantity|order_revenue|
+-------------------+--------------+-------------+
|                 29|             9|      1109.85|
|                474|            13|       774.82|
+-------------------+--------------+-------------+
only showing top 2 rows



In [0]:
#08 Perform Grouped Aggregations using agg on a Spark Data Frame
order_items_grouped = order_items. \
    groupBy('order_item_order_id')

In [0]:
order_items_grouped. \
    sum('order_item_quantity', 'order_item_subtotal'). \
    show(2)

## note: above we get sum of order_item_quantity and sum of order_item_subtotal in there column grouped by order_item_order_id
order_items_grouped. \
    sum('order_item_quantity', 'order_item_subtotal').printSchema

+-------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+
|                 29|                       9|                 1109.85|
|                474|                      13|       774.8199999999999|
+-------------------+------------------------+------------------------+
only showing top 2 rows

Out[51]: <bound method DataFrame.printSchema of DataFrame[order_item_order_id: bigint, sum(order_item_quantity): bigint, sum(order_item_subtotal): double]>

In [0]:
# agg(*exprs) method of pyspark.sql.group.GroupedData instance
#     Compute aggregates and returns the result as a :class:`DataFrame`.

In [0]:
## grouping by using agg  on spark dataFrame
## only agg function can be leverage as part of agg 
## we want to get min , max ,sum,avg on sum of order_iter_order_id 

In [0]:
from pyspark.sql.functions import sum
order_items_grouped. \
    agg(sum('order_item_quantity'), sum('order_item_subtotal')). \
    printSchema

## here sum is function from sql 
order_items_grouped. \
    agg(sum('order_item_quantity'), sum('order_item_subtotal')).show(2)

# note: agg is prefered approch 
from pyspark.sql.functions import round

order_items_grouped. \
    agg(sum('order_item_quantity').alias('order_quanity'), round(sum('order_item_subtotal'), 2).alias('order_revenue')).show(2)

## here we are rounding off the data with columns 


[0;36m  File [0;32m"<command-1564495909092769>"[0;36m, line [0;32m13[0m
[0;31m    agg(round(sum('order_item_quantity')),round( sum('order_item_subtotal')).show(2)[0m
[0m                                                                                    
^[0m
[0;31mSyntaxError[0m[0;31m:[0m unexpected EOF while parsing


In [0]:
from pyspark.sql.functions import min,col
## note: agg is prefered method for aggregation 
order_items_grouped.agg({'order_item_quantity': 'sum', 'order_item_quantity': 'min'}).show(2)

## sending the dictionary to aggregate methods  where key is column and value is aggregate function 
order_items_grouped. \
    agg(sum(col('order_item_quantity')).alias('order_quanity'), min(col('order_item_quantity')).alias('min_order_quantity'),
        round(sum('order_item_subtotal'), 2).alias('order_revenue'),
        min('order_item_subtotal').alias('min_order_item_subtotal')
    ). \
    show()

+-------------------+------------------------+
|order_item_order_id|min(order_item_quantity)|
+-------------------+------------------------+
|                 29|                       1|
|                474|                       1|
+-------------------+------------------------+
only showing top 2 rows

+-------------------+-------------+------------------+-------------+-----------------------+
|order_item_order_id|order_quanity|min_order_quantity|order_revenue|min_order_item_subtotal|
+-------------------+-------------+------------------+-------------+-----------------------+
|                 29|            9|                 1|      1109.85|                 129.99|
|                474|           13|                 1|       774.82|                  24.99|
|                964|           11|                 1|       739.88|                 129.99|
|               1677|           14|                 1|       649.92|                   50.0|
|               1806|            8|       