In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# NOTE: the wildcard import below shadows Python builtins such as sum() and round();
# prefer the explicit F.<name> form in new code.
from pyspark.sql.functions import *

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Aggregate Func")
    .getOrCreate()
)

In [3]:
# Load the order lines; the first CSV row holds the column names and the
# column types are inferred from the data.
order_df = spark.read.csv(
    path="./dataset/order_data.csv",
    header=True,
    inferSchema=True,
)
# Preview the first five rows.
order_df.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     NULL|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     NULL|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [4]:
# Register the DataFrame as the temp view "order_tmp" so the spark.sql(...) cells can query it.
order_df.createOrReplaceTempView("order_tmp")

In [5]:
# Number of partitions of the RDD that backs order_df.
order_df.rdd.getNumPartitions()

8

In [6]:
# Print each column's name, inferred type and nullability.
order_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



### 1. Tính tổng số dòng

In [7]:
# Row count as a one-row DataFrame: select() builds the DataFrame, show() prints it
# (show() itself returns None).
order_df.select(F.count("*").alias("total_rows")).show()

+----------+
|total_rows|
+----------+
|    541782|
+----------+



In [8]:
# Same row count through the count() action, which hands back a plain Python int.
order_df.count()  # returns int

541782

### 2. Tính số hóa đơn

In [9]:
# Number of distinct invoice numbers as a one-row DataFrame; show() prints it and returns None.
order_df.select(
    F.countDistinct(order_df.InvoiceNo).alias("total_unique_invoices")
).show()

+---------------------+
|total_unique_invoices|
+---------------------+
|                25858|
+---------------------+



In [10]:
# Same figure as a Python int: project the invoice column, drop duplicates, count rows.
# Note: unlike count(DISTINCT ...), a NULL InvoiceNo would be counted here as one distinct value.
(
    order_df
    .select("InvoiceNo")
    .distinct()
    .count()
)

25858

### 3. Tính tổng số lượng hàng bán & giá bán trung bình

In [11]:
# Total units sold and mean unit price (rounded to 2 decimals) over the whole dataset.
# F.sum / F.round are Spark column functions, not the Python builtins of the same name.
order_df.select(
    F.sum(order_df.Quantity).alias("total_quantity"),
    F.round(F.avg(order_df.UnitPrice), 2).alias("avg_price"),
).show()

+--------------+---------+
|total_quantity|avg_price|
+--------------+---------+
|       5175855|     4.61|
+--------------+---------+



In [12]:
# Same aggregation written as SQL expression strings.
order_df.selectExpr(
    "sum(Quantity) as total_quantity",
    "round(avg(UnitPrice), 2) as avg_price",
).show()

+--------------+---------+
|total_quantity|avg_price|
+--------------+---------+
|       5175855|     4.61|
+--------------+---------+



In [13]:
# Same aggregation as a SQL query against the "order_tmp" temp view.
spark.sql("select sum(Quantity) total_quantity, round(avg(UnitPrice),2) avg_price from order_tmp").show()

+--------------+---------+
|total_quantity|avg_price|
+--------------+---------+
|       5175855|     4.61|
+--------------+---------+



### 4. Tổng số lượng hàng hóa & tổng doanh thu theo quốc gia

In [14]:
# Per-country totals: units sold and amount (Quantity * UnitPrice summed, rounded to 1 decimal).
# groupBy().agg() already returns the grouping column next to the aggregates, so no
# separate select() is needed.
# F.sum / F.round are Spark column functions, not the Python builtins of the same name.
order_df.groupBy(order_df.Country).agg(
    F.sum(order_df.Quantity).alias("total_quantity"),
    F.round(F.sum(order_df.Quantity * order_df.UnitPrice), 1).alias("total_amount"),
).show()

+---------------+--------------+------------+
|        Country|total_quantity|total_amount|
+---------------+--------------+------------+
|         Sweden|         35637|     36595.9|
|      Singapore|          5234|      9120.4|
|        Germany|        117448|    221698.2|
|         France|        110031|    196548.0|
|         Greece|          1556|      4710.5|
|        Belgium|         23152|     40911.0|
|        Finland|         10666|     22326.7|
|          Italy|          7999|     16890.5|
|           EIRE|        142637|    263276.8|
|      Lithuania|           652|      1661.1|
|         Norway|         19247|     35163.5|
|          Spain|         26824|     54774.6|
|        Denmark|          8188|     18768.1|
|      Hong Kong|          4769|     10117.0|
|        Iceland|          2458|      4310.0|
|         Israel|          4353|      7907.8|
|Channel Islands|          9479|     20086.3|
|         Cyprus|          6317|     12946.3|
|    Switzerland|         30325|  

In [15]:
# Same per-country totals, with each aggregate written as a SQL expression string.
order_df.groupBy(order_df.Country).agg(
    F.expr("sum(Quantity) as total_quantity"),
    F.expr("round(sum(Quantity * UnitPrice), 1) as total_amount"),
).show()

+---------------+--------------+------------+
|        Country|total_quantity|total_amount|
+---------------+--------------+------------+
|         Sweden|         35637|     36595.9|
|      Singapore|          5234|      9120.4|
|        Germany|        117448|    221698.2|
|         France|        110031|    196548.0|
|         Greece|          1556|      4710.5|
|        Belgium|         23152|     40911.0|
|        Finland|         10666|     22326.7|
|          Italy|          7999|     16890.5|
|           EIRE|        142637|    263276.8|
|      Lithuania|           652|      1661.1|
|         Norway|         19247|     35163.5|
|          Spain|         26824|     54774.6|
|        Denmark|          8188|     18768.1|
|      Hong Kong|          4769|     10117.0|
|        Iceland|          2458|      4310.0|
|         Israel|          4353|      7907.8|
|Channel Islands|          9479|     20086.3|
|         Cyprus|          6317|     12946.3|
|    Switzerland|         30325|  

In [16]:
# Same per-country totals as a SQL GROUP BY query against the "order_tmp" temp view.
spark.sql(
    """
        select Country, sum(Quantity) as total_quantity, round(sum(Quantity * UnitPrice), 1) as total_amount
        from order_tmp
        group by Country
    """
).show()

+---------------+--------------+------------+
|        Country|total_quantity|total_amount|
+---------------+--------------+------------+
|         Sweden|         35637|     36595.9|
|      Singapore|          5234|      9120.4|
|        Germany|        117448|    221698.2|
|         France|        110031|    196548.0|
|         Greece|          1556|      4710.5|
|        Belgium|         23152|     40911.0|
|        Finland|         10666|     22326.7|
|          Italy|          7999|     16890.5|
|           EIRE|        142637|    263276.8|
|      Lithuania|           652|      1661.1|
|         Norway|         19247|     35163.5|
|          Spain|         26824|     54774.6|
|        Denmark|          8188|     18768.1|
|      Hong Kong|          4769|     10117.0|
|        Iceland|          2458|      4310.0|
|         Israel|          4353|      7907.8|
|Channel Islands|          9479|     20086.3|
|         Cyprus|          6317|     12946.3|
|    Switzerland|         30325|  