In [1]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the SparkSession for this notebook; getOrCreate() returns an
# already-running session if one exists, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName("DataFrame and TempView Practice")
    .getOrCreate()
)

In [3]:
# Load the orders CSV. The first line is treated as the header and
# inferSchema=True asks Spark to derive the column types from the data.
orders_path = "./dataset/orders_wh.csv"
df = spark.read.options(header=True, inferSchema=True).csv(orders_path)

# Preview the first five rows.
df.show(n=5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [4]:
# Register the DataFrame as the temporary view "orders_tmp" so the
# spark.sql(...) cells below can query it by name.
df.createOrReplaceTempView(name="orders_tmp")

### 1. Số lượng orders ứng với mỗi status

In [5]:
# DataFrame API: count the orders in each order_status group.
orders_per_status = df.groupBy("order_status").count()
orders_per_status.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



In [6]:
# Spark SQL version of the per-status count, run against the orders_tmp view.
status_count_sql = "SELECT order_status, count(*) orders_each_status FROM orders_tmp GROUP BY order_status"
spark.sql(status_count_sql).show()

+---------------+------------------+
|   order_status|orders_each_status|
+---------------+------------------+
|PENDING_PAYMENT|             15030|
|       COMPLETE|             22899|
|        ON_HOLD|              3798|
| PAYMENT_REVIEW|               729|
|     PROCESSING|              8275|
|         CLOSED|              7556|
|SUSPECTED_FRAUD|              1558|
|        PENDING|              7610|
|       CANCELED|              1428|
+---------------+------------------+



### 2. Top 10 khách hàng có số lượng orders nhiều nhất?

In [7]:
# DataFrame API: the 10 customers with the highest number of orders.
orders_per_customer = df.groupBy("customer_id").count()

# NOTE(review): rows are ordered by "count" only, so customers tied at the
# cut-off have no secondary sort key; consider adding customer_id as a
# tie-breaker if a reproducible ordering matters.
df_top10 = orders_per_customer.orderBy("count", ascending=False).limit(10)
df_top10.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       5624|   15|
|       4320|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
+-----------+-----+



In [8]:
# Spark SQL version of the top-10 customers by order count.
top_customers_sql = "SELECT customer_id, count(*) count FROM orders_tmp GROUP BY customer_id ORDER BY count DESC LIMIT 10"
spark.sql(top_customers_sql).show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       5624|   15|
|       4320|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
+-----------+-----+



### 3. Tổng số lượng khách hàng hiện tại là bao nhiêu?

In [9]:
# DataFrame API: number of distinct customer_id values present in the orders.
unique_customers = df.select("customer_id").distinct()
unique_customers.count()

12405

In [10]:
# Spark SQL version of the distinct-customer count.
customer_count_sql = "SELECT COUNT(DISTINCT customer_id) AS customer_count FROM orders_tmp"
spark.sql(customer_count_sql).show()

+--------------+
|customer_count|
+--------------+
|         12405|
+--------------+



### 4. Khách hàng nào có số lượng orders "CLOSED" nhiều nhất?

In [11]:
# DataFrame API: customers ranked by their number of CLOSED orders.
closed_orders = df.where("order_status = 'CLOSED'")

# Despite its name, df_top1 keeps the five highest counts (limit(5)), so the
# runner-up customers are visible next to the leader.
df_top1 = (
    closed_orders
    .groupBy("customer_id")
    .count()
    .orderBy("count", ascending=False)
    .limit(5)
)
df_top1.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|      12431|    4|
+-----------+-----+



In [12]:
# Spark SQL version: the five customers with the most CLOSED orders,
# queried from the orders_tmp view.
closed_top_sql = """
          SELECT customer_id, count(*) count FROM orders_tmp 
          WHERE order_status = 'CLOSED' 
          GROUP BY customer_id 
          ORDER BY count DESC
          LIMIT(5)
          """
spark.sql(closed_top_sql).show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|      12431|    4|
+-----------+-----+

