In [1]:
from pyspark.sql import SparkSession
import getpass
# Look up the current OS user; it is used to build a per-user warehouse path.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
# NOTE(review): UI port '0' presumably lets Spark pick a free port - confirm
# against the Spark configuration docs.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load the orders CSV: the first row is a header and column types are inferred.
# NOTE(review): the path is pinned to one specific user's directory - confirm
# it is readable by whoever runs this notebook.
orders_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/user/itv019463/TrendyTechBigData/Week5/data/ordersWithHeader.csv")
)

In [3]:
# Preview the first 5 rows of the loaded orders data as a sanity check.
orders_df.show(5)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
+--------+--------------------+-----------+---------------+
only showing top 5 rows



In [4]:
# Print the inferred column names and types for the orders DataFrame.
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [5]:
# Register the DataFrame under the name "orders" so the spark.sql(...) cells
# below can query it as a table.
orders_df.createOrReplaceTempView("orders")

# Higher-Level APIs Demo

##### 1. Top 15 customers who placed the most number of orders

In [6]:
# DataFrame API: count orders per customer and keep the 15 largest counts.
result = (
    orders_df
    .groupBy("customer_id")
    .count()
    .sort("count", ascending=False)
    .limit(15)
)

In [7]:
# Display the per-customer order counts built in the previous cell.
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



In [8]:
# Spark SQL equivalent of the DataFrame query above, run against the
# "orders" temp view. Adjacent string literals are joined into one statement.
df = spark.sql(
    "select customer_id, count(*) as num_orders "
    "from orders "
    "group by customer_id "
    "order by count(*) desc "
    "limit 15"
)

In [9]:
# Display the SQL result so it can be compared with the DataFrame API output.
df.show()

+-----------+----------+
|customer_id|num_orders|
+-----------+----------+
|       5897|        16|
|      12431|        16|
|        569|        16|
|       6316|        16|
|      12284|        15|
|       4320|        15|
|       5624|        15|
|       5283|        15|
|        221|        15|
|       5654|        15|
|       6248|        14|
|       3708|        14|
|       1011|        14|
|       8652|        14|
|       4517|        14|
+-----------+----------+



##### 2. No. of orders under each order status

In [10]:
# DataFrame API: number of orders for each order_status value.
result = (
    orders_df
    .groupBy("order_status")
    .count()
)

In [11]:
# Display the per-status order counts built in the previous cell.
result.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



In [12]:
# Spark SQL equivalent: orders per status, queried from the "orders" temp view.
df = spark.sql(
    "select order_status, count(*) as num_orders "
    "from orders "
    "group by order_status"
)

In [13]:
# Display the SQL per-status counts for comparison with the DataFrame result.
df.show()

+---------------+----------+
|   order_status|num_orders|
+---------------+----------+
|PENDING_PAYMENT|     15030|
|       COMPLETE|     22899|
|        ON_HOLD|      3798|
| PAYMENT_REVIEW|       729|
|     PROCESSING|      8275|
|         CLOSED|      7556|
|SUSPECTED_FRAUD|      1558|
|        PENDING|      7610|
|       CANCELED|      1428|
+---------------+----------+



##### 3. No. of active customers (who placed at least one order)

In [14]:
# DataFrame API: number of distinct customer_id values in the orders data.
# The trailing count() is an action, so `result` holds a number here rather
# than a DataFrame.
result = (
    orders_df
    .select("customer_id")
    .distinct()
    .count()
)

In [15]:
# Bare expression: the notebook echoes the distinct-customer count as output.
result

12405

In [18]:
# Spark SQL equivalent: distinct customers, returned as a one-row DataFrame.
df = spark.sql(
    "select count(distinct customer_id) as count_cust "
    "from orders"
)

In [19]:
# Display the one-row SQL result holding the distinct customer count.
df.show()

+----------+
|count_cust|
+----------+
|     12405|
+----------+



##### 4. Customers with the most CLOSED orders

In [20]:
# DataFrame API: keep only CLOSED orders, count them per customer, and take
# the 10 customers with the highest counts.
result = (
    orders_df
    .select("customer_id", "order_status")
    .where("order_status = 'CLOSED'")
    .groupBy("customer_id")
    .count()
    .sort("count", ascending=False)
    .limit(10)
)

In [21]:
# Display the per-customer CLOSED-order counts built in the previous cell.
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       2403|    4|
|       7850|    4|
|      10018|    4|
|       2236|    4|
|       2768|    4|
|      10111|    4|
+-----------+-----+



In [24]:
# Spark SQL equivalent: top 10 customers by number of CLOSED orders.
# The count column has no alias, so it keeps the engine-generated name
# (shown as count(1) in the output), unlike the aliased queries above.
df = spark.sql(
    "select customer_id, count(*) "
    "from orders "
    "where order_status = 'CLOSED' "
    "group by customer_id "
    "order by count(*) desc "
    "limit 10"
)

In [25]:
# Display the SQL result for comparison with the DataFrame API output.
df.show()

+-----------+--------+
|customer_id|count(1)|
+-----------+--------+
|       1833|       6|
|       1363|       5|
|       1687|       5|
|       5493|       5|
|       2403|       4|
|       7850|       4|
|      10018|       4|
|       2236|       4|
|       2768|       4|
|      10111|       4|
+-----------+--------+

