In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession used by every cell below, registered under the
# application name "RDD basics".
spark = (
    SparkSession.builder
    .appName("RDD basics")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Load the orders CSV as an RDD of raw text lines (one string per line).
# The path is relative to the notebook's working directory.
rdd = spark.sparkContext.textFile("./dataset/orders_sh.csv")

In [4]:
# Total number of lines read from the orders file.
rdd.count()

68883

In [5]:
# Peek at the first five raw lines to see the comma-separated record layout.
rdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

### 1. Số lượng orders ứng với mỗi status

In [6]:
# Key every order by its status (field index 3 of the comma-separated line)
# and attach a count of 1 so the pairs can be summed per status afterwards.
status_rdd = (
    rdd
    .map(lambda line: line.split(",")[3])
    .map(lambda status: (status, 1))
)
status_rdd.take(5)

[('CLOSED', 1),
 ('PENDING_PAYMENT', 1),
 ('COMPLETE', 1),
 ('CLOSED', 1),
 ('COMPLETE', 1)]

In [7]:
# Add up the per-order 1s for each status key -> (status, number of orders).
ordersEachStatus_rdd = status_rdd.reduceByKey(
    lambda running_total, one: running_total + one
)
# One row per distinct status, so collecting the whole result is cheap.
ordersEachStatus_rdd.collect()

[('CLOSED', 7556),
 ('CANCELED', 1428),
 ('PENDING_PAYMENT', 15030),
 ('COMPLETE', 22899),
 ('PROCESSING', 8275),
 ('PAYMENT_REVIEW', 729),
 ('PENDING', 7610),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558)]

In [8]:
# Order the (status, count) pairs by count, smallest first.
ordersEachStatus_rdd.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=True,
).collect()

[('PAYMENT_REVIEW', 729),
 ('CANCELED', 1428),
 ('SUSPECTED_FRAUD', 1558),
 ('ON_HOLD', 3798),
 ('CLOSED', 7556),
 ('PENDING', 7610),
 ('PROCESSING', 8275),
 ('PENDING_PAYMENT', 15030),
 ('COMPLETE', 22899)]

In [9]:
# Same ordering as above but reversed: the status with the most orders first.
ordersEachStatus_rdd.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=False,
).collect()

[('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('PROCESSING', 8275),
 ('PENDING', 7610),
 ('CLOSED', 7556),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558),
 ('CANCELED', 1428),
 ('PAYMENT_REVIEW', 729)]

### 2. Top 10 khách hàng có số lượng orders nhiều nhất

In [10]:
# Key every order by its customer id (field index 2, kept as a string)
# and attach a count of 1 for the per-customer sum that follows.
cust_rdd = (
    rdd
    .map(lambda line: line.split(",")[2])
    .map(lambda customer_id: (customer_id, 1))
)
cust_rdd.take(5)

[('11599', 1), ('256', 1), ('12111', 1), ('8827', 1), ('11318', 1)]

In [11]:
# Sum the 1s per customer id -> (customer id, number of orders).
ordersEachCust_rdd = cust_rdd.reduceByKey(
    lambda running_total, one: running_total + one
)
ordersEachCust_rdd.take(5)

[('256', 10), ('12111', 6), ('11318', 6), ('7130', 7), ('2911', 6)]

In [12]:
# Rank customers by order count, highest first, and keep the first ten.
# Customers with equal counts are not ordered by any secondary key.
ordersEachCust_rdd.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=False,
).take(10)

[('5897', 16),
 ('6316', 16),
 ('12431', 16),
 ('569', 16),
 ('4320', 15),
 ('221', 15),
 ('5624', 15),
 ('5283', 15),
 ('12284', 15),
 ('5654', 15)]

### 3. Tổng số lượng khách hàng hiện tại là bao nhiêu?

In [13]:
# One customer id (field index 2) per order line; ids repeat for customers
# with several orders, so this count equals the number of order lines.
duplicateCust_rdd = rdd.map(
    lambda line: line.split(",")[2]
)
duplicateCust_rdd.count()

68883

In [14]:
# Drop repeated customer ids so each customer is counted exactly once;
# the count below is the number of distinct customers that placed an order.
distinctCust_rdd = duplicateCust_rdd.distinct()
distinctCust_rdd.count()

12405

### 4. Khách hàng nào có số lượng orders "CLOSED" nhiều nhất?

In [15]:
# Keep only the order lines whose status (field index 3) is exactly "CLOSED".
closedOrders_rdd = rdd.filter(
    lambda line: line.split(",")[3] == "CLOSED"
)
closedOrders_rdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '12,2013-07-25 00:00:00.0,1837,CLOSED',
 '18,2013-07-25 00:00:00.0,1205,CLOSED',
 '24,2013-07-25 00:00:00.0,11441,CLOSED']

In [16]:
# Key each CLOSED order by its customer id (field index 2) with a count of 1.
custClosed_rdd = (
    closedOrders_rdd
    .map(lambda line: line.split(",")[2])
    .map(lambda customer_id: (customer_id, 1))
)
custClosed_rdd.take(5)

[('11599', 1), ('8827', 1), ('1837', 1), ('1205', 1), ('11441', 1)]

In [17]:
# Sum the 1s per customer id -> (customer id, number of CLOSED orders).
closedOrdersEachCust_rdd = custClosed_rdd.reduceByKey(
    lambda running_total, one: running_total + one
)
closedOrdersEachCust_rdd.take(5)

[('5863', 1), ('12271', 2), ('7073', 1), ('3065', 2), ('5116', 2)]

In [18]:
# Rank customers by their number of CLOSED orders, highest first; the first
# element answers the question, the next four are shown for context.
closedOrdersEachCust_rdd.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=False,
).take(5)

[('1833', 6), ('1363', 5), ('1687', 5), ('5493', 5), ('5011', 4)]