In [None]:
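# Adapted from the "total-spent-by-customer-sorted.py" example in
# Frank Kane's "Taming Big Data with Apache Spark and Python":
# https://github.com/PacktPublishing/Frank-Kanes-Taming-Big-Data-with-Apache-Spark-and-Python/blob/master/total-spent-by-customer-sorted.py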

In [1]:
from pyspark import SparkConf, SparkContext

# Run Spark in local mode under a recognisable application name.
conf = SparkConf().setMaster("local").setAppName("SpendByCustomerSorted")
# getOrCreate() reuses the active SparkContext if one already exists, so
# re-running this cell no longer fails with "Cannot run multiple SparkContexts
# at once". On a fresh kernel it builds a new context from `conf`, exactly as
# SparkContext(conf=conf) would. When a context already exists, `conf` is not
# re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
def extractCustomerPricePairs(line):
    """Turn one comma-separated order line into a (customer, price) pair.

    The line is split on commas; the first field is parsed as an integer
    and the third field as a float. The second field is ignored
    (presumably an item id -- confirm against the data source).

    Args:
        line: A comma-separated string with at least three fields,
            e.g. ``'44,8602,37.19'``.

    Returns:
        A tuple ``(int(first_field), float(third_field))``.

    Raises:
        ValueError: If the first field is not an integer literal or the
            third field is not a float literal.
        IndexError: If the line has fewer than three fields.
    """
    columns = line.split(',')
    customer_id = int(columns[0])
    amount_spent = float(columns[2])
    return (customer_id, amount_spent)

In [3]:
# Load the raw order file as an RDD of text lines (one element per line).
csv_input = sc.textFile(name="./data/customer-orders.csv")
# Peek at the first few raw lines to sanity-check the format.
csv_input.take(num=5)

['44,8602,37.19',
 '35,5368,65.89',
 '2,3391,40.64',
 '47,6694,14.98',
 '29,680,13.08']

In [4]:
# Parse every raw line into the pair produced by extractCustomerPricePairs.
mappedInput = csv_input.map(f=extractCustomerPricePairs)
# Preview the first few parsed pairs.
mappedInput.take(num=5)

[(44, 37.19), (35, 65.89), (2, 40.64), (47, 14.98), (29, 13.08)]

In [5]:
# Sum the values that share a key, giving one (key, total) pair per key.
totalByCustomer = mappedInput.reduceByKey(
    lambda running_total, amount: running_total + amount
)
# Preview the first few totals.
totalByCustomer.take(num=5)

[(44, 4756.8899999999985),
 (35, 5155.419999999999),
 (2, 5994.59),
 (47, 4316.299999999999),
 (29, 5032.529999999999)]

In [6]:
# Swap each (customer, total) pair into (total, customer) so that the total
# becomes the key used for sorting. Python 3 removed tuple-unpacking lambda
# parameters, so the pair is indexed by position rather than written as
# `lambda (x, y): (y, x)`.
flipped = totalByCustomer.map(
    lambda customer_total: (customer_total[1], customer_total[0])
)
# Preview the first few swapped pairs.
flipped.take(num=5)

[(4756.8899999999985, 44),
 (5155.419999999999, 35),
 (5994.59, 2),
 (4316.299999999999, 47),
 (5032.529999999999, 29)]

In [7]:
# Order the (total, customer) pairs by their key, smallest total first
# (ascending is sortByKey's default, spelled out here for the reader).
totalByCustomerSorted = flipped.sortByKey(ascending=True)

In [8]:
# collect() brings the whole sorted RDD back to the driver as a Python list.
results = totalByCustomerSorted.collect()
# Print one (total, customer) pair per line, in sorted order.
for total_and_customer in results:
    print(total_and_customer)

(3309.38, 45)
(3790.570000000001, 79)
(3924.230000000001, 96)
(4042.6499999999987, 23)
(4172.289999999998, 99)
(4178.500000000001, 75)
(4278.049999999997, 36)
(4297.260000000001, 98)
(4316.299999999999, 47)
(4327.729999999999, 77)
(4367.62, 13)
(4384.33, 48)
(4394.599999999999, 49)
(4475.569999999999, 94)
(4505.79, 67)
(4517.27, 50)
(4524.509999999999, 78)
(4561.069999999999, 5)
(4628.4, 57)
(4635.799999999997, 83)
(4642.259999999999, 91)
(4647.129999999999, 74)
(4652.939999999999, 84)
(4659.63, 3)
(4664.589999999998, 12)
(4681.919999999999, 66)
(4701.019999999999, 56)
(4707.41, 21)
(4727.860000000001, 80)
(4735.030000000001, 14)
(4735.200000000002, 37)
(4755.070000000001, 7)
(4756.8899999999985, 44)
(4765.05, 31)
(4812.489999999998, 82)
(4815.050000000002, 4)
(4819.700000000001, 10)
(4830.549999999999, 88)
(4836.859999999999, 20)
(4851.479999999999, 89)
(4876.840000000002, 95)
(4898.460000000002, 38)
(4904.209999999999, 76)
(4908.81, 86)
(4915.889999999999, 27)
(4921.27, 18)
(4945.299