In [1]:
#Spark Imports and Get Context

#https://spark.apache.org/docs/latest/sql-getting-started.html
from pyspark import SparkContext, SparkConf, SQLContext
from os import getcwd

conf = SparkConf().setAppName('SparkS1')

#getOrCreate is a classmethod: call it on the class so an already running
#context is reused. Constructing SparkContext(conf=conf) first always tries to
#start a new context, which fails with "Cannot run multiple SparkContexts at
#once" when this cell is run a second time in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)

#SQL entry point bound to the context above (SQLContext is the legacy API,
#kept here so `spark` stays the same kind of object for the rest of the notebook)
spark = SQLContext.getOrCreate(sc)

In [2]:
#Classic Models example: load the order-detail lines

#Columns: orderNumber,productCode,quantityOrdered,priceEach,orderLineNumber
orderDetailRDD = sc.textFile('../datasets/classicmodels/orderdetail.csv')

#The first line is the column header; keep only the lines that differ from it
header = orderDetailRDD.first()
ODRDDWh = orderDetailRDD.filter(lambda line : line != header)
ODRDDWh.take(5)

['10100,S18_1749,30,136,3',
 '10100,S18_2248,50,55.09,2',
 '10100,S18_4409,22,75.46,4',
 '10100,S24_3969,49,35.29,1',
 '10101,S18_2325,25,108.06,4']

In [3]:
#Turn each CSV line into a list of field strings

ODRDDWhA = ODRDDWh.map(lambda line : line.split(','))
ODRDDWhA.take(5)

[['10100', 'S18_1749', '30', '136', '3'],
 ['10100', 'S18_2248', '50', '55.09', '2'],
 ['10100', 'S18_4409', '22', '75.46', '4'],
 ['10100', 'S24_3969', '49', '35.29', '1'],
 ['10101', 'S18_2325', '25', '108.06', '4']]

In [4]:
#Total sales per product

#Pair each order line with its amount: (productCode, quantityOrdered * priceEach)
ODProdAmt = ODRDDWhA.map(lambda row : (row[1], int(row[2]) * float(row[3])))

#Add up the amounts per product code, then round each total to 2 decimals
ODProdtot = (
    ODProdAmt
    .reduceByKey(lambda left, right : left + right)
    .mapValues(lambda total : round(total, 2))
)
ODProdtot.take(5)

[('S18_2795', 132275.98),
 ('S24_2022', 38449.09),
 ('S18_1342', 102563.52),
 ('S10_4962', 123123.01),
 ('S12_1666', 119085.25)]

In [5]:
#Pair RDD transformations - reduceByKey, groupByKey, aggregateByKey, countByKey

#groupByKey variant of the per-product total: gather every amount recorded
#for a product code, then sum the gathered values and round to 2 decimals.
ODProdtot1 = (
    ODProdAmt
    .groupByKey()
    .mapValues(lambda amounts : round(sum(amounts), 2))
)
ODProdtot1.take(5) #Same totals as the reduceByKey version in In [4]
#ODProdAmt.countByKey()

[('S18_2795', 132275.98),
 ('S24_2022', 38449.09),
 ('S18_1342', 102563.52),
 ('S10_4962', 123123.01),
 ('S12_1666', 119085.25)]

In [7]:
#Accumulator

#Counts order lines with a large quantity. The rows come from ODRDDWhA (the
#split rows built in an earlier cell), so the CSV is not loaded again here:
#nothing in this cell reads the raw-line RDDs, and reloading them only
#launched extra Spark jobs (first() / take(5)) whose results were discarded.

#Created on the driver with a starting value of 0. Re-running this cell
#creates a fresh accumulator, so the count starts from 0 again each time.
highValAcc = sc.accumulator(0)

def highValfunc(qty, threshold=50):
  """Add 1 to highValAcc when qty is at least threshold.

  Args:
    qty: quantity ordered on one order line, already converted to a number.
    threshold: smallest quantity that is counted; the default of 50 keeps
      the original cut-off.

  Returns:
    None. The only effect is the update of the highValAcc accumulator.
  """
  #No print() here: this function is invoked through foreach(), i.e. inside
  #the Spark tasks, so printed values do not show up in the notebook output
  #and only add I/O for every matching row.
  if qty >= threshold:
    highValAcc.add(1)

#Feed every row through the counter. Index 2 of a row is quantityOrdered and
#is still a string after the split, so it is converted to int first.
ODRDDWhA.foreach(lambda row : highValfunc(int(row[2])))

#Read the accumulated count back on the driver
highValAcc.value

139