In [1]:
#Spark Imports and Get Context

#https://spark.apache.org/docs/latest/sql-getting-started.html
from pyspark import SparkContext, SparkConf, SQLContext
from os import getcwd

# Application name shown in the Spark UI / logs for this notebook.
conf = SparkConf().setAppName('SparkS1')
# Use the classmethod so an already-running context in this kernel is reused.
# Constructing SparkContext(conf=conf) directly raises
# "Cannot run multiple SparkContexts at once" when this cell is re-run.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used by the later cells (spark.read, spark.createDataFrame).
spark = SQLContext.getOrCreate(sc)

In [2]:
#Taking RDD from Classic Models Example

#Header - orderNumber,productCode,quantityOrdered,priceEach,orderLineNumber
orderDetailRDD = sc.textFile('../datasets/classicmodels/orderdetail.csv')
#orderDetailRDD.take(5)

#Removing Header: grab the first line and drop every line equal to it
header = orderDetailRDD.first()
ODRDDWh = orderDetailRDD.filter(lambda line: line != header)
#ODRDDWh.take(5)  - only the last expression of a cell is displayed, so a
#mid-cell take() just runs a Spark job whose result is thrown away

#Split each remaining line into a list of string fields.
#NOTE: a plain split(',') does not handle quoted fields containing commas;
#it is fine for the rows shown below, but use spark.read.csv for general CSV.
ODRDDWhA = ODRDDWh.map(lambda line: line.split(','))
ODRDDWhA.take(5)

[['10100', 'S18_1749', '30', '136', '3'],
 ['10100', 'S18_2248', '50', '55.09', '2'],
 ['10100', 'S18_4409', '22', '75.46', '4'],
 ['10100', 'S24_3969', '49', '35.29', '1'],
 ['10101', 'S18_2325', '25', '108.06', '4']]

In [3]:
#Data Frame Examples - Creating DF from RDD

from pyspark.sql import Row

def _orderDetailRow(fields):
    """Turn one split CSV record (list of strings) into a Row.

    Qty and OrderLineNumber are cast to int, UnitPrice to float;
    OrderNumber and ProductCode are kept as the original strings.
    """
    return Row(
        OrderNumber=fields[0],
        ProductCode=fields[1],
        Qty=int(fields[2]),
        UnitPrice=float(fields[3]),
        OrderLineNumber=int(fields[4]),
    )

ODRowRDD = ODRDDWhA.map(_orderDetailRow)

#ODRowRDD.take(5)

#The DataFrame schema is derived from the Row field names and value types
ODDataFrame = spark.createDataFrame(ODRowRDD)
ODDataFrame.show(5)

+-----------+-----------+---+---------+---------------+
|OrderNumber|ProductCode|Qty|UnitPrice|OrderLineNumber|
+-----------+-----------+---+---------+---------------+
|      10100|   S18_1749| 30|    136.0|              3|
|      10100|   S18_2248| 50|    55.09|              2|
|      10100|   S18_4409| 22|    75.46|              4|
|      10100|   S24_3969| 49|    35.29|              1|
|      10101|   S18_2325| 25|   108.06|              4|
+-----------+-----------+---+---------+---------------+
only showing top 5 rows



In [4]:
#Creating DF with spark.read.format
#Both reads take column names from the header line and ask Spark to infer the column types.

#Orderdetails table with inferSchema option
ODDataFrame1 = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('../datasets/classicmodels/orderdetail.csv')
)
ODDataFrame1.show(5)

#Order table with inferSchema
ODataFrame1 = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('../datasets/classicmodels/order.csv')
)
ODataFrame1.show(5)

+-----------+-----------+---------------+---------+---------------+
|orderNumber|productCode|quantityOrdered|priceEach|orderLineNumber|
+-----------+-----------+---------------+---------+---------------+
|      10100|   S18_1749|             30|    136.0|              3|
|      10100|   S18_2248|             50|    55.09|              2|
|      10100|   S18_4409|             22|    75.46|              4|
|      10100|   S24_3969|             49|    35.29|              1|
|      10101|   S18_2325|             25|   108.06|              4|
+-----------+-----------+---------------+---------+---------------+
only showing top 5 rows

+-----------+----------+------------+-----------+-------+--------------------+--------------+
|orderNumber| orderDate|requiredDate|shippedDate| status|            comments|customerNumber|
+-----------+----------+------------+-----------+-------+--------------------+--------------+
|      10100|2003-01-06|  2003-01-13| 2003-01-10|Shipped|                null|   

In [6]:
#Using Schema option

from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType

#Explicit schema for order.csv as (column name, Spark type) pairs, in file column order
_orderFields = [
    ('orderNumber', IntegerType()),
    ('orderDate', TimestampType()),
    ('requiredDate', TimestampType()),
    ('shippedDate', TimestampType()),
    ('status', StringType()),
    ('comments', StringType()),
    ('customerNumber', IntegerType()),
]
orderSchema = StructType(
    [StructField(fieldName, fieldType) for fieldName, fieldType in _orderFields]
)

#Read with the explicit schema instead of inferSchema
ODataFrameS = (
    spark.read.format('csv')
    .option('header', 'true')
    .schema(orderSchema)
    .load('../datasets/classicmodels/order.csv')
)
ODataFrameS.show(5)

+-----------+-------------------+-------------------+-------------------+-------+--------------------+--------------+
|orderNumber|          orderDate|       requiredDate|        shippedDate| status|            comments|customerNumber|
+-----------+-------------------+-------------------+-------------------+-------+--------------------+--------------+
|      10100|2003-01-06 00:00:00|2003-01-13 00:00:00|2003-01-10 00:00:00|Shipped|                null|           363|
|      10101|2003-01-09 00:00:00|2003-01-18 00:00:00|2003-01-11 00:00:00|Shipped|Check on availabi...|           128|
|      10102|2003-01-10 00:00:00|2003-01-18 00:00:00|2003-01-14 00:00:00|Shipped|                null|           181|
|      10103|2003-01-29 00:00:00|2003-02-07 00:00:00|2003-02-02 00:00:00|Shipped|                null|           121|
|      10104|2003-01-31 00:00:00|2003-02-09 00:00:00|2003-02-01 00:00:00|Shipped|                null|           141|
+-----------+-------------------+-------------------+---

In [8]:
#Handling nulls with the read.csv method; .option('nullValue','null') on spark.read.format('csv') can also be used

ODataFrame2 = spark.read.csv(
    '../datasets/classicmodels/order.csv',
    schema=orderSchema,
    header=True,
    nullValue='null',
)

##for help on the reader and its options
#help(spark.read)

#ODataFrame2.show(5)


#Replace nulls in the comments column with the placeholder string 'NA'
ODataFrameNA = ODataFrame2.fillna('NA', subset=['comments'])
ODataFrameNA.show(5)

+-----------+-------------------+-------------------+-------------------+-------+--------------------+--------------+
|orderNumber|          orderDate|       requiredDate|        shippedDate| status|            comments|customerNumber|
+-----------+-------------------+-------------------+-------------------+-------+--------------------+--------------+
|      10100|2003-01-06 00:00:00|2003-01-13 00:00:00|2003-01-10 00:00:00|Shipped|                  NA|           363|
|      10101|2003-01-09 00:00:00|2003-01-18 00:00:00|2003-01-11 00:00:00|Shipped|Check on availabi...|           128|
|      10102|2003-01-10 00:00:00|2003-01-18 00:00:00|2003-01-14 00:00:00|Shipped|                  NA|           181|
|      10103|2003-01-29 00:00:00|2003-02-07 00:00:00|2003-02-02 00:00:00|Shipped|                  NA|           121|
|      10104|2003-01-31 00:00:00|2003-02-09 00:00:00|2003-02-01 00:00:00|Shipped|                  NA|           141|
+-----------+-------------------+-------------------+---

In [9]:
#Operations

#Drop - remove the columns that are not needed in the summary
ODataFrameDrop = ODataFrameNA.drop('requiredDate', 'shippedDate', 'comments')
#ODataFrameDrop.show(5)

#Join - inner join of orders with their order-detail lines on orderNumber
#ODDataFrame1.show(5)

OSummary = ODataFrameDrop.join(ODDataFrame1, on='orderNumber', how='inner')
OSummary.show(5)

#Add column - totalAmount is quantityOrdered multiplied by priceEach for each line
lineTotal = OSummary['quantityOrdered'] * OSummary['priceEach']
OSumTotalAmount = OSummary.withColumn('totalAmount', lineTotal)

OSumTotalAmount.show(5)

+-----------+-------------------+-------+--------------+-----------+---------------+---------+---------------+
|orderNumber|          orderDate| status|customerNumber|productCode|quantityOrdered|priceEach|orderLineNumber|
+-----------+-------------------+-------+--------------+-----------+---------------+---------+---------------+
|      10100|2003-01-06 00:00:00|Shipped|           363|   S18_1749|             30|    136.0|              3|
|      10100|2003-01-06 00:00:00|Shipped|           363|   S18_2248|             50|    55.09|              2|
|      10100|2003-01-06 00:00:00|Shipped|           363|   S18_4409|             22|    75.46|              4|
|      10100|2003-01-06 00:00:00|Shipped|           363|   S24_3969|             49|    35.29|              1|
|      10101|2003-01-09 00:00:00|Shipped|           128|   S18_2325|             25|   108.06|              4|
+-----------+-------------------+-------+--------------+-----------+---------------+---------+---------------+
o

In [10]:
#Find the total sale by Product Code, Date - sorted by Product Code, then descending OrderDate
#NOTE: this import rebinds sum and round to the Spark column functions, hiding
#Python's built-in sum/round in this namespace from here on.
from pyspark.sql.functions import sum,round,col

#Group only by the keys named in the goal. Including quantityOrdered in the
#grouping keys would split a product/date group whenever quantities differ,
#so the totals would no longer be per product and date.
productSummaryG = OSumTotalAmount.groupBy('productCode','orderDate')
#help(productSummaryG) -Grouped Data

#quantityOrdered is kept in the result as the summed quantity of the group,
#so the output still has the same four columns in the same order.
productSummaryA = productSummaryG.agg(
    sum('quantityOrdered').alias('quantityOrdered'),
    round(sum('totalAmount'),2).alias('totalSales'),
)
#type(productSummaryA) - DataFrame
#productSummaryA.show(5)

#Ascending productCode, then newest orderDate first within each product
productSummaryS = productSummaryA.orderBy('productCode',col('orderDate').desc())
productSummaryS.show(5)

+-----------+-------------------+---------------+----------+
|productCode|          orderDate|quantityOrdered|totalSales|
+-----------+-------------------+---------------+----------+
|   S10_1678|2005-05-13 00:00:00|             66|   5242.38|
|   S10_1678|2005-04-08 00:00:00|             24|   2044.08|
|   S10_1678|2005-04-01 00:00:00|             40|    3100.8|
|   S10_1678|2005-03-03 00:00:00|             42|   3376.38|
|   S10_1678|2005-02-03 00:00:00|             21|   1607.76|
+-----------+-------------------+---------------+----------+
only showing top 5 rows

