In [1]:
#Spark Imports and Get Context

#https://spark.apache.org/docs/latest/sql-getting-started.html
from pyspark import SparkContext, SparkConf, SQLContext
from os import getcwd

# Spark configuration for this notebook's application.
conf = SparkConf().setAppName('SparkS1')

# Use the class-level getOrCreate so that re-running this cell reuses the
# already-active context. Instantiating SparkContext(conf=conf) directly
# raises "ValueError: Cannot run multiple SparkContexts at once" on a re-run.
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point used by the rest of the notebook (spark.read / spark.sql).
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession; it is kept so `spark` stays the same type for later cells.
spark = SQLContext.getOrCreate(sc)

In [2]:
#Code from 4_DataFrame

from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType

# Explicit schema applied when reading order.csv.
# Fields are added in file-column order; each is nullable (the add() default,
# matching the StructField default).
orderSchema = (
    StructType()
    .add('orderNumber', IntegerType())
    .add('orderDate', TimestampType())
    .add('requiredDate', TimestampType())
    .add('shippedDate', TimestampType())
    .add('status', StringType())
    .add('comments', StringType())
    .add('customerNumber', IntegerType())
)

In [3]:
#Code from 4_DataFrame book

###########Order Summary#######################################

# Load the orders file using the explicit orderSchema. The first line is a
# header row, and the literal text 'null' is read as a missing value.
ODataFrame1 = (
    spark.read
    .schema(orderSchema)
    .option('header', True)
    .option('nullValue', 'null')
    .csv('../datasets/classicmodels/order.csv')
)

ODataFrame1.show(5)

# Load the order-detail file, letting Spark infer the column types.
# The first line is a header row; the literal text 'null' is read as missing.
# (An earlier version of this cell loaded '/FileStore/tables/cm/orderdetail.csv'.)
ODDataFrame1 = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .option('nullValue', 'null')
    .load('../datasets/classicmodels/orderdetail.csv')
)

ODDataFrame1.show(5)

# Drop the order columns that the summary does not use.
ODataFrameDrop = ODataFrame1.drop(
    'requiredDate',
    'shippedDate',
    'comments',
)

# Inner join of the trimmed orders to the order details on orderNumber.
OSummary = ODataFrameDrop.join(ODDataFrame1, on='orderNumber', how='inner')
OSummary.show(5)

# Add the per-row amount: quantity ordered multiplied by unit price.
OSumTotalAmount = OSummary.withColumn(
    'totalAmount',
    OSummary['quantityOrdered'] * OSummary['priceEach'],
)

OSumTotalAmount.show(5)


###########Product Summary#######################################
from pyspark.sql.functions import sum,round,col

# Aggregate the order lines to one row per product per order date.
#
# quantityOrdered is summed instead of being used as a grouping key. Grouping
# on it splits a product/day into several rows whenever the line quantities
# differ, and merges lines with equal quantities while counting the quantity
# only once -- both of which distort totalSales and any later cumulative sum.
#
# NOTE: sum / round below are the pyspark.sql.functions versions imported just
# above this block, not the Python builtins.
productSummaryG = OSumTotalAmount.groupBy('productCode', 'orderDate')
# productSummaryG is a GroupedData object; help(productSummaryG) lists its API.

productSummaryA = productSummaryG.agg(
    sum('quantityOrdered').alias('quantityOrdered'),
    round(sum('totalAmount'), 2).alias('totalSales'),
)
# agg() returns a DataFrame with the columns
# productCode, orderDate, quantityOrdered, totalSales.

# Sort by product code, most recent order date first.
productSummaryS = productSummaryA.orderBy('productCode', col('orderDate').desc())
productSummaryS.show(5)

+-----------+-------------------+-------------------+-------------------+-------+--------------------+--------------+
|orderNumber|          orderDate|       requiredDate|        shippedDate| status|            comments|customerNumber|
+-----------+-------------------+-------------------+-------------------+-------+--------------------+--------------+
|      10100|2003-01-06 00:00:00|2003-01-13 00:00:00|2003-01-10 00:00:00|Shipped|                null|           363|
|      10101|2003-01-09 00:00:00|2003-01-18 00:00:00|2003-01-11 00:00:00|Shipped|Check on availabi...|           128|
|      10102|2003-01-10 00:00:00|2003-01-18 00:00:00|2003-01-14 00:00:00|Shipped|                null|           181|
|      10103|2003-01-29 00:00:00|2003-02-07 00:00:00|2003-02-02 00:00:00|Shipped|                null|           121|
|      10104|2003-01-31 00:00:00|2003-02-09 00:00:00|2003-02-01 00:00:00|Shipped|                null|           141|
+-----------+-------------------+-------------------+---

In [4]:
#SQL Commands

#Creating Database and Querying
# Create the database only when it is missing so this cell can be re-run
# without failing on an already-existing database.
spark.sql('CREATE DATABASE IF NOT EXISTS sparkHive')

# Persist the DataFrame as a table. The default save mode raises an error when
# the table already exists, so overwrite explicitly to keep the cell idempotent.
OSumTotalAmount.write.mode('overwrite').saveAsTable('sparkHive.OSumTotalAmount')

# help(OSumTotalAmount.write) lists the other writer options.

# SELECT statements return a DataFrame.
spark.sql('SELECT * FROM sparkHive.OSumTotalAmount').show(5)

# Views: register the DataFrame under a temporary name and query that name
# (previously the view was created but the queries still read the table).
OSumTotalAmount.createOrReplaceTempView('orderSummary')

spark.sql('SELECT * FROM orderSummary').show(5)

# Joins are also possible between two temp views.

spark.sql('SELECT DISTINCT customerNumber FROM orderSummary').show(5)

+-----------+-------------------+-------+--------------+-----------+---------------+---------+---------------+-----------+
|orderNumber|          orderDate| status|customerNumber|productCode|quantityOrdered|priceEach|orderLineNumber|totalAmount|
+-----------+-------------------+-------+--------------+-----------+---------------+---------+---------------+-----------+
|      10100|2003-01-06 00:00:00|Shipped|           363|   S18_1749|             30|    136.0|              3|     4080.0|
|      10100|2003-01-06 00:00:00|Shipped|           363|   S18_2248|             50|    55.09|              2|     2754.5|
|      10100|2003-01-06 00:00:00|Shipped|           363|   S18_4409|             22|    75.46|              4|    1660.12|
|      10100|2003-01-06 00:00:00|Shipped|           363|   S24_3969|             49|    35.29|              1|    1729.21|
|      10101|2003-01-09 00:00:00|Shipped|           128|   S18_2325|             25|   108.06|              4|     2701.5|
+-----------+---

In [5]:
#Window functions
#MySQL reference - https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html


from pyspark.sql.window import Window
from pyspark.sql.functions import *

# E1 - Cumulative sales quantity for each product, accumulated in orderDate order.
# The window specifies its own partitioning and ordering, so the sort order of
# productSummaryS has no effect on the result.
# With an ORDER BY and no explicit frame, the sum runs from the start of the
# partition up to the current row (rows with an equal orderDate share a value).
product_by_date = Window.partitionBy('productCode').orderBy('orderDate')
prodSum1 = productSummaryS.withColumn(
    'cumulativeQty',
    sum('quantityOrdered').over(product_by_date),
)

# E2 - Top 3 products each day, ranked by totalSales (highest first).
# rank() gives tied rows the same rank, so a day can yield more than 3 rows.
day_by_sales = Window.partitionBy('orderDate').orderBy(col('totalSales').desc())
prodSum2 = productSummaryS.withColumn('prodRank', rank().over(day_by_sales))

# Keep ranks 1..3 only.
prodSum2F = prodSum2.filter(prodSum2.prodRank <= 3)

# E3 - Add a serial number across the whole result.
# productCode is included as a tiebreaker: rows tied on (orderDate, prodRank)
# would otherwise receive their row numbers in a non-deterministic order.
# NOTE: a window without partitionBy is evaluated over a single partition.
serial_order = Window.orderBy('orderDate', 'prodRank', 'productCode')
prodSum2F1 = prodSum2F.withColumn('rowNum', row_number().over(serial_order))


prodSum1.show()
prodSum2.show()
prodSum2F.show()
prodSum2F1.show()

+-----------+-------------------+---------------+----------+-------------+
|productCode|          orderDate|quantityOrdered|totalSales|cumulativeQty|
+-----------+-------------------+---------------+----------+-------------+
|   S18_4600|2003-01-29 00:00:00|             36|   3530.52|           36|
|   S18_4600|2003-04-01 00:00:00|             41|   4318.94|           77|
|   S18_4600|2003-05-28 00:00:00|             50|    5146.0|          127|
|   S18_4600|2003-07-24 00:00:00|             40|    4020.0|          167|
|   S18_4600|2003-09-19 00:00:00|             49|   5458.11|          216|
|   S18_4600|2003-10-21 00:00:00|             45|    4849.2|          261|
|   S18_4600|2003-11-06 00:00:00|             47|   4837.24|          308|
|   S18_4600|2003-11-13 00:00:00|             21|   2491.86|          329|
|   S18_4600|2003-11-25 00:00:00|             32|   3642.24|          361|
|   S18_4600|2003-12-09 00:00:00|             47|   5633.89|          408|
|   S18_4600|2004-02-04 0