In [1]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Use 10 shuffle partitions for the aggregations in this notebook instead of
# whatever the session was started with.
spark.conf.set("spark.sql.shuffle.partitions", "10")
# Memory settings kept for reference but disabled.
# NOTE(review): these look like launch-time settings; changing them on a live
# session through spark.conf.set is probably ineffective -- confirm before
# re-enabling (they likely belong on the builder or in spark-submit options).
# spark.conf.set("spark.driver.memory", "8g")
# spark.conf.set("spark.driver.maxResultSize", "10g")
# spark.conf.set("spark.executor.memory", "10g")
# Last expression of the cell: echoes the session object.
spark

In [2]:
# Batch-load every per-day retail CSV into one DataFrame. The first row of each
# file is treated as a header and the column types are inferred from the data.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("./../sdg/data/retail-data/by-day/*.csv")
)

In [3]:
# Print the column names and inferred types of the batch DataFrame.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [4]:
# Keep the inferred schema around: the streaming read of the same files later
# in the notebook is given this schema explicitly.
staticSchema = staticDataFrame.schema

# Expose the batch data to Spark SQL under the name "retail_data".
staticDataFrame.createOrReplaceTempView("retail_data")

In [5]:
from pyspark.sql.functions import window, desc, col

# Batch version of the aggregation: total spend (UnitPrice * Quantity) per
# customer within each one-day window of InvoiceDate; show the first 5 rows
# without truncating column contents.
(
    staticDataFrame
    .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
    .groupBy("CustomerId", window("InvoiceDate", "1 day"))
    .sum("total_cost")
    .show(5, truncate=False)
)

+----------+------------------------------------------+------------------+
|CustomerId|window                                    |sum(total_cost)   |
+----------+------------------------------------------+------------------+
|14075.0   |[2011-12-05 01:00:00, 2011-12-06 01:00:00]|316.78000000000003|
|15290.0   |[2011-12-05 01:00:00, 2011-12-06 01:00:00]|263.02000000000004|
|16500.0   |[2011-12-05 01:00:00, 2011-12-06 01:00:00]|52.74000000000001 |
|16873.0   |[2011-12-05 01:00:00, 2011-12-06 01:00:00]|1854.8300000000002|
|14649.0   |[2011-12-05 01:00:00, 2011-12-06 01:00:00]|513.9899999999998 |
+----------+------------------------------------------+------------------+
only showing top 5 rows



In [7]:
# Read the same CSV files as a stream, reusing the schema captured from the
# batch read. maxFilesPerTrigger=1 makes each trigger pick up a single file.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load("../sdg/data/retail-data/by-day/*.csv")
)

In [8]:
# Sanity check: should be True for a DataFrame created through readStream.
streamingDataFrame.isStreaming

True

In [9]:
# Streaming version of the per-customer spend aggregation.
# NOTE: despite "PerHour" in the variable name, the grouping window is one day.
# Nothing is computed here; this only defines the streaming transformation.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
    .groupBy("CustomerId", window("InvoiceDate", "1 day"))
    .sum("total_cost")
)

In [10]:
# Start the streaming aggregation with the in-memory sink. The query name
# "customer_purchases" is the table name the SQL cell below selects from, and
# "complete" mode re-emits the whole aggregated result on every trigger.
# The cell's value is the StreamingQuery handle returned by start().
(
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x11e838cd0>

In [29]:
# Peek at the current contents of the in-memory streaming table: the five
# largest per-customer daily totals seen so far. Re-running the cell while the
# stream is still ingesting files can show different rows.
spark.sql(
    """
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
"""
).show(5, truncate=False)

+----------+------------------------------------------+------------------+
|CustomerId|window                                    |sum(total_cost)   |
+----------+------------------------------------------+------------------+
|17450.0   |[2011-09-20 02:00:00, 2011-09-21 02:00:00]|71601.44          |
|null      |[2011-11-14 01:00:00, 2011-11-15 01:00:00]|55316.08          |
|null      |[2011-11-07 01:00:00, 2011-11-08 01:00:00]|42939.17          |
|null      |[2011-03-29 02:00:00, 2011-03-30 02:00:00]|33521.39999999998 |
|null      |[2011-12-08 01:00:00, 2011-12-09 01:00:00]|31975.590000000007|
+----------+------------------------------------------+------------------+
only showing top 5 rows



In [30]:
# Second sink for the same streaming aggregation: emit the complete result
# table through the console sink on every trigger.
# NOTE(review): console-sink output goes to the driver process's stdout, which
# is usually the terminal running the kernel rather than this notebook cell --
# confirm where it shows up in your setup.
# The StreamingQuery returned by start() is only echoed, not stored; it stays
# reachable through spark.streams.active if it needs to be stopped.
(
    purchaseByCustomerPerHour.writeStream
    .format("console")
    .queryName("customer_purchases_2")  # fixed typo: was "customr_purchases_2"
    .outputMode("complete")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x11f12d850>

In [31]:
from pyspark.sql.functions import date_format

# Prepare the batch data for ML:
#   * fill nulls with 0 (a numeric fill value only affects numeric columns),
#   * derive the weekday name of each invoice ("EEEE" = full day name),
#   * reduce the DataFrame to at most 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format("InvoiceDate", "EEEE"))
    .coalesce(5)
)

In [32]:
# Time-based split: invoices before 2011-07-01 are used for training,
# invoices on or after that date for testing.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

In [33]:
# Show the row count of each split, training first and then test.
for splitFrame in (trainDataFrame, testDataFrame):
    display(splitFrame.count())

245903

296006

In [34]:
from pyspark.ml.feature import StringIndexer

# Map each weekday name to a numeric index column.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [36]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the weekday index into a vector column.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [37]:
from pyspark.ml.feature import VectorAssembler

# Combine price, quantity and the encoded weekday into the single "features"
# vector column that the clustering model consumes.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [38]:
from pyspark.ml import Pipeline

# Chain the feature steps in dependency order: index -> encode -> assemble.
transformationPipeline = Pipeline(stages=[indexer, encoder, vectorAssembler])

In [39]:
# Fit the feature pipeline on the training split only, so the weekday indexing
# is learned from training data.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [40]:
# Apply the fitted feature pipeline to the training split.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [41]:
# Mark the transformed training data for caching; it is reused below for both
# model fitting and cost evaluation. The cell echoes the DataFrame itself.
transformedTraining.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [45]:
from pyspark.ml.clustering import KMeans

# K-means with 20 clusters; the fixed seed keeps centroid initialization
# repeatable between runs.
kmeans = KMeans(k=20, seed=1)

In [46]:
# Train the clustering model on the transformed training split.
kmModel = kmeans.fit(transformedTraining)

In [47]:
# Clustering cost of the model on the training split.
# NOTE(review): KMeansModel.computeCost is deprecated as of Spark 2.4 and
# removed in Spark 3.0, so this cell fails on Spark 3+. ClusteringEvaluator is
# the documented replacement, but it reports a different metric (silhouette),
# so the number shown below would not be comparable -- confirm target version.
kmModel.computeCost(transformedTraining)

103503481.10517502

In [48]:
# Apply the pipeline fitted on the training split to the held-out test split.
transformedTest = fittedPipeline.transform(testDataFrame)

In [49]:
# Clustering cost of the model on the held-out test split.
# NOTE(review): KMeansModel.computeCost is deprecated as of Spark 2.4 and
# removed in Spark 3.0; see ClusteringEvaluator for the supported (but
# differently defined) metric before running this on Spark 3+.
# The test split has more rows than the training split (see the counts above),
# so the raw cost is not directly comparable to the training cost.
kmModel.computeCost(transformedTest)

548689531.0782777