In [None]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Use 10 shuffle partitions for the aggregations in this notebook.
spark.conf.set(
    "spark.sql.shuffle.partitions",
    "10",
)

# NOTE(review): memory settings were previously sketched here as disabled
# runtime conf.set calls (spark.driver.memory=8g, spark.driver.maxResultSize=10g,
# spark.executor.memory=10g). Such settings presumably have to be supplied when
# the session is launched rather than on a live session - confirm before
# re-enabling them.

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [None]:
# Batch-read every per-day CSV file in one go; the first row of each file is
# the header and column types are inferred from the data.
staticDataFrame = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("./../sdg/data/retail-data/by-day/*.csv")
)

In [None]:
# Print the column names and types produced by the inferSchema read above.
staticDataFrame.printSchema()

In [None]:
# Keep the inferred schema in its own variable so it can be reused later
# (the streaming reader is given this schema explicitly).
staticSchema = staticDataFrame.schema

# Expose the batch data to SQL under the name "retail_data".
staticDataFrame.createOrReplaceTempView("retail_data")

In [None]:
from pyspark.sql.functions import window, desc, col

# Batch version of the aggregation: total spend per customer per one-day
# window of InvoiceDate; print the first five rows without truncating columns.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(
        "CustomerId",
        window("InvoiceDate", "1 day"),
    )
    .sum("total_cost")
    .show(5, truncate=False)
)

In [None]:
# Streaming read over the same CSV files. The schema captured from the batch
# read is passed in explicitly, and maxFilesPerTrigger=1 limits each trigger
# to a single file.
streamingDataFrame = (
    spark.readStream
    .format("csv")
    .schema(staticSchema)
    .options(header="true", maxFilesPerTrigger=1)
    .load("../sdg/data/retail-data/by-day/*.csv")
)

In [None]:
# Sanity check: display whether this DataFrame (built via readStream) is a
# streaming DataFrame.
streamingDataFrame.isStreaming

In [None]:
# Streaming counterpart of the batch aggregation: total spend per customer per
# window of InvoiceDate.
# NOTE(review): despite the "PerHour" in the variable name, the window below is
# "1 day". The name is kept because later cells refer to it.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
    .groupBy("CustomerId", window("InvoiceDate", "1 day"))
    .sum("total_cost")
)

In [None]:
# Start the streaming aggregation, writing the complete result table to an
# in-memory sink registered under the query name "customer_purchases".
# The started query object is the cell's last expression and gets displayed.
(
    purchaseByCustomerPerHour.writeStream
    .queryName("customer_purchases")
    .outputMode("complete")
    .format("memory")
    .start()
)

In [None]:
# Query the in-memory table fed by the "customer_purchases" streaming query and
# show the five largest totals.
# NOTE(review): presumably this reflects only the data the stream has processed
# so far, so re-running the cell may show different rows - confirm.
spark.sql("""
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
""").show(5, truncate=False)

In [None]:
# Start a second streaming query over the same aggregation, this time writing
# the complete result table to the console sink.
# The query name previously read "customr_purchases_2"; the typo is corrected so
# the name matches the "customer_purchases" query started earlier.
# NOTE(review): console-sink output presumably goes to the driver's stdout,
# which may not be the notebook cell output - confirm where it appears.
purchaseByCustomerPerHour.writeStream\
    .format("console")\
    .queryName("customer_purchases_2")\
    .outputMode("complete")\
    .start()

In [None]:
from pyspark.sql.functions import date_format

# Prepare the batch data for modelling: replace nulls with 0, derive the
# day-of-week name ("EEEE" pattern) from InvoiceDate, and coalesce to 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .fillna(0)
    .withColumn("day_of_week", date_format("InvoiceDate", "EEEE"))
    .coalesce(5)
)

In [None]:
# Split by date: rows before 2011-07-01 are used for training, rows on or
# after that date for testing.
trainDataFrame = preppedDataFrame.filter("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.filter("InvoiceDate >= '2011-07-01'")

In [None]:
# Display the row count of the training split, then of the test split.
for _split in (trainDataFrame, testDataFrame):
    display(_split.count())

In [None]:
from pyspark.ml.feature import StringIndexer

# Map each day-of-week name to a numeric index.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [None]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the day-of-week index produced by the indexer.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine price, quantity and the encoded day of week into a single
# "features" vector column.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [None]:
from pyspark.ml import Pipeline

# Chain the three feature stages in the order they must run:
# index -> one-hot encode -> assemble.
transformationPipeline = Pipeline(
    stages=[indexer, encoder, vectorAssembler],
)

In [None]:
# Fit the feature pipeline on the training split only.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [None]:
# Apply the fitted pipeline to the training split to add the feature columns.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [None]:
# Mark the transformed training data for caching; it is reused by the model
# fit and the cost computation in later cells.
transformedTraining.cache()

In [None]:
from pyspark.ml.clustering import KMeans

# K-means with 20 clusters; the seed is fixed so runs are repeatable.
kmeans = KMeans(k=20, seed=1)

In [None]:
# Train the k-means model on the transformed training data.
kmModel = kmeans.fit(transformedTraining)

In [None]:
# K-means cost (sum of squared distances of points to their nearest center)
# on the training data.
# KMeansModel.computeCost was deprecated in Spark 2.4 and is no longer available
# in Spark 3.x, where this cell would fail with AttributeError. Use it when it
# exists; otherwise read the training cost from the model's training summary.
# NOTE(review): summary.trainingCost is presumably the same quantity recorded
# during fitting; small numeric differences from computeCost are possible - confirm.
(
    kmModel.computeCost(transformedTraining)
    if hasattr(kmModel, "computeCost")
    else kmModel.summary.trainingCost
)

In [None]:
# Apply the pipeline fitted on the training split to the test split.
transformedTest = fittedPipeline.transform(testDataFrame)

In [None]:
def _kmeans_cost(model, dataset):
    """Return the sum of squared distances from each row to its assigned center.

    Fallback for Spark versions where KMeansModel.computeCost is unavailable.

    Args:
        model: a fitted KMeansModel.
        dataset: DataFrame containing the model's features column.

    Returns:
        The summed squared distance as a float (0 for an empty dataset).
    """
    centers = model.clusterCenters()
    # transform() adds the index of the assigned cluster for every row.
    assigned = model.transform(dataset).select(
        model.getFeaturesCol(), model.getPredictionCol()
    )
    return assigned.rdd.map(
        lambda row: float(row[0].squared_distance(centers[row[1]]))
    ).sum()


# K-means cost on the held-out test data.
# KMeansModel.computeCost was deprecated in Spark 2.4 and is no longer available
# in Spark 3.x, where this cell would fail with AttributeError. Use it when it
# exists; otherwise compute the same sum of squared distances explicitly.
# NOTE(review): the fallback assumes the model uses the default Euclidean
# distance measure (the KMeans estimator above only sets k and seed) - confirm.
(
    kmModel.computeCost(transformedTest)
    if hasattr(kmModel, "computeCost")
    else _kmeans_cost(kmModel, transformedTest)
)