# 1Predicting sales data using Spark Streaming

### 2.1 Create SparkSession


In [99]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col, decode, expr
from pyspark.sql.functions import split
from pyspark.sql.functions import explode
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql import SparkSession  # Spark SQL
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'

# the below setup will run Spark in local mode with * working processors(equal to logical cores on the machine)
master = "local[4]"

# Setup `appName` field to be displayed at Spark cluster UI page
app_name = "FIT5202 Assignment 2b"
# Setup configuration parameters for Spark
spark_conf = (SparkConf()
              .setMaster(master)
              .setAppName(app_name))

# Setup SparkSession and configure it with Melbourne timezone.
spark = (
    SparkSession
    .builder
    .config(conf=spark_conf)
    .getOrCreate()
)

### 2.2 Define schema and load file



In [85]:
# provide a schema to the valuees
produce_data_labels = [
    ("Store", StringType()),
    ("Date", StringType()),
    ("Temperature", StringType()),
    ("Fuel_Price", StringType()),
    ("MarkDown1", StringType()),
    ("MarkDown2", StringType()),
    ("MarkDown3", StringType()),
    ("MarkDown4", StringType()),
    ("MarkDown5", StringType()),
    ("CPI", StringType()),
    ("Unemployment", StringType()),
    ("IsHoliday", StringType()),
    ("last_weekly_sales", StringType()),
    ("ts", IntegerType())
]

# features schema
produce_data_schema = ArrayType(StructType(
    [StructField(x[0], x[1], True) for x in produce_data_labels]))

### 2.3 Injest Kafka data

In [86]:
# setup
hostip = "192.168.8.133"  # change me
topic = 'assignment2b'

# read df
df = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", f'{hostip}:9092')
      .option("subscribe", topic)
      .option("dateFormat", "d/M/y")
      .load()  # load df
      # re-hydrate binary
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .select(F.from_json(F.col("value").cast("string"),
                          produce_data_schema).alias('parsed_value'))  # parse json
      # un-nest columns
      .select(F.explode(F.col("parsed_value")).alias('unnested_value'))
      )

# format dataframe and cast into proper data types
df_formatted = df.select(
    F.col("unnested_value.Store").alias("Store"),
    F.col("unnested_value.Date").alias("Date"),
    F.col("unnested_value.Temperature").alias("Temperature"),
    F.col("unnested_value.Fuel_Price").alias("Fuel_Price"),
    F.col("unnested_value.MarkDown1").alias("MarkDown1"),
    F.col("unnested_value.MarkDown2").alias("MarkDown2"),
    F.col("unnested_value.MarkDown3").alias("MarkDown3"),
    F.col("unnested_value.MarkDown4").alias("MarkDown4"),
    F.col("unnested_value.MarkDown5").alias("MarkDown5"),
    F.col("unnested_value.CPI").alias("CPI"),
    F.col("unnested_value.Unemployment").alias("Unemployment"),
    F.col("unnested_value.IsHoliday").alias("IsHoliday"),
    F.col("unnested_value.last_weekly_sales").alias("last_weekly_sales"),
    F.col("unnested_value.ts").alias("ts"),
)

### 2.4 Persist raw data


In [87]:
def foreach_batch_function(df, epoch_id):
    df.show(5, False)

In [88]:
# final dataframe
query = (df_formatted
         .writeStream
         .format("parquet")
         .option("path", "output/filesink_output")
         .option("checkpointLocation", "checkpoint/filesink_checkpoint")
         .foreachBatch(foreach_batch_function)
         .trigger(processingTime='5 seconds')
         .start()
         )

+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+------------------+----------+
|Store|Date      |Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|CPI      |Unemployment|IsHoliday|last_weekly_sales |ts        |
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+------------------+----------+
|5    |2011-09-02|90.38      |3.533     |nan      |nan      |nan      |nan      |nan      |216.35886|6.529       |false    |310338.1683688164 |1675494330|
|12   |2011-09-02|93.66      |3.798     |nan      |nan      |nan      |nan      |nan      |129.32594|13.503      |false    |1017593.4658427238|1675494330|
|43   |2011-09-02|87.84      |3.533     |nan      |nan      |nan      |nan      |nan      |207.6207 |10.641      |false    |561573.0730142593 |1675494330|
|33   |2011-09-02|99.2       |3.798     |nan      |nan      |nan      

In [89]:
# stop query
query.stop()

### 2.6 Prepare feature columns


In [94]:
# format dataframe and cast into proper data types
df_final = (df_formatted
            # cast data types
            .select(
                F.col("Store").cast(StringType()),
                F.col("Date").cast(DateType()),
                F.col("Temperature").cast(FloatType()),
                F.col("Fuel_Price").cast(FloatType()),
                F.col("MarkDown1").cast(FloatType()),
                F.col("MarkDown2").cast(FloatType()),
                F.col("MarkDown3").cast(FloatType()),
                F.col("MarkDown4").cast(FloatType()),
                F.col("MarkDown5").cast(FloatType()),
                F.col("CPI").cast(FloatType()),
                F.col("Unemployment").cast(FloatType()),
                F.col("IsHoliday").cast(StringType()),
                F.col("last_weekly_sales").cast(FloatType()),
                F.col("ts").cast(TimestampType()))
            #create new columns
            .withColumn("Month", F.month("Date"))
            .withColumn("day_of_month", F.dayofmonth("Date"))
            .withColumn("day_of_year", F.dayofyear("Date"))
            .withColumn("week_of_year", F.weekofyear("Date"))
            )

In [None]:
# query = (df_final
#          .writeStream
#          .outputMode("append")
#          .format("console")
#          .option("truncate", False)
#          # send the above dataframe to console every 5 seconds
#          .trigger(processingTime='5 seconds')
#          .start())

### 2.7 Join the local data


In [103]:
#read stores dataset

# stores data type
stores_labels = [
    ("Store", StringType()),
    ("Type", StringType()),
    ("Size", IntegerType()),
]
# stores schema
stores_schema = StructType([StructField(x[0], x[1], True)
                           for x in stores_labels])

# load stores df
df_stores = (
    spark.read.format("csv")
    .option("header", True)
    .option("encoding", "UTF-8")
    .load("data/stores.csv", schema=stores_schema)
)

df_joined = (df_final
            .join(df_stores,df_final.Store==df_stores.Store,how="left")
            )

In [108]:
query = (df_joined
         .select("Type","Size")
         .writeStream
         .outputMode("append")
         .format("console")
         .option("truncate", False)
         # send the above dataframe to console every 5 seconds
         .trigger(processingTime='5 seconds')
         .start())

In [109]:
query.stop()

### 2.8 Perform predictions


In [111]:
#load libraries
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

In [112]:
#load persisted model
pipelineModel = PipelineModel.load('sales_estimation_pipeline_model')

In [113]:
print(pipelineModel.stages[-1]._java_obj.paramMap())

{
	GBTRegressor_428b57e2311a-featuresCol: features,
	GBTRegressor_428b57e2311a-labelCol: Weekly_Sales
}


### 2.9 write code to process the data following requirements


### 2.10 average weekly sales predictions of different types of stores and write the stream back to Kafka sink using a different topic name

The data you sended should be like this:

|  key   | value  |
|  ----  | ----  |
| timestamp of window start | JSON of store type and avg sales |
| '1673233646'  | '{"Type":"A","predict_weekly_sales":20000}' |


##### For cleaning up the quries and files