# 1 Predicting sales data using Spark Streaming

### 2.1 Create SparkSession


In [1]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import split
from pyspark.sql.functions import explode
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql import SparkSession  # Spark SQL
import os
# Set before the SparkSession is created so that the launcher pulls in the
# Kafka connector packages needed by the "kafka" stream source.
# NOTE(review): the 3.0.0 / Scala 2.12 artifacts presumably match the installed Spark build - confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'

# Run Spark in local mode with 4 worker threads.
master = "local[4]"

# Setup `appName` field to be displayed at Spark cluster UI page
app_name = "FIT5202 Assignment 2b"
# Setup configuration parameters for Spark
spark_conf = (SparkConf()
              .setMaster(master)
              .setAppName(app_name))

# Setup SparkSession and configure it with Melbourne timezone.
# The session time zone controls how timestamps are interpreted and rendered
# by Spark SQL; without it the session falls back to the JVM's local zone.
spark = (
    SparkSession
    .builder
    .config(conf=spark_conf)
    .config("spark.sql.session.timeZone", "Australia/Melbourne")
    .getOrCreate()
)

### 2.2 Define schema and load file



In [2]:
# Schema for the values of the incoming records: every field arrives as a
# string except `ts`, which is an integer.
_string_field_names = [
    "Store",
    "Date",
    "Temperature",
    "Fuel_Price",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
    "IsHoliday",
    "last_weekly_sales",
]
produce_data_labels = [(field_name, StringType()) for field_name in _string_field_names]
produce_data_labels.append(("ts", IntegerType()))

# Each message value is parsed as an array of structs, all fields nullable.
_record_fields = [
    StructField(field_name, field_type, True)
    for field_name, field_type in produce_data_labels
]
produce_data_schema = ArrayType(StructType(_record_fields))

### 2.3 Ingest Kafka data




In [4]:
hostip = "192.168.8.133"  # change me
topic = 'assignment2b'

# Configure a streaming reader subscribed to `topic` on the Kafka broker.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", f'{hostip}:9092')
    .option("subscribe", topic)
)

# Kafka delivers key and value as binary; cast both to strings up front.
kafka_raw = kafka_reader.load()
df = kafka_raw.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [5]:
# Print the column names and types of the streaming DataFrame `df`.
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [6]:
# Preview the stream: append each new micro-batch of `df` to the console.
console_writer = df.writeStream.format("console").outputMode("append")

# Fire a micro-batch every 5 seconds and start the query.
query = console_writer.trigger(processingTime='5 seconds').start()

In [7]:
# Stop the console preview query held in `query`.
query.stop()

In [8]:
# Parse the JSON payload in `value` into an array of structs, then emit one
# row per array element.
parsed_value_col = F.from_json(
    F.col("value").cast("string"), produce_data_schema
).alias('parsed_value')
unnested_value_col = F.explode(F.col("parsed_value")).alias('unnested_value')

# NOTE: `df` is overwritten here, so this cell cannot be re-run without first
# re-running the cell that reads from Kafka (the `value` column is gone).
df = df.select(parsed_value_col).select(unnested_value_col)

In [10]:
# Flatten the `unnested_value` struct: promote each field to a top-level
# column that keeps the field's own name.
_unnested_field_names = (
    "Store",
    "Date",
    "Temperature",
    "Fuel_Price",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
    "IsHoliday",
    "last_weekly_sales",
    "ts",
)
df_formatted = df.select(
    *[
        F.col("unnested_value." + field_name).alias(field_name)
        for field_name in _unnested_field_names
    ]
)

### 2.4 Persist raw data


In [11]:
# Print the column names and types of the flattened DataFrame `df_formatted`.
df_formatted.printSchema()

root
 |-- Store: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Fuel_Price: string (nullable = true)
 |-- MarkDown1: string (nullable = true)
 |-- MarkDown2: string (nullable = true)
 |-- MarkDown3: string (nullable = true)
 |-- MarkDown4: string (nullable = true)
 |-- MarkDown5: string (nullable = true)
 |-- CPI: string (nullable = true)
 |-- Unemployment: string (nullable = true)
 |-- IsHoliday: string (nullable = true)
 |-- last_weekly_sales: string (nullable = true)
 |-- ts: integer (nullable = true)



In [12]:
# Preview the flattened stream: append each new micro-batch of
# `df_formatted` to the console.
formatted_console_writer = (
    df_formatted.writeStream.format("console").outputMode("append")
)

# Fire a micro-batch every 5 seconds and start the query.
query = formatted_console_writer.trigger(processingTime='5 seconds').start()

In [13]:
# Stop the console preview query held in `query`.
query.stop()

### 2.5 Transform data formats


### 2.6 Prepare feature columns


### 2.7 Join the local data


### 2.8 Perform predictions


### 2.9 Write code to process the data following the requirements


### 2.10 Compute the average weekly sales predictions for each store type and write the stream back to a Kafka sink using a different topic name

The data you send should look like this:

|  key   | value  |
|  ----  | ----  |
| timestamp of window start | JSON of store type and avg sales |
| '1673233646'  | '{"Type":"A","predict_weekly_sales":20000}' |


##### For cleaning up the queries and files