# Retail Sales Data Preparation using Spark

This notebook prepares retail data for training a regression model that predicts the total sales revenue of a product at a store from the following features:
- Brand (The brand of the product)
- Quantity (Quantity of product purchased)
- Advert (Whether the product had an advertisement or not)
- Price (How much the product costs)

%md
<div><img src="https://stanalyticssolutionsdev.blob.core.windows.net/assets/sales_forecasting.jpg?sp=r&st=2022-09-23T16:12:34Z&se=2025-01-01T01:12:34Z&spr=https&sv=2021-06-08&sr=b&sig=l8Prl1UTwclNsUJQhhCKGxL%2B21dGPvUQVJKnEpB0NRk%3D" width="500" height="300"/></div>

## Importing Libraries

In [0]:
import dlt
from pyspark.sql import SparkSession
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from io import BytesIO
from copy import deepcopy
from datetime import datetime
from dateutil import parser
import logging
from pyspark.sql.types import *

## Defining the schema for the data

In [0]:
# Explicit schema for the sales transactions file: one StructField per column,
# listed in the order the columns are declared below. Passed as `schema=` when
# the raw file is read, so no schema inference takes place.
Dataschema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("ID", StringType()),
        ("WeekStarting", DateType()),
        ("Store", IntegerType()),
        ("Brand", StringType()),
        ("Quantity", IntegerType()),
        ("Advert", IntegerType()),
        ("Price", FloatType()),
        ("Revenue", FloatType()),
    )
])


## Load the data from the source and perform the transformations

In [0]:
@dlt.table(comment="Raw data")
def bronze_SalesTrans():
  """Bronze table: the sales transactions file read as CSV with ``Dataschema``.

  Returns:
      The DataFrame produced by ``spark.read.csv`` for the source file, with
      the explicit ``Dataschema`` applied instead of schema inference.
  """
  source_path = '/mnt/data-source/Store Transactions Data/dbo.SalesTransData.txt'
  reader = spark.read
  return reader.csv(source_path, schema=Dataschema)

In [0]:
@dlt.table(comment="Silver data")
def silver_rank_data():
    """Silver table: bronze rows plus a ``rank`` column.

    ``rank`` is ``percent_rank()`` evaluated over a single window with no
    partition columns, ordered by ``WeekStarting``.

    Returns:
        The ``bronze_SalesTrans`` DataFrame with the extra ``rank`` column.
    """
    bronze_rows = dlt.read('bronze_SalesTrans')
    # No partition columns: every row is ranked within the same window.
    week_window = Window.partitionBy().orderBy("WeekStarting")
    week_rank = percent_rank().over(week_window)
    return bronze_rows.withColumn("rank", week_rank)

In [0]:
@dlt.table(comment="Gold data")
def gold_train():
    """Gold training table: silver rows with ``rank <= .8``.

    Returns:
        The matching rows of ``silver_rank_data`` with the ``rank`` column
        dropped.
    """
    ranked_rows = dlt.read('silver_rank_data')
    kept_rows = ranked_rows.filter("rank <= .8")
    return kept_rows.drop("rank")
    
@dlt.table(comment="Gold data")
def gold_test():
    """Gold test table: silver rows with ``rank > .8``.

    Returns:
        The matching rows of ``silver_rank_data`` with the ``rank`` column
        dropped.
    """
    ranked_rows = dlt.read('silver_rank_data')
    held_out_rows = ranked_rows.filter("rank > .8")
    return held_out_rows.drop("rank")