## Overview

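This notebook shows how to augment sparse monthly sales data so that every product/year combination has a contiguous run of all 12 months. Months with no recorded sales are filled in with that product's average sales for the same year, and the completed data is saved to a Spark table.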

In [2]:
# Notebook parameters: the input CSV path ("product") and the name of the
# Spark table the result is saved to ("output").
dbutils.widgets.text(
    "product",
    defaultValue="dbfs:/FileStore/tables/sales.csv",
    label="Source orders path",
)
dbutils.widgets.text(
    "output",
    defaultValue="test_output",
    label="Output order details spark table name",
)

# Last expression of the cell: echoes the currently selected source path.
dbutils.widgets.get("product")

In [3]:
from pyspark.sql.types import StructField,StructType,IntegerType,StringType,FloatType,ArrayType

# File location and type
file_location = dbutils.widgets.get("product")
file_type = "csv"

# CSV options (these are passed to the reader below, so edit them here)
infer_schema = "false"  # not passed to the reader: an explicit schema is supplied instead
first_row_is_header = "true"
delimiter = ","

# Explicit schema for the sales file: one row per (Month, Year, Product).
schema1 = StructType([
    StructField("Month", IntegerType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Product", StringType(), True),
    StructField("Sales", FloatType(), True),
])

# Read the source file using the options declared above rather than
# repeating them as literals, so the configuration has a single source of truth.
productDF = spark.read.load(
    file_location,
    format=file_type,
    header=first_row_is_header,
    sep=delimiter,
    schema=schema1,
)

# Sanity check: how many monthly rows exist per year and product before filling.
display(productDF.groupby("Year","Product").count())

Year,Product,count
2002,cornflakes,3


In [4]:
# Inspect the rows recorded for 2003.
rows_2003 = productDF.where(productDF["Year"] == 2003)
display(rows_2003)

Month,Year,Product,Sales


In [5]:
from pyspark.sql import functions as F
from pyspark.sql import types 

# Per product and year: the set of months that have a sales row, and the
# mean of Sales over those rows. The output columns are named with alias()
# instead of renaming Spark's auto-generated names ("collect_set(Month)",
# "avg(Sales)"); withColumnRenamed silently does nothing when the generated
# name does not match, which would break the cells that read these columns.
df_1 = (
    productDF
    .groupby(["Product","Year"])
    .agg(
        F.collect_set("Month").alias("Months_Sold"),
        F.mean("Sales").alias("Average_per_year"),
    )
)

display(df_1)

Product,Year,Months_Sold,Average_per_year
cornflakes,2002,"List(1, 5, 3)",2.896666685740153


In [6]:
# Array literal holding every month number of a year (1..12).
all_months = F.array([F.lit(m) for m in range(1, 13)])

# Build the fill rows from df_1 WITHOUT rebinding df_1. The original cell
# overwrote df_1 with its own exploded result, so running the cell a second
# time exploded the already-exploded rows and multiplied the fill rows.
missing_months = (
    df_1
    .withColumn("Year_Months", all_months)
    # Months of the year that have no sales row for this product/year.
    .withColumn("Left_Months", F.array_except(F.col("Year_Months"), F.col("Months_Sold")))
    # One row per missing month; groups with no missing months yield no rows.
    .withColumn("Month", F.explode(F.col("Left_Months")))
    # Fill value: the product's average sales for that year.
    .withColumn("Sales", F.col("Average_per_year"))
)
df_2 = missing_months.select("Month","Year","Product","Sales")

# Combine original and fill rows. unionByName matches columns by name, so the
# result does not depend on the column order of the select above.
df_all = productDF.unionByName(df_2).orderBy([F.col("Year"),F.col("Month")])
display(df_all)

Month,Year,Product,Sales
1,2002,cornflakes,2.900000095367432
2,2002,cornflakes,2.896666685740153
3,2002,cornflakes,2.9200000762939453
4,2002,cornflakes,2.896666685740153
5,2002,cornflakes,2.869999885559082
6,2002,cornflakes,2.896666685740153
7,2002,cornflakes,2.896666685740153
8,2002,cornflakes,2.896666685740153
9,2002,cornflakes,2.896666685740153
10,2002,cornflakes,2.896666685740153


In [7]:
# Row count per year and product in the filled data.
rows_per_year_product = df_all.groupBy("Year", "Product").count()
display(rows_per_year_product)

Year,Product,count
2002,cornflakes,12


In [8]:
# Persist the filled data as a Spark table named by the "output" widget,
# replacing any existing table of that name.
output_table = dbutils.widgets.get("output")
df_all.write.mode("overwrite").saveAsTable(output_table)