## Overview

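This notebook shows how to augment sparse monthly sales data so that every year has a contiguous run of months (1–12): each month with no sales record is filled in with that product's average sales for the year.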

In [2]:
from pyspark.sql.types import StructField,StructType,IntegerType,StringType,FloatType,ArrayType

# File location and type
file_location = "/FileStore/tables/sales.csv"
file_type = "csv"

# CSV options
# infer_schema is not passed to the reader: an explicit schema is supplied below.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Explicit schema for the sales file: one row per (Month, Year, Product).
# NOTE(review): Sales is a 32-bit FloatType. Later output shows values such as
# 2.900000095367432 for an input of 2.9, which looks like this column being
# widened to double when averaged/unioned. DoubleType would likely avoid that;
# left as-is here so the column type seen by later cells does not change.
schema1 = StructType([
    StructField("Month", IntegerType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Product", StringType(), True),
    StructField("Sales", FloatType(), True),
])

# Read using the options declared above instead of repeating the literals,
# so changing a setting in one place actually takes effect.
productDF = spark.read.load(
    file_location,
    format=file_type,
    header=first_row_is_header,
    sep=delimiter,
    schema=schema1,
)

# How many monthly records exist for each (Year, Product)?
display(productDF.groupby("Year","Product").count())

Year,Product,count
2003,cornflakes,5
2008,cornflakes,4
2002,cornflakes,4
2005,cornflakes,3


In [3]:
# Look at a single year to see which months are present in the raw data.
display(productDF.where(productDF["Year"] == 2003))

Month,Year,Product,Sales
1,2003,cornflakes,2.9
2,2003,cornflakes,2.92
3,2003,cornflakes,2.87
9,2003,cornflakes,2.67
11,2003,cornflakes,2.9


In [4]:
from pyspark.sql import functions as F
from pyspark.sql import types

# One row per (Product, Year) holding:
#   Months_Sold      - the distinct months that have a sales record
#   Average_per_year - the mean of Sales over those records
# The output columns are named with alias() at aggregation time rather than
# renaming Spark's generated names ("collect_set(Month)", "avg(Sales)")
# afterwards: withColumnRenamed silently does nothing when the old name is
# not found, so a change in the generated name would go unnoticed.
df_1 = (
    productDF
    .groupBy("Product", "Year")
    .agg(
        F.collect_set("Month").alias("Months_Sold"),
        F.mean("Sales").alias("Average_per_year"),
    )
)

display(df_1)

Product,Year,Months_Sold,Average_per_year
cornflakes,2002,"List(1, 5, 3, 7)",2.840000033378601
cornflakes,2005,"List(1, 5, 7)",2.8200000127156577
cornflakes,2008,"List(1, 5, 2, 7)",2.840000033378601
cornflakes,2003,"List(9, 1, 2, 3, 11)",2.852000045776367


In [5]:
# Fill the gaps: for every (Product, Year) emit one synthetic row for each
# month 1..12 that has no sales record, using that year's average as Sales.
#
# The intermediate result gets its own name instead of rebinding df_1.
# Rebinding df_1 to the exploded frame makes this cell non-idempotent:
# running it a second time would explode the already-exploded rows and
# duplicate the synthetic data.
all_months = F.array([F.lit(month) for month in range(1, 13)])

df_missing = (
    df_1
    .withColumn("Year_Months", all_months)
    # Months of the year that are absent from Months_Sold.
    .withColumn("Left_Months", F.array_except(F.col("Year_Months"), F.col("Months_Sold")))
    # One output row per missing month.
    .withColumn("Month", F.explode(F.col("Left_Months")))
    .withColumn("Sales", F.col("Average_per_year"))
)

df_2 = df_missing.select("Month", "Year", "Product", "Sales")

# unionByName matches columns by name; plain union() matches by position and
# would silently mix columns up if the select order above ever changed.
df_all = productDF.unionByName(df_2).orderBy("Year", "Month")
display(df_all)

In [6]:
# Sanity check: count the rows per (Year, Product) after gap filling.
display(df_all.groupBy("Year", "Product").count())

Year,Product,count
2003,cornflakes,12
2008,cornflakes,12
2002,cornflakes,12
2005,cornflakes,12


In [7]:
# Show the combined frame: original sales rows plus the filled-in months.
display(df_all)

Month,Year,Product,Sales
1,2002,cornflakes,2.900000095367432
2,2002,cornflakes,2.840000033378601
3,2002,cornflakes,2.9200000762939453
4,2002,cornflakes,2.840000033378601
5,2002,cornflakes,2.869999885559082
6,2002,cornflakes,2.840000033378601
7,2002,cornflakes,2.6700000762939453
8,2002,cornflakes,2.840000033378601
9,2002,cornflakes,2.840000033378601
10,2002,cornflakes,2.840000033378601
