In [7]:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._


// Sample daily sales for a single product, one entry per calendar day:
// (date as yyyy-MM-dd text, product id, units sold).
val data = Seq(
  ("2023-01-01", 100, 10),
  ("2023-01-02", 100, 15),
  ("2023-01-03", 100, 20),
  ("2023-01-04", 100, 25),
  ("2023-01-05", 100, 30),
  ("2023-01-06", 100, 35),
  ("2023-01-07", 100, 40),
  ("2023-01-08", 100, 45)
).map { case (day, productId, quantity) => Row(day, productId, quantity) }

// Explicit schema for the raw rows; Date is read as text first and
// converted to a real date once the DataFrame exists.
val schema = StructType(Seq(
  StructField("Date", StringType, nullable = true),
  StructField("ProductID", IntegerType, nullable = true),
  StructField("QuantitySold", IntegerType, nullable = true)
))

// Distribute the local rows so they can be paired with the schema above.
val rdd = spark.sparkContext.parallelize(data)

// Build the DataFrame, then replace the textual Date column with a DateType
// column parsed using the yyyy-MM-dd pattern.
val df = {
  val rawSales = spark.createDataFrame(rdd, schema)
  rawSales.withColumn("Date", to_date(col("Date"), "yyyy-MM-dd"))
}

// Inspect the result: column types first, then every row without truncation.
df.printSchema()
df.show(truncate = false)

root
 |-- Date: date (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- QuantitySold: integer (nullable = true)

+----------+---------+------------+
|Date      |ProductID|QuantitySold|
+----------+---------+------------+
|2023-01-01|100      |10          |
|2023-01-02|100      |15          |
|2023-01-03|100      |20          |
|2023-01-04|100      |25          |
|2023-01-05|100      |30          |
|2023-01-06|100      |35          |
|2023-01-07|100      |40          |
|2023-01-08|100      |45          |
+----------+---------+------------+



import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions._
data: Seq[org.apache.spark.sql.Row] = List([2023-01-01,100,10], [2023-01-02,100,15], [2023-01-03,100,20], [2023-01-04,100,25], [2023-01-05,100,30], [2023-01-06,100,35], [2023-01-07,100,40], [2023-01-08,100,45])
schema: org.apache.spark.sql.types.StructType = StructType(StructField(Date,StringType,true),StructField(ProductID,IntegerType,true),StructField(QuantitySold,IntegerType,true))
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = ParallelCollectionRDD[9] at parallelize at <console>:70
df: org.apache.spark.sql.DataFrame = [Date: date, ProductID: int ... 1 more field]


In [18]:
import org.apache.spark.sql.expressions._

// Trailing seven-calendar-day average of QuantitySold, computed per product.
//
// The frame is defined with rangeBetween over a numeric day index rather than
// rowsBetween: a row-based frame of (-6, 0) means "the previous six ROWS",
// which only equals "the previous six DAYS" when every date is present.
// With a range frame, a product that has missing dates still averages over
// exactly the current day and the six calendar days before it.
// For the first days of a partition the frame simply contains fewer rows, so
// the average is taken over whatever history exists.
{
  // Whole days since the Unix epoch; gives the window a numeric ordering key
  // so that an offset of 6 means six days.
  val dayNumber = datediff($"Date", to_date(lit("1970-01-01")))

  val trailingSevenDays = Window
    .partitionBy($"ProductID")
    .orderBy(dayNumber)
    .rangeBetween(-6, Window.currentRow)

  df.withColumn("seven_days_avg", avg($"QuantitySold").over(trailingSevenDays))
    .show(false)
}

+----------+---------+------------+--------------+
|Date      |ProductID|QuantitySold|seven_days_avg|
+----------+---------+------------+--------------+
|2023-01-01|100      |10          |10.0          |
|2023-01-02|100      |15          |12.5          |
|2023-01-03|100      |20          |15.0          |
|2023-01-04|100      |25          |17.5          |
|2023-01-05|100      |30          |20.0          |
|2023-01-06|100      |35          |22.5          |
|2023-01-07|100      |40          |25.0          |
|2023-01-08|100      |45          |30.0          |
+----------+---------+------------+--------------+



import org.apache.spark.sql.expressions._
