In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window

In [2]:
# Reuse the Spark session already active in this kernel, or start one if none exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [5]:
# Load the input data. `inferSchema=True` asks Spark to derive column types
# from the file contents, so the numeric columns (year, month, size) are typed
# as numbers. Without it every CSV column is read as a string, and the later
# min/max, fillna and arithmetic steps would operate on text and depend on
# implicit casts.
df = spark.read.csv("test.csv", header=True, inferSchema=True)

In [6]:
# Preview the loaded rows: collect them into a pandas DataFrame so the
# notebook renders them as a table.
df.toPandas()

Unnamed: 0,market,year,month,size
0,A,2023,1,10
1,A,2022,1,8
2,A,2021,1,1
3,A,2019,1,4
4,A,2015,1,20


In [7]:
# Key columns used below for both grouping and window partitioning.
cols2consider = [
    "market",
    "month",
]

In [8]:
# Window over each key group, ordered with the most recent year first.
wspec = Window.partitionBy(*cols2consider).orderBy(F.desc("year"))

In [9]:
# First and last year present in each key group.
# Cast `year` to an integer *before* aggregating so min/max compare
# numerically even when the column arrives as a string; string comparison is
# lexicographic (e.g. "999" > "2023") and would yield wrong bounds.
year_as_int = F.col("year").cast(T.IntegerType())
agg = df.groupBy(cols2consider).agg(
    F.max(year_as_int).alias("ymax"),
    F.min(year_as_int).alias("ymin"),
)

In [10]:
# Make both year bounds integers so they can be fed to `sequence` below.
int_type = T.IntegerType()
agg = (
    agg
    .withColumn("ymin", F.col("ymin").cast(int_type))
    .withColumn("ymax", F.col("ymax").cast(int_type))
)

In [11]:
# Build the inclusive list ymin..ymax per group, then emit one row per year.
year_range = F.expr("sequence(ymin, ymax)")
agg = agg.withColumn("years", F.explode(year_range))

In [12]:
# Keep only the key columns plus the exploded year, exposed under the name "year".
agg = agg.select(*cols2consider, F.col("years").alias("year"))

In [13]:
# Preview the year grid: one row per key group and year between that group's
# minimum and maximum year.
agg.toPandas()

Unnamed: 0,market,month,year
0,A,1,2015
1,A,1,2016
2,A,1,2017
3,A,1,2018
4,A,1,2019
5,A,1,2020
6,A,1,2021
7,A,1,2022
8,A,1,2023


In [14]:
# Left-join the data onto the complete year grid: every grid row is kept, and
# years with no matching record get null in the columns coming from `df`.
join_keys = [*cols2consider, "year"]
df = agg.join(df, on=join_keys, how="left")

In [15]:
# Preview the joined frame; years missing from the input show an empty `size`.
df.toPandas()

Unnamed: 0,market,month,year,size
0,A,1,2015,20.0
1,A,1,2016,
2,A,1,2017,
3,A,1,2018,
4,A,1,2019,4.0
5,A,1,2020,
6,A,1,2021,1.0
7,A,1,2022,8.0
8,A,1,2023,10.0


In [16]:
# Treat years that had no record (null after the left join) as size 0.
# The per-column dict form is kept deliberately.
df = df.na.fill({"size": 0})

In [17]:
# `wspec` orders each group by year descending, so the following row holds the
# preceding calendar year. Take its size, falling back to 0 when there is no
# following row (the earliest year in the group).
prior_size = F.lead("size", offset=1, default=0).over(wspec)
df = df.withColumn("prev_year", prior_size)

In [19]:
# Year-over-year change as an absolute difference: this year's size minus the
# previous year's size. The result is displayed only, not assigned back to `df`.
yoy_growth = F.col("size") - F.col("prev_year")
df.withColumn("yoy_growth", yoy_growth).toPandas()

Unnamed: 0,market,month,year,size,prev_year,yoy_growth
0,A,1,2023,10,8,2.0
1,A,1,2022,8,1,7.0
2,A,1,2021,1,0,1.0
3,A,1,2020,0,4,-4.0
4,A,1,2019,4,0,4.0
5,A,1,2018,0,0,0.0
6,A,1,2017,0,0,0.0
7,A,1,2016,0,20,-20.0
8,A,1,2015,20,0,20.0
