In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *
# Explicit namespace for Spark SQL functions, so calls such as F.sum / F.row_number
# cannot be confused with Python builtins shadowed by the star import above.
from pyspark.sql import functions as F

# Obtain a Spark session (getOrCreate) configured with the application name "PySparkPrac".
spark = (
    SparkSession.builder
    .appName("PySparkPrac")
    .getOrCreate()
)

In [6]:
# Sample product sales: one (category, product, sales) tuple per product.
columns = ["category", "product", "sales"]

data = [
    # Electronics
    ("Electronics", "Laptop", 1200),
    ("Electronics", "Phone", 900),
    ("Electronics", "Tablet", 700),
    # Clothing
    ("Clothing", "Shirt", 400),
    ("Clothing", "Jeans", 600),
    ("Clothing", "Jacket", 800),
]

# Column names come from `columns`; column types are inferred from the tuples.
df = spark.createDataFrame(data, schema=columns)

In [7]:
# Number the products inside each category from highest to lowest sales.
# `product` is a secondary sort key: row_number() assigns distinct numbers even
# to rows with equal `sales`, so without a tie-breaker the numbering of tied
# rows (and therefore any later "top N" filter) could differ between runs.
w = Window.partitionBy("category").orderBy(
    F.col("sales").desc(),
    F.col("product").asc(),
)

# Adds the helper column "rank" (1 = best seller in its category).
df = df.withColumn("rank", F.row_number().over(w))

In [8]:
# Show the two best-selling products of every category; the helper "rank"
# column is removed before display.
(
    df
    .where(df["rank"] <= 2)
    .drop("rank")
    .show()
)

+-----------+-------+-----+
|   category|product|sales|
+-----------+-------+-----+
|   Clothing| Jacket|  800|
|   Clothing|  Jeans|  600|
|Electronics| Laptop| 1200|
|Electronics|  Phone|  900|
+-----------+-------+-----+



In [9]:
# Sample transactions: one (user_id, txn_date, amount) tuple per transaction.
# Negative amounts are included alongside positive ones.
# NOTE: this rebinds `data`, `columns` and `df`, which earlier held the
# product-sales example.
columns = ["user_id", "txn_date", "amount"]

data = [
    # user 1
    (1, "2025-01-01", 500),
    (1, "2025-01-03", -200),
    (1, "2025-01-05", 300),
    # user 2
    (2, "2025-01-02", 1000),
    (2, "2025-01-04", -400),
]

# txn_date is passed as a plain string; no date parsing happens here.
df = spark.createDataFrame(data, schema=columns)



In [12]:
# Running balance per user: cumulative sum of `amount` over that user's
# transactions in txn_date order, from the first row up to the current row.
# txn_date is a string column; the sample values are zero-padded "YYYY-MM-DD",
# so sorting the strings also sorts them chronologically.
w = (
    Window.partitionBy("user_id")
    .orderBy("txn_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# F.sum is used explicitly: a bare `sum` would only resolve to Spark's
# aggregate because the star import shadows Python's builtin sum().
running_balance_df = df.withColumn("running_balance", F.sum("amount").over(w))
running_balance_df.show()

+-------+----------+------+---------------+
|user_id|  txn_date|amount|running_balance|
+-------+----------+------+---------------+
|      1|2025-01-01|   500|            500|
|      1|2025-01-03|  -200|            300|
|      1|2025-01-05|   300|            600|
|      2|2025-01-02|  1000|           1000|
|      2|2025-01-04|  -400|            600|
+-------+----------+------+---------------+

