In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
# Create the Spark session, or reuse one that already exists (getOrCreate).
spark = (
    SparkSession.builder
    .appName("PySparkPrac")
    .getOrCreate()
)

25/09/02 17:13:09 WARN Utils: Your hostname, neosoft-Latitude-5420 resolves to a loopback address: 127.0.1.1; using 10.0.61.246 instead (on interface wlp0s20f3)
25/09/02 17:13:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/02 17:13:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Sample daily sales rows: (product, sale_date, sales). Dates start out as ISO strings.
data = [
    ("Laptop", "2025-01-01", 100),
    ("Laptop", "2025-01-02", 250),
    ("Laptop", "2025-01-03", 200),
    ("Phone", "2025-01-01", 300),
    ("Phone", "2025-01-02", 150),
    ("Phone", "2025-01-03", 400),
]

columns = ["product", "sale_date", "sales"]

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data=data, schema=columns)

In [7]:
# DataFrames are immutable: withColumn returns a NEW DataFrame. The result must
# be bound back to `df`, otherwise the string -> date conversion is discarded
# and every later cell keeps working with sale_date as a string.
df = df.withColumn("sale_date", to_date("sale_date"))
# Bare expression so the cell still displays the resulting schema.
df

DataFrame[product: string, sale_date: date, sales: bigint]

In [8]:
# Preview the sample sales rows (explicitly using show's default limit and truncation).
df.show(n=20, truncate=True)

+-------+----------+-----+
|product| sale_date|sales|
+-------+----------+-----+
| Laptop|2025-01-01|  100|
| Laptop|2025-01-02|  250|
| Laptop|2025-01-03|  200|
|  Phone|2025-01-01|  300|
|  Phone|2025-01-02|  150|
|  Phone|2025-01-03|  400|
+-------+----------+-----+



In [9]:
# Per-product window ordered by sales, highest first, so rank 1 marks the peak.
w = (
    Window
    .partitionBy("product")
    .orderBy(desc("sales"))
)

In [11]:
# dense_rank assigns tied sales values the same rank, so if several days share
# a product's top sales figure they all receive rn == 1.
ranked = df.select("*", dense_rank().over(w).alias("rn"))

In [17]:
# Keep only the top-ranked row(s) per product and present the date as the peak sale date.
(
    ranked
    .where(col("rn") == 1)
    .select(
        "product",
        col("sale_date").alias("peak_sale_date"),
        "sales",
    )
    .show()
)

+-------+--------------+-----+
|product|peak_sale_date|sales|
+-------+--------------+-----+
| Laptop|    2025-01-02|  250|
|  Phone|    2025-01-03|  400|
+-------+--------------+-----+



In [19]:
# Second sample set: (product_id, sale_date, sales) for the sales-drop exercise.
data = [
    ("P1", "2025-01-01", 100),
    ("P1", "2025-01-02", 120),
    ("P1", "2025-01-03", 90),
    ("P2", "2025-01-01", 200),
    ("P2", "2025-01-02", 180),
    ("P2", "2025-01-03", 220),
]

columns = ["product_id", "sale_date", "sales"]

# Build the frame and convert the ISO date strings to a real date column in one expression.
daily_product_sales = (
    spark.createDataFrame(data=data, schema=columns)
    .withColumn("sale_date", to_date("sale_date"))
)

In [20]:
# Preview the per-product daily sales (explicitly using show's default limit and truncation).
daily_product_sales.show(n=20, truncate=True)

+----------+----------+-----+
|product_id| sale_date|sales|
+----------+----------+-----+
|        P1|2025-01-01|  100|
|        P1|2025-01-02|  120|
|        P1|2025-01-03|   90|
|        P2|2025-01-01|  200|
|        P2|2025-01-02|  180|
|        P2|2025-01-03|  220|
+----------+----------+-----+



In [21]:
# Register the DataFrame as a temporary view so it can be referenced by name
# in spark.sql queries; an existing view with the same name is replaced.
daily_product_sales.createOrReplaceTempView("daily_product_sales")

In [30]:
# Find the days on which a product sold less than on its previous recorded day.
# The inner query uses lag(sales) over each product's rows ordered by sale_date
# to fetch the prior row's sales as previous_sale; the outer filter keeps only
# rows where sales dropped. The first row per product has no prior row, so its
# previous_sale is NULL and the comparison filters it out.
sales_drop = spark.sql('''
    select product_id, sale_date, sales, previous_sale from (
        select product_id, sale_date, sales, 
        lag(sales) over(partition by product_id order by sale_date) as previous_sale 
        from daily_product_sales) where sales < previous_sale
''')
sales_drop.show()

+----------+----------+-----+-------------+
|product_id| sale_date|sales|previous_sale|
+----------+----------+-----+-------------+
|        P1|2025-01-03|   90|          120|
|        P2|2025-01-02|  180|          200|
+----------+----------+-----+-------------+

