In [None]:
# PySpark – Calculate Rolling 3-Day Average of Sales
# Problem Statement
# You have a PySpark DataFrame containing daily sales data. Write a PySpark program that, for each date, calculates the rolling 3-day average of sales (the average over that date and the two preceding days), with rows ordered by the date column.

# Sample Input (daily_sales)
# sale_date	sales
# 2025-01-01	100
# 2025-01-02	200
# 2025-01-03	300
# 2025-01-04	400
# 2025-01-05	500


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
# Reuse the active SparkSession if there is one; otherwise start one
# registered under the application name "PySparkPrac".
spark = (
    SparkSession.builder
    .appName("PySparkPrac")
    .getOrCreate()
)

25/08/29 16:49:13 WARN Utils: Your hostname, neosoft-Latitude-5420 resolves to a loopback address: 127.0.1.1; using 10.0.61.246 instead (on interface wlp0s20f3)
25/08/29 16:49:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/29 16:49:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/29 16:49:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Column names for the sample rows below, in tuple order.
columns = ["sale_date", "sales"]

# One (sale_date, sales) tuple per day; dates are still ISO-formatted strings here.
data = [
    ("2025-01-01", 100),
    ("2025-01-02", 200),
    ("2025-01-03", 300),
    ("2025-01-04", 400),
    ("2025-01-05", 500),
]

In [4]:
# Build the daily sales DataFrame from the sample tuples, naming columns from `columns`.
sales = spark.createDataFrame(data=data, schema=columns)

In [5]:
# Replace the string sale_date column with a parsed date column of the same name.
sales = sales.withColumn(
    colName="sale_date",
    col=to_date("sale_date"),
)

In [6]:
# Print the sales rows (show()'s default limits, spelled out).
sales.show(n=20, truncate=True)

                                                                                

+----------+-----+
| sale_date|sales|
+----------+-----+
|2025-01-01|  100|
|2025-01-02|  200|
|2025-01-03|  300|
|2025-01-04|  400|
|2025-01-05|  500|
+----------+-----+



In [8]:
# Rolling 3-day average: each frame is the current row plus the two rows
# before it, in sale_date order.
# - The frame counts rows, not calendar days, so it only equals a 3-day
#   window when there is exactly one row per consecutive date.
# - The first two dates have fewer than three rows available, so their
#   average covers only the rows that exist.
# - No partitionBy is given, so Spark evaluates the window on a single
#   partition (and logs a warning saying so).
# Window.currentRow is used for the frame end instead of a bare 0, as the
# PySpark documentation recommends for special frame boundaries.
w = Window.orderBy("sale_date").rowsBetween(-2, Window.currentRow)
sales = sales.withColumn("rolling_3_days", avg("sales").over(w))

In [9]:
# Print the sales rows together with the rolling_3_days column (show()'s default limits, spelled out).
sales.show(n=20, truncate=True)

25/08/29 16:59:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 16:59:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 16:59:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 16:59:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 16:59:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+-----+--------------+
| sale_date|sales|rolling_3_days|
+----------+-----+--------------+
|2025-01-01|  100|         100.0|
|2025-01-02|  200|         150.0|
|2025-01-03|  300|         200.0|
|2025-01-04|  400|         300.0|
|2025-01-05|  500|         400.0|
+----------+-----+--------------+



In [None]:
# SQL – Find Customers with Increasing Purchase Amounts
# Problem Statement
# You have a SQL table purchases(customer_id, purchase_date, amount). 
# Write a query to find customers whose purchase amounts strictly increased with each new purchase date.

# Sample Input (purchases)
# customer_id	purchase_date	amount
# C1	2025-01-01	100
# C1	2025-01-05	200
# C1	2025-01-10	300
# C2	2025-01-02	150
# C2	2025-01-06	120
# C3	2025-01-03	200
# C3	2025-01-09	250

In [11]:
# Column names for the sample purchase rows, in tuple order.
columns = ["customer_id", "purchase_date", "amount"]

# One (customer_id, purchase_date, amount) tuple per purchase; dates are ISO strings.
data = [
    ("C1", "2025-01-01", 100),
    ("C1", "2025-01-05", 200),
    ("C1", "2025-01-10", 300),
    ("C2", "2025-01-02", 150),
    ("C2", "2025-01-06", 120),
    ("C3", "2025-01-03", 200),
    ("C3", "2025-01-09", 250),
]

# Build the DataFrame, then swap the string purchase_date for a parsed date column.
purchases = spark.createDataFrame(data=data, schema=columns).withColumn(
    colName="purchase_date",
    col=to_date("purchase_date"),
)

In [13]:
# Print the purchase rows (show()'s default limits, spelled out).
purchases.show(n=20, truncate=True)

+-----------+-------------+------+
|customer_id|purchase_date|amount|
+-----------+-------------+------+
|         C1|   2025-01-01|   100|
|         C1|   2025-01-05|   200|
|         C1|   2025-01-10|   300|
|         C2|   2025-01-02|   150|
|         C2|   2025-01-06|   120|
|         C3|   2025-01-03|   200|
|         C3|   2025-01-09|   250|
+-----------+-------------+------+



In [40]:
# Expose the DataFrame to spark.sql() under the table name "purchases".
purchases.createOrReplaceTempView(name="purchases")

In [33]:
# Exploration step: show every purchase next to the same customer's previous
# amount (by purchase_date). A customer's first purchase has no predecessor,
# so its prev_amount is NULL.
lag_preview_sql = '''select 
        customer_id,
        purchase_date,
        amount, 
        lag(amount) over(partition by customer_id order by purchase_date)
    as prev_amount 
    from purchases'''
r = spark.sql(lag_preview_sql)
r.show(n=20, truncate=True)

+-----------+-------------+------+-----------+
|customer_id|purchase_date|amount|prev_amount|
+-----------+-------------+------+-----------+
|         C1|   2025-01-01|   100|       NULL|
|         C1|   2025-01-05|   200|        100|
|         C1|   2025-01-10|   300|        200|
|         C2|   2025-01-02|   150|       NULL|
|         C2|   2025-01-06|   120|        150|
|         C3|   2025-01-03|   200|       NULL|
|         C3|   2025-01-09|   250|        200|
+-----------+-------------+------+-----------+



In [48]:
# Customers whose amount strictly increased on every successive purchase date.
#   1. The CTE pairs each purchase with the same customer's previous amount
#      (ordered by purchase_date).
#   2. The outer query counts, per customer, the purchases that did NOT exceed
#      the previous amount and keeps only customers where that count is zero.
# A customer's first purchase has a NULL prev_amount and is never counted as a
# violation, so a customer with a single purchase also passes the filter.
# The statement deliberately has no trailing ';': spark.sql() runs a single
# statement and the Spark 2.x parser raises a ParseException on the terminator.
result = spark.sql('''
with sale_lag_amount as (
  select
      customer_id,
      purchase_date,
      amount,
      lag(amount) over(partition by customer_id order by purchase_date) AS prev_amount
  from purchases
)
select customer_id
from sale_lag_amount
group by customer_id
having count(case when prev_amount is not null and amount <= prev_amount then 1 end) = 0
''')

In [49]:
# Print the qualifying customer_ids (show()'s default limits, spelled out).
result.show(n=20, truncate=True)

+-----------+
|customer_id|
+-----------+
|         C1|
|         C3|
+-----------+

