In [0]:
"""
Swiggy

Write a SQL query to calculate the number of delayed orders for each delivery partner. An order is considered
delayed if the actual delivery time (deliver_time - order_time) exceeds the predicted delivery time (predicted_time, in minutes).
NOTE: If a partner has no delayed orders, display 0.

Input
+---------+--------+-----------+-------------+---------------------+---------------------+----------------+
| orderid | custid | city      | del_partner | order_time          | deliver_time        | predicted_time |
+---------+--------+-----------+-------------+---------------------+---------------------+----------------+
|       1 |    101 | Mumbai    | Partner A   | 2024-12-18 10:00:00 | 2024-12-18 11:30:00 |             60 |
|       2 |    102 | Delhi     | Partner A   | 2024-12-18 09:00:00 | 2024-12-18 10:00:00 |             45 |
|       3 |    103 | Pune      | Partner A   | 2024-12-18 15:00:00 | 2024-12-18 15:30:00 |             30 |
|       4 |    104 | Mumbai    | Partner A   | 2024-12-18 14:00:00 | 2024-12-18 14:50:00 |             45 |
|       5 |    105 | Bangalore | Partner B   | 2024-12-18 08:00:00 | 2024-12-18 08:29:00 |             30 |
|       6 |    106 | Hyderabad | Partner B   | 2024-12-18 13:00:00 | 2024-12-18 14:00:00 |             70 |
|       7 |    107 | Kolkata   | Partner B   | 2024-12-18 10:00:00 | 2024-12-18 10:40:00 |             45 |
|       8 |    108 | Delhi     | Partner B   | 2024-12-18 18:00:00 | 2024-12-18 18:30:00 |             40 |
|       9 |    109 | Chennai   | Partner C   | 2024-12-18 07:00:00 | 2024-12-18 07:40:00 |             30 |
|      10 |    110 | Mumbai    | Partner C   | 2024-12-18 12:00:00 | 2024-12-18 13:00:00 |             50 |
|      11 |    111 | Delhi     | Partner C   | 2024-12-18 09:00:00 | 2024-12-18 09:35:00 |             30 |
|      12 |    112 | Hyderabad | Partner C   | 2024-12-18 16:00:00 | 2024-12-18 16:45:00 |             30 |
+---------+--------+-----------+-------------+---------------------+---------------------+----------------+

Output
+-------------+----------------+
| del_partner | delayed_orders |
+-------------+----------------+
| Partner A   |              3 |
| Partner B   |              0 |
| Partner C   |              4 |
+-------------+----------------+
"""

from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# The two timestamp columns are declared as strings so the literal rows below can be
# typed as plain text; they are cast to TimestampType once the frame has been built.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("orderid", IntegerType()),
        ("custid", IntegerType()),
        ("city", StringType()),
        ("del_partner", StringType()),
        ("order_time", StringType()),
        ("deliver_time", StringType()),
        ("predicted_time", IntegerType()),
    )
])

# Columns: orderid, custid, city, del_partner, order_time, deliver_time, predicted_time
data = [
    (1, 101, "Mumbai", "Partner A", "2024-12-18 10:00:00", "2024-12-18 11:30:00", 60),
    (2, 102, "Delhi", "Partner A", "2024-12-18 09:00:00", "2024-12-18 10:00:00", 45),
    (3, 103, "Pune", "Partner A", "2024-12-18 15:00:00", "2024-12-18 15:30:00", 30),
    (4, 104, "Mumbai", "Partner A", "2024-12-18 14:00:00", "2024-12-18 14:50:00", 45),
    (5, 105, "Bangalore", "Partner B", "2024-12-18 08:00:00", "2024-12-18 08:29:00", 30),
    (6, 106, "Hyderabad", "Partner B", "2024-12-18 13:00:00", "2024-12-18 14:00:00", 70),
    (7, 107, "Kolkata", "Partner B", "2024-12-18 10:00:00", "2024-12-18 10:40:00", 45),
    (8, 108, "Delhi", "Partner B", "2024-12-18 18:00:00", "2024-12-18 18:30:00", 40),
    (9, 109, "Chennai", "Partner C", "2024-12-18 07:00:00", "2024-12-18 07:40:00", 30),
    (10, 110, "Mumbai", "Partner C", "2024-12-18 12:00:00", "2024-12-18 13:00:00", 50),
    (11, 111, "Delhi", "Partner C", "2024-12-18 09:00:00", "2024-12-18 09:35:00", 30),
    (12, 112, "Hyderabad", "Partner C", "2024-12-18 16:00:00", "2024-12-18 16:45:00", 30),
]

# Build the frame from the text rows, then turn both time columns into real timestamps.
swiggy_orders_df = (
    spark.createDataFrame(data, schema)
    .withColumn("order_time", col("order_time").cast(TimestampType()))
    .withColumn("deliver_time", col("deliver_time").cast(TimestampType()))
)

swiggy_orders_df.show()

+-------+------+---------+-----------+-------------------+-------------------+--------------+
|orderid|custid|     city|del_partner|         order_time|       deliver_time|predicted_time|
+-------+------+---------+-----------+-------------------+-------------------+--------------+
|      1|   101|   Mumbai|  Partner A|2024-12-18 10:00:00|2024-12-18 11:30:00|            60|
|      2|   102|    Delhi|  Partner A|2024-12-18 09:00:00|2024-12-18 10:00:00|            45|
|      3|   103|     Pune|  Partner A|2024-12-18 15:00:00|2024-12-18 15:30:00|            30|
|      4|   104|   Mumbai|  Partner A|2024-12-18 14:00:00|2024-12-18 14:50:00|            45|
|      5|   105|Bangalore|  Partner B|2024-12-18 08:00:00|2024-12-18 08:29:00|            30|
|      6|   106|Hyderabad|  Partner B|2024-12-18 13:00:00|2024-12-18 14:00:00|            70|
|      7|   107|  Kolkata|  Partner B|2024-12-18 10:00:00|2024-12-18 10:40:00|            45|
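|      8|   108|    Delhi|  Partner B|2024-12-18 18:00:00|2024-12-18 18:30:00|            40|
|      9|   109|  Chennai|  Partner C|2024-12-18 07:00:00|2024-12-18 07:40:00|            30|
|     10|   110|   Mumbai|  Partner C|2024-12-18 12:00:00|2024-12-18 13:00:00|            50|
|     11|   111|    Delhi|  Partner C|2024-12-18 09:00:00|2024-12-18 09:35:00|            30|
|     12|   112|Hyderabad|  Partner C|2024-12-18 16:00:00|2024-12-18 16:45:00|            30|
+-------+------+---------+-----------+-------------------+-------------------+--------------+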

### SPARK SQL       

In [0]:
swiggy_orders_df.createOrReplaceTempView("swiggy_orders")

# An order is delayed when (deliver_time - order_time) is strictly longer than
# predicted_time minutes. The elapsed time is measured in SECONDS and compared with
# predicted_time * 60: timestampdiff(minute, ...) drops the fractional minute, so a
# delivery that is e.g. 45m30s against a 45-minute prediction would not be counted.
#
# NOTE(review): the problem statement names the result column `delayed_orders`; the
# alias `delayed_partners` is kept here so the query matches the recorded cell output.

# Approach 1: count delayed orders per partner, then left-join onto the full list of
# partners so that partners without any delayed order still appear with 0.
spark.sql("""
          select
            so1.del_partner, coalesce(so2.delayed_partners, 0) as delayed_partners
          from
          (
              select distinct del_partner from swiggy_orders
          ) so1 left join
          (
            select
                del_partner, count(*) as delayed_partners
            from swiggy_orders
            where timestampdiff(second, order_time, deliver_time) > predicted_time * 60
            group by del_partner
          ) so2
          on so1.del_partner = so2.del_partner
          order by so1.del_partner
        """).show()

# Approach 2: single pass with conditional aggregation; every partner is in the
# group-by, so a partner with no delayed orders naturally sums to 0.
spark.sql(
    """
    select
        del_partner,
        sum(case when timestampdiff(second, order_time, deliver_time) > predicted_time * 60 then 1 else 0 end) as delayed_partners
    from swiggy_orders
    group by del_partner
    order by del_partner
    """
).show()


+-----------+----------------+
|del_partner|delayed_partners|
+-----------+----------------+
|  Partner A|               3|
|  Partner B|               0|
|  Partner C|               4|
+-----------+----------------+

+-----------+----------------+
|del_partner|delayed_partners|
+-----------+----------------+
|  Partner A|               3|
|  Partner B|               0|
|  Partner C|               4|
+-----------+----------------+



### DF API        

In [0]:
# Namespaced import instead of `import *`, which would shadow Python builtins
# such as sum/min/max/round for the rest of the notebook.
from pyspark.sql import functions as F

# Every partner present in the data, whether or not any of their orders were delayed.
all_partners_df = swiggy_orders_df.select("del_partner").distinct()

# Number of delayed orders per partner (partners with none are absent from this frame).
# Elapsed time is taken in SECONDS and compared with predicted_time * 60 because a
# MINUTE difference drops the fractional minute and would miss e.g. 45m30s vs 45.
delayed_partners_df = (
    swiggy_orders_df
    .filter(
        F.timestamp_diff("SECOND", F.col("order_time"), F.col("deliver_time"))
        > F.col("predicted_time") * 60
    )
    .groupBy("del_partner")
    .agg(F.count("*").alias("delayed_partners"))
)

# Left join on the column name (yields a single del_partner column), fill the
# missing counts with 0 and sort so the displayed order is deterministic.
(
    all_partners_df
    .join(delayed_partners_df, on="del_partner", how="left")
    .withColumn("delayed_partners", F.coalesce(F.col("delayed_partners"), F.lit(0)))
    .select("del_partner", "delayed_partners")
    .orderBy("del_partner")
    .show()
)


+-----------+----------------+
|del_partner|delayed_partners|
+-----------+----------------+
|  Partner A|               3|
|  Partner B|               0|
|  Partner C|               4|
+-----------+----------------+



In [0]:
"""
On which date did product 4 record its 3rd-highest total daily sales in terms of value?
(sale: qty_sold; value: qty_sold * price_of_product, given in the order_value column)
+------------+-------------+----------+------------+------+-------------+
| order_date | customer_id | store_id | product_id | sale | order_value |
+------------+-------------+----------+------------+------+-------------+
| 2024-12-01 |         109 |        1 |          3 |    2 |         700 |
| 2024-12-02 |         110 |        2 |          2 |    1 |         300 |
| 2024-12-03 |         111 |        1 |          5 |    3 |         900 |
| 2024-12-04 |         112 |        3 |          1 |    2 |         500 |
| 2024-12-05 |         113 |        3 |          4 |    4 |        1200 |
| 2024-12-05 |         114 |        3 |          4 |    2 |         400 |
| 2024-12-05 |         115 |        3 |          4 |    1 |         300 |
| 2024-12-01 |         101 |        1 |          4 |    2 |         500 |
| 2024-12-01 |         102 |        1 |          4 |    1 |         300 |
| 2024-12-02 |         103 |        2 |          4 |    3 |         900 |
| 2024-12-02 |         104 |        2 |          4 |    1 |         400 |
| 2024-12-03 |         105 |        1 |          4 |    2 |         600 |
| 2024-12-03 |         106 |        1 |          4 |    3 |         800 |
| 2024-12-04 |         107 |        3 |          4 |    1 |         200 |
| 2024-12-04 |         108 |        3 |          4 |    2 |         500 |
+------------+-------------+----------+------------+------+-------------+

+------------+-------------+
| order_date | daily_sales |
+------------+-------------+
| 2024-12-02 |        1300 |
+------------+-------------+
"""

from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# order_date is declared as a string so the literal rows below can be typed as plain
# text; it is cast to DateType once the frame has been built.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_date", StringType()),
        ("customer_id", IntegerType()),
        ("store_id", IntegerType()),
        ("product_id", IntegerType()),
        ("sale", IntegerType()),
        ("order_value", IntegerType()),
    )
])

# Columns: order_date, customer_id, store_id, product_id, sale, order_value
data = [
    ("2024-12-01", 109, 1, 3, 2, 700),
    ("2024-12-02", 110, 2, 2, 1, 300),
    ("2024-12-03", 111, 1, 5, 3, 900),
    ("2024-12-04", 112, 3, 1, 2, 500),
    ("2024-12-05", 113, 3, 4, 4, 1200),
    ("2024-12-05", 114, 3, 4, 2, 400),
    ("2024-12-05", 115, 3, 4, 1, 300),
    ("2024-12-01", 101, 1, 4, 2, 500),
    ("2024-12-01", 102, 1, 4, 1, 300),
    ("2024-12-02", 103, 2, 4, 3, 900),
    ("2024-12-02", 104, 2, 4, 1, 400),
    ("2024-12-03", 105, 1, 4, 2, 600),
    ("2024-12-03", 106, 1, 4, 3, 800),
    ("2024-12-04", 107, 3, 4, 1, 200),
    ("2024-12-04", 108, 3, 4, 2, 500),
]

# Build the frame from the text rows, then turn order_date into a real date column.
sales_data_df = (
    spark.createDataFrame(data, schema)
    .withColumn("order_date", col("order_date").cast(DateType()))
)

sales_data_df.show()

+----------+-----------+--------+----------+----+-----------+
|order_date|customer_id|store_id|product_id|sale|order_value|
+----------+-----------+--------+----------+----+-----------+
|2024-12-01|        109|       1|         3|   2|        700|
|2024-12-02|        110|       2|         2|   1|        300|
|2024-12-03|        111|       1|         5|   3|        900|
|2024-12-04|        112|       3|         1|   2|        500|
|2024-12-05|        113|       3|         4|   4|       1200|
|2024-12-05|        114|       3|         4|   2|        400|
|2024-12-05|        115|       3|         4|   1|        300|
|2024-12-01|        101|       1|         4|   2|        500|
|2024-12-01|        102|       1|         4|   1|        300|
|2024-12-02|        103|       2|         4|   3|        900|
|2024-12-02|        104|       2|         4|   1|        400|
|2024-12-03|        105|       1|         4|   2|        600|
|2024-12-03|        106|       1|         4|   3|        800|
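|2024-12-04|        107|       3|         4|   1|        200|
|2024-12-04|        108|       3|         4|   2|        500|
+----------+-----------+--------+----------+----+-----------+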

### SPARK SQL


In [0]:
sales_data_df.createOrReplaceTempView("sales_data")

# Step 1 (daily_totals): total order_value of product 4 per order_date.
# Step 2 (ranked_days): rank the days by that total, highest first.
# dense_rank() is used instead of row_number(): when two days tie on daily_sales,
# row_number() numbers them arbitrarily, so "row 3" may not hold the 3rd-highest
# value and can change between runs. dense_rank() gives tied days the same rank,
# so rank 3 is always the 3rd-highest distinct total (all tied dates are returned).
spark.sql("""
          with daily_totals as (
            select
              order_date, sum(order_value) as daily_sales
            from sales_data
            where product_id = 4
            group by order_date
          ), ranked_days as (
            select
              *,
              dense_rank() over (order by daily_sales desc) as sales_rank
            from daily_totals
          )
          select
            order_date, daily_sales
          from ranked_days
          where sales_rank = 3
          order by order_date
          """).show()

+----------+-----------+
|order_date|daily_sales|
+----------+-----------+
|2024-12-02|       1300|
+----------+-----------+



### DF API

In [0]:
# Namespaced imports instead of `import *`, which would replace the Python builtin
# `sum` (and others) with the Spark column functions for the rest of the notebook.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Un-partitioned window over the per-day totals, highest total first.
by_daily_sales_desc = Window.orderBy(F.col("daily_sales").desc())

# Total product-4 value per day, ranked by total. dense_rank() (not row_number())
# so that days with equal totals share a rank and rank 3 is always the 3rd-highest
# distinct total, independent of how ties happen to be ordered.
(
    sales_data_df
    .filter(F.col("product_id") == 4)
    .groupBy("order_date")
    .agg(F.sum("order_value").alias("daily_sales"))
    .withColumn("sales_rank", F.dense_rank().over(by_daily_sales_desc))
    .filter(F.col("sales_rank") == 3)
    .select("order_date", "daily_sales")
    .orderBy("order_date")
    .show()
)

+----------+-----------+
|order_date|daily_sales|
+----------+-----------+
|2024-12-02|       1300|
+----------+-----------+

