In [1]:
// Write a query to get the total sales of the company and sales of the company for the current year.

// Expected O/P :
// ORDERID | ORDER_DATE | PRODUCT_PRICE | QUANTITY  | TOTAL_SALES | TOTAL_SALES_OF_YEAR
// --------|------------|---------------|-----------|-------------|-----------------------
// O1      | 2023-01-01 | 300           | 2         | 4600        | 3600
// O2      | 2022-01-01 | 200           | 5         | 4600        | 3600
// O3      | 2023-02-03 | 600           | 5         | 4600        | 3600


val df = Seq(
    ("O1","2023-01-01",300,2),
    ("O2","2022-01-01",200,5),
    ("O3","2023-02-03",600,5),
).toDF("ORDERID","ORDER_DATE","PRODUCT_PRICE","QUANTITY")

df.show(false)



Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.102:4040
SparkContext available as 'sc' (version = 3.4.1, master = local[*], app id = local-1705076977783)
SparkSession available as 'spark'


+-------+----------+-------------+--------+
|ORDERID|ORDER_DATE|PRODUCT_PRICE|QUANTITY|
+-------+----------+-------------+--------+
|O1     |2023-01-01|300          |2       |
|O2     |2022-01-01|200          |5       |
|O3     |2023-02-03|600          |5       |
+-------+----------+-------------+--------+



df: org.apache.spark.sql.DataFrame = [ORDERID: string, ORDER_DATE: string ... 2 more fields]


In [14]:
// SPARK SQL 
// TRICK is to use window function without anything in the over(), this way there's no need to use any col for agg unlike group by

df.createOrReplaceTempView("sales")

spark.sql("""
    with years as (
        select
            *,
            extract(year from to_date(ORDER_DATE)) as sale_year,
            year(CURDATE())-1 as current_year
        from sales
    )
    select 
        ORDERID,ORDER_DATE,PRODUCT_PRICE,QUANTITY,ORDER_DATE,
        sum(PRODUCT_PRICE * QUANTITY) over() as total_sales,
        sum(case when sale_year=current_year then product_price * quantity else 0 end) over() as total_sale_prev_year
    from years
""").show(false)

+-------+----------+-------------+--------+----------+-----------+--------------------+
|ORDERID|ORDER_DATE|PRODUCT_PRICE|QUANTITY|ORDER_DATE|total_sales|total_sale_prev_year|
+-------+----------+-------------+--------+----------+-----------+--------------------+
|O1     |2023-01-01|300          |2       |2023-01-01|4600       |3600                |
|O2     |2022-01-01|200          |5       |2022-01-01|4600       |3600                |
|O3     |2023-02-03|600          |5       |2023-02-03|4600       |3600                |
+-------+----------+-------------+--------+----------+-----------+--------------------+



In [33]:
import org.apache.spark.sql.expressions.Window

val window_spec = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

df.withColumn("ORDER_DATE", $"ORDER_DATE".cast("date")
  ).withColumn("total_sales", sum($"PRODUCT_PRICE" * $"QUANTITY").over(window_spec)
  ).withColumn("total_sale_prev_year", 
    sum(when(year($"ORDER_DATE") === year(current_date())-1, $"PRODUCT_PRICE" * $"QUANTITY"
            ).otherwise(lit("0"))).over(window_spec)).show(false)

+-------+----------+-------------+--------+-----------+--------------------+
|ORDERID|ORDER_DATE|PRODUCT_PRICE|QUANTITY|total_sales|total_sale_prev_year|
+-------+----------+-------------+--------+-----------+--------------------+
|O1     |2023-01-01|300          |2       |4600       |3600.0              |
|O2     |2022-01-01|200          |5       |4600       |3600.0              |
|O3     |2023-02-03|600          |5       |4600       |3600.0              |
+-------+----------+-------------+--------+-----------+--------------------+



import org.apache.spark.sql.expressions.Window
window_spec: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@1bbccca
