In [13]:
// For each product, find the difference between its latest month's sales and its first month's sales.

import org.apache.spark.sql.expressions._

// Sales rows as (id, brand, date in dd-MM-yyyy, sales).
// The rows are deliberately not in chronological order: each product's earliest
// entry (1995, 2006, 2010) sits in the middle of its 2023 rows.
val product_data = Seq[(Int, String, String, Int)](
  (2, "samsung", "01-01-1995", 11000),
  (1, "iphone",  "01-02-2023", 1300000),
  (2, "samsung", "01-02-2023", 1120000),
  (3, "oneplus", "01-02-2023", 1120000),
  (1, "iphone",  "01-03-2023", 1600000),
  (2, "samsung", "01-03-2023", 1080000),
  (3, "oneplus", "01-03-2023", 1160000),
  (1, "iphone",  "01-01-2006", 15000),
  (1, "iphone",  "01-04-2023", 1700000),
  (2, "samsung", "01-04-2023", 1800000),
  (3, "oneplus", "01-04-2023", 1170000),
  (1, "iphone",  "01-05-2023", 1200000),
  (2, "samsung", "01-05-2023", 980000),
  (3, "oneplus", "01-05-2023", 1175000),
  (1, "iphone",  "01-06-2023", 1100000),
  (3, "oneplus", "01-01-2010", 23000),
  (2, "samsung", "01-06-2023", 1100000),
  (3, "oneplus", "01-06-2023", 1200000)
).toDF("id", "brand", "date", "sales")

// Parse the dd-MM-yyyy strings and re-render them as yyyy-MM-dd, a form whose
// plain string ordering matches chronological ordering.
val formatted = {
  val parsedDate = to_date($"date", "dd-MM-yyyy")
  product_data.withColumn("date", date_format(parsedDate, "yyyy-MM-dd"))
}

// One partition per product id, ordered by date, with the frame widened to the
// whole partition so first()/last() see every row rather than only rows up to the current one.
val window = Window
  .partitionBy("id")
  .orderBy("date")
  .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

// val first_last = formatted.withColumn("first_val", first($"sales").over(Window.partitionBy($"id").orderBy($"date"))
//         ).withColumn("last_val", last($"sales").over(Window.partitionBy($"id").orderBy($"date")))

// first_last.show(false)
// first_last.explain()

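// When a window has an orderBy but no explicit frame, Spark uses the default frame
// specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$()),
// so last() only sees rows up to the current row instead of the whole partition.
// unboundedpreceding - the first row of the partition
// currentrow - the row currently being evaluated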


// Per product: (sales on the latest date) - (sales on the earliest date).
// Every row of a partition carries the same difference, so the rows are then
// collapsed to a single row per brand.
val first_last = {
  val earliestSales = first($"sales").over(window)
  val latestSales = last($"sales").over(window)

  formatted
    .withColumn("differ_sales", latestSales - earliestSales)
    .groupBy($"brand")
    .agg(first($"differ_sales").as("differ_sales"))
}

first_last.show(false)


+-------+------------+
|brand  |differ_sales|
+-------+------------+
|iphone |1085000     |
|samsung|1089000     |
|oneplus|1177000     |
+-------+------------+



import org.apache.spark.sql.expressions._
product_data: org.apache.spark.sql.DataFrame = [id: int, brand: string ... 2 more fields]
formatted: org.apache.spark.sql.DataFrame = [id: int, brand: string ... 2 more fields]
window: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@5f3fdee0
first_last: org.apache.spark.sql.DataFrame = [brand: string, differ_sales: int]


In [52]:

// Send a mail to employees who have not completed 8 hours in the office on a given day

import org.apache.spark.sql.types.TimestampType


// Recorded times per employee as (id, name, date in dd-MM-yyyy, time in HH:mm:ss).
// An employee can have several rows for the same date.
val emp_data = Seq[(Int, String, String, String)](
  (1, "manish", "11-07-2023", "10:20:00"),
  (1, "manish", "11-07-2023", "11:20:00"),
  (2, "rajesh", "11-07-2023", "11:20:00"),
  (1, "manish", "11-07-2023", "11:50:00"),
  (2, "rajesh", "11-07-2023", "13:20:00"),
  (1, "manish", "11-07-2023", "19:20:00"),
  (2, "rajesh", "11-07-2023", "17:20:00"),
  (1, "manish", "12-07-2023", "10:32:00"),
  (1, "manish", "12-07-2023", "12:20:00"),
  (3, "vikash", "12-07-2023", "09:12:00"),
  (1, "manish", "12-07-2023", "16:23:00"),
  (3, "vikash", "12-07-2023", "18:08:00")
).toDF("id", "name", "date", "time")

// Join the date and time strings with a single space and parse the result into
// a proper timestamp column named "ts".
val formatted = {
  val dateTimeText = concat($"date", lit(" "), $"time")
  emp_data.withColumn("ts", to_timestamp(dateTimeText, "dd-MM-yyyy HH:mm:ss"))
}

formatted.show(false)
formatted.printSchema()


// One partition per employee per day, covering every row of that day.
// The ordering must be by the timestamp "ts": "date" is constant inside an
// (id, date) partition, so ordering by it makes every row tie and lets
// first()/last() return an arbitrary row instead of the earliest/latest one.
val window = Window.partitionBy($"id",$"date").orderBy($"ts").rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing)


// Report every (employee, day) whose span between earliest and latest recorded
// time is under 8 hours.
//  - login/logout use min/max, which do not depend on how the window is ordered.
//  - The span is computed from elapsed seconds, so minutes count: 09:59 -> 17:01
//    is 7.03 hours, whereas subtracting hour-of-day values would report 8.
//  - Grouping includes the date so an employee with several short days gets one
//    row per day instead of a single arbitrary one.
formatted
  .withColumn("login", min($"ts").over(window))
  .withColumn("logout", max($"ts").over(window))
  .withColumn("hours_per_day", (unix_timestamp($"logout") - unix_timestamp($"login")) / 3600)
  .filter($"hours_per_day" < 8)
  .groupBy($"name", $"date")
  .agg(first($"hours_per_day").as("hours_less_than_8"))
  .show(false)



+---+------+----------+--------+-------------------+
|id |name  |date      |time    |ts                 |
+---+------+----------+--------+-------------------+
|1  |manish|11-07-2023|10:20:00|2023-07-11 10:20:00|
|1  |manish|11-07-2023|11:20:00|2023-07-11 11:20:00|
|2  |rajesh|11-07-2023|11:20:00|2023-07-11 11:20:00|
|1  |manish|11-07-2023|11:50:00|2023-07-11 11:50:00|
|2  |rajesh|11-07-2023|13:20:00|2023-07-11 13:20:00|
|1  |manish|11-07-2023|19:20:00|2023-07-11 19:20:00|
|2  |rajesh|11-07-2023|17:20:00|2023-07-11 17:20:00|
|1  |manish|12-07-2023|10:32:00|2023-07-12 10:32:00|
|1  |manish|12-07-2023|12:20:00|2023-07-12 12:20:00|
|3  |vikash|12-07-2023|09:12:00|2023-07-12 09:12:00|
|1  |manish|12-07-2023|16:23:00|2023-07-12 16:23:00|
|3  |vikash|12-07-2023|18:08:00|2023-07-12 18:08:00|
+---+------+----------+--------+-------------------+

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- ts: timestamp (nullable = true)

import org.apache.spark.sql.types.TimestampType
emp_data: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
formatted: org.apache.spark.sql.DataFrame = [id: int, name: string ... 3 more fields]
window: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@72b01c5b


In [60]:

// For each brand, compute the rolling average of sales over the last 3 months (the current month and the two before it)

// Monthly sales rows as (id, brand, date in dd-MM-yyyy, sale):
// three brands, one row per brand per month from 01-01-2023 to 01-06-2023.
val product_data = Seq[(Int, String, String, Int)](
  (1, "iphone",  "01-01-2023", 1500000),
  (2, "samsung", "01-01-2023", 1100000),
  (3, "oneplus", "01-01-2023", 1100000),
  (1, "iphone",  "01-02-2023", 1300000),
  (2, "samsung", "01-02-2023", 1120000),
  (3, "oneplus", "01-02-2023", 1120000),
  (1, "iphone",  "01-03-2023", 1600000),
  (2, "samsung", "01-03-2023", 1080000),
  (3, "oneplus", "01-03-2023", 1160000),
  (1, "iphone",  "01-04-2023", 1700000),
  (2, "samsung", "01-04-2023", 1800000),
  (3, "oneplus", "01-04-2023", 1170000),
  (1, "iphone",  "01-05-2023", 1200000),
  (2, "samsung", "01-05-2023", 980000),
  (3, "oneplus", "01-05-2023", 1175000),
  (1, "iphone",  "01-06-2023", 1100000),
  (2, "samsung", "01-06-2023", 1100000),
  (3, "oneplus", "01-06-2023", 1200000)
).toDF("id", "brand", "date", "sale")

// Parse the dd-MM-yyyy strings and re-render them as yyyy-MM-dd, a form whose
// plain string ordering matches chronological ordering.
val formatted = {
  val parsedDate = to_date($"date", "dd-MM-yyyy")
  product_data.withColumn("date", date_format(parsedDate, "yyyy-MM-dd"))
}

// Trailing three-row frame per brand: the current row plus the two rows before it in date order.
val window = Window
  .partitionBy("brand")
  .orderBy("date")
  .rowsBetween(-2, Window.currentRow)

// Attach the trailing three-row average to every row, then keep only rows from
// the third one onward in each brand: the first two rows have fewer than three
// rows in their frame. "row_num" is a helper column and is removed before display.
formatted
  .withColumn("avg_3mo_sales", avg($"sale").over(window))
  .withColumn("row_num", row_number().over(Window.partitionBy("brand").orderBy("date")))
  .filter($"row_num" > 2)
  .drop("row_num")
  .show(false)


+---+-------+----------+-------+------------------+
|id |brand  |date      |sale   |avg_3mo_sales     |
+---+-------+----------+-------+------------------+
|1  |iphone |2023-03-01|1600000|1466666.6666666667|
|1  |iphone |2023-04-01|1700000|1533333.3333333333|
|1  |iphone |2023-05-01|1200000|1500000.0         |
|1  |iphone |2023-06-01|1100000|1333333.3333333333|
|3  |oneplus|2023-03-01|1160000|1126666.6666666667|
|3  |oneplus|2023-04-01|1170000|1150000.0         |
|3  |oneplus|2023-05-01|1175000|1168333.3333333333|
|3  |oneplus|2023-06-01|1200000|1181666.6666666667|
|2  |samsung|2023-03-01|1080000|1100000.0         |
|2  |samsung|2023-04-01|1800000|1333333.3333333333|
|2  |samsung|2023-05-01|980000 |1286666.6666666667|
|2  |samsung|2023-06-01|1100000|1293333.3333333333|
+---+-------+----------+-------+------------------+



product_data: org.apache.spark.sql.DataFrame = [id: int, brand: string ... 2 more fields]
formatted: org.apache.spark.sql.DataFrame = [id: int, brand: string ... 2 more fields]
window: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@756071af
