In [0]:
"""
1. Find out the difference in sales of each product from their first month sales to latest sales.
2. Send a mail to every employee who did not complete 8 hours in office on a day they came to office.
3. Find out the performance of the sales based on last 3 months average.
"""

"""
+----------+------------+----------+-------+
|product_id|product_name|sales_date|  sales|
+----------+------------+----------+-------+
|         1|      iphone|01-01-2023|1500000| 1 -2 | unbounded preceding
|         2|     samsung|01-01-2023|1100000| 2 -1 |
|         3|     oneplus|01-01-2023|1100000| 3 0  || current row
|         1|      iphone|01-02-2023|1300000| 4 1  |
|         2|     samsung|01-02-2023|1120000| 5 2  | unbounded following
|         3|     oneplus|01-02-2023|1120000| 6 3  |
|         1|      iphone|01-03-2023|1600000|
|         2|     samsung|01-03-2023|1080000| with orderBy, the default frame is rangeBetween unbounded preceding and current row;
|         3|     oneplus|01-03-2023|1160000| this is why last() does not return the partition's final value in a window.
|         1|      iphone|01-04-2023|1700000| Use rowsBetween(unbounded preceding, unbounded following) so that the entire
|         2|     samsung|01-04-2023|1800000| window is considered when calculating last.
|         3|     oneplus|01-04-2023|1170000|
|         1|      iphone|01-05-2023|1200000|
|         2|     samsung|01-05-2023| 980000|
|         3|     oneplus|01-05-2023|1175000|
|         1|      iphone|01-06-2023|1100000|
|         2|     samsung|01-06-2023|1100000|
|         3|     oneplus|01-06-2023|1200000|
+----------+------------+----------+-------+
"""

In [0]:
# 1. Find out the difference in sales of each product from their first month sales to latest sales.

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Sample rows: (product_id, product_name, sales_date as a dd-MM-yyyy string, sales).
# Rows are deliberately out of chronological order so the window ordering matters.
product_data = [
    (2, "samsung", "01-01-1995", 11000),
    (1, "iphone", "01-02-2023", 1300000),
    (2, "samsung", "01-02-2023", 1120000),
    (3, "oneplus", "01-02-2023", 1120000),
    (1, "iphone", "01-03-2023", 1600000),
    (2, "samsung", "01-03-2023", 1080000),
    (3, "oneplus", "01-03-2023", 1160000),
    (1, "iphone", "01-01-2006", 15000),
    (1, "iphone", "01-04-2023", 1700000),
    (2, "samsung", "01-04-2023", 1800000),
    (3, "oneplus", "01-04-2023", 1170000),
    (1, "iphone", "01-05-2023", 1200000),
    (2, "samsung", "01-05-2023", 980000),
    (3, "oneplus", "01-05-2023", 1175000),
    (1, "iphone", "01-06-2023", 1100000),
    (3, "oneplus", "01-01-2010", 23000),
    (2, "samsung", "01-06-2023", 1100000),
    (3, "oneplus", "01-06-2023", 1200000),
]

product_schema = ["product_id", "product_name", "sales_date", "sales"]

product_df = spark.createDataFrame(data=product_data, schema=product_schema)
product_df.show()

# sales_date is a dd-MM-yyyy string. Ordering by the raw string compares day,
# then month, then year, so e.g. "01-12-2022" would sort after "01-06-2023".
# Parse it to a date so the window is ordered chronologically.
sales_day = to_date(col("sales_date"), "dd-MM-yyyy")

# One frame spanning the whole partition serves both first() and last():
# with an orderBy the default frame stops at the current row, which would make
# last() return the current row's value instead of the latest month's.
whole_product_window = (
    Window.partitionBy("product_id")
    .orderBy(sales_day)
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

(
    product_df
    .withColumn("first_mnth_sales", first("sales").over(whole_product_window))
    .withColumn("last_mnth_sales", last("sales").over(whole_product_window))
    .withColumn("first_last_mnth_diff", col("last_mnth_sales") - col("first_mnth_sales"))
    # Every row of a product carries the same diff, so first() just collapses
    # the partition to one row per product.
    .groupBy("product_id", "product_name")
    .agg(first("first_last_mnth_diff").alias("sales_differ"))
    .show()
)

+----------+------------+----------+-------+
|product_id|product_name|sales_date|  sales|
+----------+------------+----------+-------+
|         2|     samsung|01-01-1995|  11000|
|         1|      iphone|01-02-2023|1300000|
|         2|     samsung|01-02-2023|1120000|
|         3|     oneplus|01-02-2023|1120000|
|         1|      iphone|01-03-2023|1600000|
|         2|     samsung|01-03-2023|1080000|
|         3|     oneplus|01-03-2023|1160000|
|         1|      iphone|01-01-2006|  15000|
|         1|      iphone|01-04-2023|1700000|
|         2|     samsung|01-04-2023|1800000|
|         3|     oneplus|01-04-2023|1170000|
|         1|      iphone|01-05-2023|1200000|
|         2|     samsung|01-05-2023| 980000|
|         3|     oneplus|01-05-2023|1175000|
|         1|      iphone|01-06-2023|1100000|
|         3|     oneplus|01-01-2010|  23000|
|         2|     samsung|01-06-2023|1100000|
|         3|     oneplus|01-06-2023|1200000|
+----------+------------+----------+-------+

+--------

In [0]:
# 2. Send a mail to every employee who did not complete 8 hours in office on a day they came to office.

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Punch records: (employee id, name, date as dd-MM-yyyy, time as HH:mm).
emp_schema = ["id", "name", "date", "time"]

emp_data = [
    (1, "manish", "11-07-2023", "10:20"),
    (1, "manish", "11-07-2023", "11:20"),
    (2, "rajesh", "11-07-2023", "11:20"),
    (1, "manish", "11-07-2023", "11:50"),
    (2, "rajesh", "11-07-2023", "13:20"),
    (1, "manish", "11-07-2023", "19:20"),
    (2, "rajesh", "11-07-2023", "17:20"),
    (1, "manish", "12-07-2023", "10:32"),
    (1, "manish", "12-07-2023", "12:20"),
    (3, "vikash", "12-07-2023", "09:12"),
    (1, "manish", "12-07-2023", "16:23"),
    (3, "vikash", "12-07-2023", "18:08"),
]

emp_df = spark.createDataFrame(data=emp_data, schema=emp_schema)
emp_df.show()

# Punches of one employee on one day, in time order.
punches_in_order = Window.partitionBy("id", "date").orderBy("ts")
# Same ordering, but the frame covers the whole day so last() sees the final punch.
punches_whole_day = punches_in_order.rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)

# unix_timestamp only needs the *input* pattern; from_unixtime then renders the
# value in its default "yyyy-MM-dd HH:mm:ss" layout.
punch_ts = from_unixtime(
    unix_timestamp(expr("CONCAT(date, ' ',time)"), "dd-MM-yyyy HH:mm")
)

# Seconds between first and last punch, expressed in hours.
hours_in_office = (col("logout").cast("long") - col("login").cast("long")) / 3600

(
    emp_df
    .withColumn("ts", punch_ts)
    .withColumn("login", first("ts").over(punches_in_order))
    .withColumn("logout", last("ts").over(punches_whole_day))
    .withColumn("login", to_timestamp("login", "yyyy-MM-dd HH:mm:ss"))
    .withColumn("logout", to_timestamp("logout", "yyyy-MM-dd HH:mm:ss"))
    .withColumn("less_than_8hr_ofc", hours_in_office)
    .filter(col("less_than_8hr_ofc") < 8.0)
    .show(truncate=False)
)

+---+------+----------+-----+
| id|  name|      date| time|
+---+------+----------+-----+
|  1|manish|11-07-2023|10:20|
|  1|manish|11-07-2023|11:20|
|  2|rajesh|11-07-2023|11:20|
|  1|manish|11-07-2023|11:50|
|  2|rajesh|11-07-2023|13:20|
|  1|manish|11-07-2023|19:20|
|  2|rajesh|11-07-2023|17:20|
|  1|manish|12-07-2023|10:32|
|  1|manish|12-07-2023|12:20|
|  3|vikash|12-07-2023|09:12|
|  1|manish|12-07-2023|16:23|
|  3|vikash|12-07-2023|18:08|
+---+------+----------+-----+

+---+------+----------+-----+-------------------+-------------------+-------------------+-----------------+
|id |name  |date      |time |ts                 |login              |logout             |less_than_8hr_ofc|
+---+------+----------+-----+-------------------+-------------------+-------------------+-----------------+
|1  |manish|12-07-2023|10:32|2023-07-12 10:32:00|2023-07-12 10:32:00|2023-07-12 16:23:00|5.85             |
|1  |manish|12-07-2023|12:20|2023-07-12 12:20:00|2023-07-12 10:32:00|2023-07-12 16:23:0

In [0]:
# 3. Find out the performance of the sales based on last 3 months average.

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Sample rows: (product_id, product_name, sales_date as a dd-MM-yyyy string, sales).
product_data = [
    (1, "iphone", "01-01-2023", 1500000),
    (2, "samsung", "01-01-2023", 1100000),
    (3, "oneplus", "01-01-2023", 1100000),
    (1, "iphone", "01-02-2023", 1300000),
    (2, "samsung", "01-02-2023", 1120000),
    (3, "oneplus", "01-02-2023", 1120000),
    (1, "iphone", "01-03-2023", 1600000),
    (2, "samsung", "01-03-2023", 1080000),
    (3, "oneplus", "01-03-2023", 1160000),
    (1, "iphone", "01-04-2023", 1700000),
    (2, "samsung", "01-04-2023", 1800000),
    (3, "oneplus", "01-04-2023", 1170000),
    (1, "iphone", "01-05-2023", 1200000),
    (2, "samsung", "01-05-2023", 980000),
    (3, "oneplus", "01-05-2023", 1175000),
    (1, "iphone", "01-06-2023", 1100000),
    (2, "samsung", "01-06-2023", 1100000),
    (3, "oneplus", "01-06-2023", 1200000),
]

product_schema = ["product_id", "product_name", "sales_date", "sales"]

product_df = spark.createDataFrame(data=product_data, schema=product_schema)
product_df.show()

# sales_date is a dd-MM-yyyy string; parse it so rows are ordered by real date
# rather than lexicographically (day, then month, then year).
sales_day = to_date(col("sales_date"), "dd-MM-yyyy")

monthly_order = Window.partitionBy("product_id").orderBy(sales_day)
# Current row plus the two preceding rows = the last three months.
last_3_months = monthly_order.rowsBetween(-2, 0)

# NOTE: sum and round below are the pyspark.sql.functions versions brought in by
# the wildcard import, not the Python builtins.
(
    product_df
    .withColumn("running_sum", sum("sales").over(last_3_months))
    .withColumn("rn", row_number().over(monthly_order))
    # The first two rows per product have fewer than three months in the frame.
    .filter(col("rn") > 2)
    # Average over the 3-month rolling sum (previously divided the current
    # month's sales by 3, which is not an average of anything).
    .withColumn("avg_sales", round(col("running_sum") / 3.0, 2))
    .drop("rn")
    .show()
)

+----------+------------+----------+-------+
|product_id|product_name|sales_date|  sales|
+----------+------------+----------+-------+
|         1|      iphone|01-01-2023|1500000|
|         2|     samsung|01-01-2023|1100000|
|         3|     oneplus|01-01-2023|1100000|
|         1|      iphone|01-02-2023|1300000|
|         2|     samsung|01-02-2023|1120000|
|         3|     oneplus|01-02-2023|1120000|
|         1|      iphone|01-03-2023|1600000|
|         2|     samsung|01-03-2023|1080000|
|         3|     oneplus|01-03-2023|1160000|
|         1|      iphone|01-04-2023|1700000|
|         2|     samsung|01-04-2023|1800000|
|         3|     oneplus|01-04-2023|1170000|
|         1|      iphone|01-05-2023|1200000|
|         2|     samsung|01-05-2023| 980000|
|         3|     oneplus|01-05-2023|1175000|
|         1|      iphone|01-06-2023|1100000|
|         2|     samsung|01-06-2023|1100000|
|         3|     oneplus|01-06-2023|1200000|
+----------+------------+----------+-------+

+--------