In [0]:
"""
There are multiple antivirus software which are running on the system and you have the data of how many malware they have detected in each run.  

You need to find out how many malwares each software has detected in their latest run and what is the difference between the number of malwares detected in latest run and the second last run for each software. 

Please note that list only the software which have run for at least 2 times and have detected at least 10 malware in the latest run.

Table: malware
+------------------+-----------+
| COLUMN_NAME      | DATA_TYPE |
+------------------+-----------+
| software_id      | int       |
| run_date         | datetime  |
| malware_detected | int       |
+------------------+-----------+

Input
+-------------+---------------------+------------------+
| software_id | run_date            | malware_detected |
+-------------+---------------------+------------------+
|         100 | 2024-01-01 02:00:01 |               12 |
|         100 | 2024-01-01 03:12:01 |               15 |
|         100 | 2024-01-01 16:00:01 |                9 |
|         101 | 2024-01-01 12:00:00 |                9 |
|         101 | 2024-01-01 16:00:00 |               10 |
|         101 | 2024-01-01 18:00:00 |               12 |
|         102 | 2024-01-01 12:00:00 |               14 |
|         102 | 2024-01-01 14:00:00 |               13 |
|         103 | 2024-01-01 15:00:00 |               11 |
|         103 | 2024-01-01 17:00:00 |               16 |
|         104 | 2024-01-01 18:30:00 |                8 |
|         104 | 2024-01-01 19:45:00 |                7 |
|         105 | 2024-01-01 10:45:00 |               15 |
|         106 | 2024-01-01 08:00:00 |                9 |
|         106 | 2024-01-01 09:30:00 |               14 |
|         107 | 2024-01-01 07:00:00 |                5 |
|         108 | 2024-01-01 06:00:00 |                6 |
|         109 | 2024-01-01 05:00:00 |                4 |
|         110 | 2024-01-01 04:00:00 |                3 |
|         111 | 2024-01-01 03:00:00 |                2 |
|         112 | 2024-01-01 02:00:00 |                1 |
+-------------+---------------------+------------------+


Output
+-------------+------------------+------------------------+
| software_id | latest_run_count | difference_to_previous |
+-------------+------------------+------------------------+
|         101 |               12 |                      2 |
|         102 |               13 |                     -1 |
|         103 |               16 |                      5 |
|         106 |               14 |                      5 |
+-------------+------------------+------------------------+
"""

malware_df = spark.createDataFrame([
    (100, '2024-01-01 02:00:01',2),
    (100, '2024-01-01 03:12:01',5),
    (100, '2024-01-01 16:00:01',9),
    (101, '2024-01-01 12:00:00',9), 
    (101, '2024-01-01 16:00:00',10),
    (101, '2024-01-01 18:00:00',12),
    (102, '2024-01-01 12:00:00',14),
    (102, '2024-01-01 14:00:00',13),
    (103, '2024-01-01 15:00:00',11),
    (103, '2024-01-01 17:00:00',16),
    (104, '2024-01-01 18:30:00',8),
    (104, '2024-01-01 19:45:00',7), 
    (105, '2024-01-01 10:45:00',15),
    (106, '2024-01-01 08:00:00',9), 
    (106, '2024-01-01 09:30:00',14),
    (107, '2024-01-01 07:00:00',5), 
    (108, '2024-01-01 06:00:00',6), 
    (109, '2024-01-01 05:00:00',4),
    (110, '2024-01-01 04:00:00',3),
    (111, '2024-01-01 03:00:00',2), 
    (112, '2024-01-01 02:00:00',1)
], ["software_id", "run_date", "malware_detected"])

from pyspark.sql import functions as F
from pyspark.sql.window import *

malware_df = malware_df.withColumn("run_date", F.col("run_date").cast("timestamp"))

malware_df.show()
malware_df.printSchema()

+-----------+-------------------+----------------+
|software_id|           run_date|malware_detected|
+-----------+-------------------+----------------+
|        100|2024-01-01 02:00:01|               2|
|        100|2024-01-01 03:12:01|               5|
|        100|2024-01-01 16:00:01|               9|
|        101|2024-01-01 12:00:00|               9|
|        101|2024-01-01 16:00:00|              10|
|        101|2024-01-01 18:00:00|              12|
|        102|2024-01-01 12:00:00|              14|
|        102|2024-01-01 14:00:00|              13|
|        103|2024-01-01 15:00:00|              11|
|        103|2024-01-01 17:00:00|              16|
|        104|2024-01-01 18:30:00|               8|
|        104|2024-01-01 19:45:00|               7|
|        105|2024-01-01 10:45:00|              15|
|        106|2024-01-01 08:00:00|               9|
|        106|2024-01-01 09:30:00|              14|
|        107|2024-01-01 07:00:00|               5|
|        108|2024-01-01 06:00:0

In [0]:

malware_df \
    .withColumn("rn", F.row_number().over(Window.partitionBy(F.col("software_id")).orderBy(F.desc(F.col("run_date"))))) \
    .filter((F.col("rn")==F.lit(1)) | (F.col("rn")==F.lit(2))) \
    .withColumn("last_run", F.when(F.col("rn")==F.lit(1), F.col("malware_detected"))) \
    .withColumn("second_last_run", F.when(F.col("rn")==F.lit(2), F.col("malware_detected"))) \
    .groupBy(F.col("software_id")).agg(
        F.max(F.col("last_run")).alias("last_run") ,
        F.max(F.col("second_last_run")).alias("second_last_run") ,
        F.count("*").alias("total_rows_per_software") 
    ) \
    .filter( (F.col("total_rows_per_software")==F.lit(2)) & (F.col("last_run") >= F.lit(10)) ) \
    .drop(F.col("total_rows_per_software")) \
    .withColumn("difference_to_previous", F.col("last_run")-F.col("second_last_run")) \
    .select("software_id", "last_run", "difference_to_previous") \
    .show()

+-----------+--------+----------------------+
|software_id|last_run|difference_to_previous|
+-----------+--------+----------------------+
|        101|      12|                     2|
|        102|      13|                    -1|
|        103|      16|                     5|
|        106|      14|                     5|
+-----------+--------+----------------------+



In [0]:
malware_df.createOrReplaceTempView("malware")

spark.sql("""
          with cte as (
          select
            *,
            row_number() over(partition by software_id order by run_date desc) rn
          from malware
          ), cte2 as (
          select 
            software_id,
            count(*) as total_rows_per_software,
            max(case when rn=1 then malware_detected end) as last_run,
            max(case when rn=2 then malware_detected end) as second_last_run
          from cte where rn in (1,2)
          group by software_id
          )
          select
            software_id, last_run, last_run-second_last_run as difference_to_previous
          from cte2 
          where last_run >= 10 and total_rows_per_software = 2
          """).show()

+-----------+--------+----------------------+
|software_id|last_run|difference_to_previous|
+-----------+--------+----------------------+
|        101|      12|                     2|
|        102|      13|                    -1|
|        103|      16|                     5|
|        106|      14|                     5|
+-----------+--------+----------------------+

