In [24]:
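/*
    Identify the date ranges in which a user logged in on 5 or more consecutive days,
    with the result sorted by user_id.

    Use DENSE_RANK() rather than ROW_NUMBER(): repeated logins on the same day then share
    one rank, so duplicates do not split a streak into separate groups.
    In the last cell, add 1 to datediff(end_date, start_date): datediff() returns the number
    of days between the two dates, which is one less than the number of days in the streak.
*/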



import org.apache.spark.sql._
import org.apache.spark.sql.expressions._


// Sample login events as (user_id, login_date) rows, with dates kept as "yyyy-MM-dd" strings.
// The data is written per user and flattened in the same order, so row order is unchanged.
val df = {
    val loginsByUser: Seq[(Int, Seq[String])] = Seq(
        // User 1: a 4-day run, two isolated days, a 5-day run and a 9-day run.
        1 -> Seq(
            "2024-03-01", "2024-03-02", "2024-03-03", "2024-03-04",
            "2024-03-06",
            "2024-03-10", "2024-03-11", "2024-03-12", "2024-03-13", "2024-03-14",
            "2024-03-20",
            "2024-03-25", "2024-03-26", "2024-03-27", "2024-03-28", "2024-03-29",
            "2024-03-30", "2024-03-31", "2024-04-01", "2024-04-02"
        ),
        // User 2: a 4-day run only.
        2 -> Seq("2024-03-01", "2024-03-02", "2024-03-03", "2024-03-04"),
        // User 3: five distinct days, with 2024-03-04 logged three times.
        3 -> Seq(
            "2024-03-01", "2024-03-02", "2024-03-03",
            "2024-03-04", "2024-03-04", "2024-03-04",
            "2024-03-05"
        ),
        // User 4: four distinct days, with 2024-03-04 logged twice.
        4 -> Seq("2024-03-01", "2024-03-02", "2024-03-03", "2024-03-04", "2024-03-04")
    )

    val loginRows: Seq[(Int, String)] = loginsByUser.flatMap {
        case (userId, loginDates) => loginDates.map(loginDate => (userId, loginDate))
    }

    loginRows.toDF("user_id", "login_date")
}


df.show()

+-------+----------+
|user_id|login_date|
+-------+----------+
|      1|2024-03-01|
|      1|2024-03-02|
|      1|2024-03-03|
|      1|2024-03-04|
|      1|2024-03-06|
|      1|2024-03-10|
|      1|2024-03-11|
|      1|2024-03-12|
|      1|2024-03-13|
|      1|2024-03-14|
|      1|2024-03-20|
|      1|2024-03-25|
|      1|2024-03-26|
|      1|2024-03-27|
|      1|2024-03-28|
|      1|2024-03-29|
|      1|2024-03-30|
|      1|2024-03-31|
|      1|2024-04-01|
|      1|2024-04-02|
+-------+----------+
only showing top 20 rows



import org.apache.spark.sql._
import org.apache.spark.sql.expressions._
df: org.apache.spark.sql.DataFrame = [user_id: int, login_date: string]


In [25]:
// Replace the string login_date column with a DateType column parsed using the
// "yyyy-MM-dd" pattern, so that later cells can do date arithmetic on it.
val formatted_date = {
    val parsedLoginDate = to_date($"login_date", "yyyy-MM-dd")
    df.withColumn("login_date", parsedLoginDate)
}
formatted_date.show(truncate = false)

+-------+----------+
|user_id|login_date|
+-------+----------+
|1      |2024-03-01|
|1      |2024-03-02|
|1      |2024-03-03|
|1      |2024-03-04|
|1      |2024-03-06|
|1      |2024-03-10|
|1      |2024-03-11|
|1      |2024-03-12|
|1      |2024-03-13|
|1      |2024-03-14|
|1      |2024-03-20|
|1      |2024-03-25|
|1      |2024-03-26|
|1      |2024-03-27|
|1      |2024-03-28|
|1      |2024-03-29|
|1      |2024-03-30|
|1      |2024-03-31|
|1      |2024-04-01|
|1      |2024-04-02|
+-------+----------+
only showing top 20 rows



formatted_date: org.apache.spark.sql.DataFrame = [user_id: int, login_date: date]


In [26]:
// Number each user's logins in date order, two ways:
//   rn - row_number(): every row gets its own number, even repeated dates.
//   dr - dense_rank(): repeated dates share one rank and the next date gets the next rank.
// Both columns use the same per-user, date-ordered window.
val dr_df = {
    val perUserByDate = Window.partitionBy($"user_id").orderBy($"login_date")

    formatted_date
        .withColumn("rn", row_number().over(perUserByDate))
        .withColumn("dr", dense_rank().over(perUserByDate))
}

dr_df.show(numRows = 50, truncate = false)

+-------+----------+---+---+
|user_id|login_date|rn |dr |
+-------+----------+---+---+
|1      |2024-03-01|1  |1  |
|1      |2024-03-02|2  |2  |
|1      |2024-03-03|3  |3  |
|1      |2024-03-04|4  |4  |
|1      |2024-03-06|5  |5  |
|1      |2024-03-10|6  |6  |
|1      |2024-03-11|7  |7  |
|1      |2024-03-12|8  |8  |
|1      |2024-03-13|9  |9  |
|1      |2024-03-14|10 |10 |
|1      |2024-03-20|11 |11 |
|1      |2024-03-25|12 |12 |
|1      |2024-03-26|13 |13 |
|1      |2024-03-27|14 |14 |
|1      |2024-03-28|15 |15 |
|1      |2024-03-29|16 |16 |
|1      |2024-03-30|17 |17 |
|1      |2024-03-31|18 |18 |
|1      |2024-04-01|19 |19 |
|1      |2024-04-02|20 |20 |
|2      |2024-03-01|1  |1  |
|2      |2024-03-02|2  |2  |
|2      |2024-03-03|3  |3  |
|2      |2024-03-04|4  |4  |
|3      |2024-03-01|1  |1  |
|3      |2024-03-02|2  |2  |
|3      |2024-03-03|3  |3  |
|3      |2024-03-04|4  |4  |
|3      |2024-03-04|5  |4  |
|3      |2024-03-04|6  |4  |
|3      |2024-03-05|7  |5  |
|4      |2024-

dr_df: org.apache.spark.sql.DataFrame = [user_id: int, login_date: date ... 2 more fields]


In [27]:
// Build the streak key: login_date minus its dense rank (in days). Within a run of
// consecutive days the date and the rank both advance by one, so the difference stays
// constant; a gap in the dates shifts it. Rows sharing (user_id, group_) form one streak.
// The helper column rn is not needed past this point and is dropped.
val differ_dt = dr_df
    .drop("rn")
    .withColumn("group_", $"login_date".minus($"dr"))

differ_dt.show(numRows = 50, truncate = false)

+-------+----------+---+----------+
|user_id|login_date|dr |group_    |
+-------+----------+---+----------+
|1      |2024-03-01|1  |2024-02-29|
|1      |2024-03-02|2  |2024-02-29|
|1      |2024-03-03|3  |2024-02-29|
|1      |2024-03-04|4  |2024-02-29|
|1      |2024-03-06|5  |2024-03-01|
|1      |2024-03-10|6  |2024-03-04|
|1      |2024-03-11|7  |2024-03-04|
|1      |2024-03-12|8  |2024-03-04|
|1      |2024-03-13|9  |2024-03-04|
|1      |2024-03-14|10 |2024-03-04|
|1      |2024-03-20|11 |2024-03-09|
|1      |2024-03-25|12 |2024-03-13|
|1      |2024-03-26|13 |2024-03-13|
|1      |2024-03-27|14 |2024-03-13|
|1      |2024-03-28|15 |2024-03-13|
|1      |2024-03-29|16 |2024-03-13|
|1      |2024-03-30|17 |2024-03-13|
|1      |2024-03-31|18 |2024-03-13|
|1      |2024-04-01|19 |2024-03-13|
|1      |2024-04-02|20 |2024-03-13|
|2      |2024-03-01|1  |2024-02-29|
|2      |2024-03-02|2  |2024-02-29|
|2      |2024-03-03|3  |2024-02-29|
|2      |2024-03-04|4  |2024-02-29|
|3      |2024-03-01|1  |2024

differ_dt: org.apache.spark.sql.DataFrame = [user_id: int, login_date: date ... 2 more fields]


In [29]:
// Collapse every (user_id, group_) streak into one row holding its first and last login date.
val aggr_dates = {
    val streakStart = min($"login_date").as("start_date")
    val streakEnd = max($"login_date").as("end_date")

    differ_dt
        .groupBy($"user_id", $"group_")
        .agg(streakStart, streakEnd)
}

aggr_dates.show(truncate = false)

+-------+----------+----------+----------+
|user_id|group_    |start_date|end_date  |
+-------+----------+----------+----------+
|1      |2024-02-29|2024-03-01|2024-03-04|
|1      |2024-03-01|2024-03-06|2024-03-06|
|1      |2024-03-04|2024-03-10|2024-03-14|
|1      |2024-03-09|2024-03-20|2024-03-20|
|1      |2024-03-13|2024-03-25|2024-04-02|
|2      |2024-02-29|2024-03-01|2024-03-04|
|3      |2024-02-29|2024-03-01|2024-03-05|
|4      |2024-02-29|2024-03-01|2024-03-04|
+-------+----------+----------+----------+



aggr_dates: org.apache.spark.sql.DataFrame = [user_id: int, group_: date ... 2 more fields]


In [30]:
// Keep only streaks of at least 5 consecutive days and show them sorted by user.
//   - consecutive_date_cn = datediff(end_date, start_date) + 1, because datediff() gives the
//     number of days between the two dates, one less than the number of days in the streak.
//   - The explicit orderBy is required: rows coming out of the earlier groupBy have no
//     guaranteed order, and the task asks for the result sorted by user_id. start_date is
//     used as a secondary key so a user's streaks appear chronologically.
aggr_dates
    .withColumn("consecutive_date_cn", datediff($"end_date", $"start_date") + 1)
    .filter($"consecutive_date_cn" >= 5)
    .orderBy($"user_id", $"start_date")
    .show(false)

+-------+----------+----------+----------+-------------------+
|user_id|group_    |start_date|end_date  |consecutive_date_cn|
+-------+----------+----------+----------+-------------------+
|1      |2024-03-04|2024-03-10|2024-03-14|5                  |
|1      |2024-03-13|2024-03-25|2024-04-02|9                  |
|3      |2024-02-29|2024-03-01|2024-03-05|5                  |
+-------+----------+----------+----------+-------------------+

