In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
# Reuse the already-running Spark session if there is one; otherwise start a
# new session whose application name is "PySparkPrac".
spark = (
    SparkSession.builder
    .appName("PySparkPrac")
    .getOrCreate()
)

In [2]:
# Sample daily sales as (date string, amount) pairs. 2025-01-03 and 2025-01-05
# are absent, which is what the missing-date search below should report.
data = [
    ("2025-01-01", 100),
    ("2025-01-02", 150),
    ("2025-01-04", 120),
    ("2025-01-06", 200),
]
sales_columns = ["date", "sales"]
sales = spark.createDataFrame(data, schema=sales_columns)

In [3]:
# Replace the string column with a parsed date so the min/max aggregation and
# the join against the generated calendar operate on date values.
sales = sales.withColumn("date", to_date(col("date")))

In [4]:
# `min` / `max` here are the Spark aggregate functions brought in by the
# `from pyspark.sql.functions import *` line, not the Python builtins.
# The aggregation produces a single Row holding both bounds of the date range.
min_max = sales.agg(
    min("date").alias("min_date"),
    max("date").alias("max_date"),
).first()
min_date = min_max["min_date"]
max_date = min_max["max_date"]

In [5]:
# Build a one-row DataFrame whose `dates` column is an array containing every
# calendar day from min_date to max_date inclusive (step: one day).
date_range_query = f"SELECT sequence(to_date('{min_date}'), to_date('{max_date}'), interval 1 day) as dates"
date_df = spark.sql(date_range_query)
# truncate=False so the whole array is printed instead of being cut off.
date_df.show(truncate=False)

+------------------------------------------------------------------------+
|dates                                                                   |
+------------------------------------------------------------------------+
|[2025-01-01, 2025-01-02, 2025-01-03, 2025-01-04, 2025-01-05, 2025-01-06]|
+------------------------------------------------------------------------+



In [6]:
# Flatten the single array-valued row into one row per calendar day.
# NOTE: this rebinds `date_df`, so re-running only this cell fails because the
# "dates" column no longer exists; re-run the cell that builds the sequence first.
date_df = date_df.select(explode("dates").alias("date"))

In [7]:
# Preview the generated calendar (show()'s defaults spelled out: 20 rows, truncated cells).
date_df.show(n=20, truncate=True)

+----------+
|      date|
+----------+
|2025-01-01|
|2025-01-02|
|2025-01-03|
|2025-01-04|
|2025-01-05|
|2025-01-06|
+----------+



In [8]:
# An anti join keeps only the calendar days that have no matching row in
# `sales`, i.e. the dates missing from the sales data.
recorded_dates = sales.select("date")
missing_dates = date_df.join(recorded_dates, on="date", how="left_anti")
missing_dates.show()

+----------+
|      date|
+----------+
|2025-01-05|
|2025-01-03|
+----------+



In [26]:
# Sample login events as (user_id, login date string) pairs.
# User 1 logs in on Jan 1-2 and Jan 4; user 2 on Jan 1-3 and Jan 5.
data = [
    (1, "2025-01-01"),
    (1, "2025-01-02"),
    (1, "2025-01-04"),
    (2, "2025-01-01"),
    (2, "2025-01-02"),
    (2, "2025-01-03"),
    (2, "2025-01-05"),
]
login_columns = ["user_id", "login_date"]
df = spark.createDataFrame(data, schema=login_columns)

In [27]:
# Streak detection relies on each (user_id, login_date) appearing exactly once:
# a repeated day would get its own row_number(), which splits one streak into
# two and inflates the later per-streak row count. Parsing login_date into a
# real date (as is done for sales.date) also makes the window ordering
# chronological instead of lexicographic on strings.
df = (
    df.withColumn("login_date", to_date(col("login_date")))
    .dropDuplicates(["user_id", "login_date"])
)

# Number each user's login days in chronological order.
w = Window.partitionBy("user_id").orderBy("login_date")
df = df.withColumn("rn", row_number().over(w))

In [28]:
# Subtract the row number (in days) from the login date. Consecutive days end
# up with the same resulting date, so `diff` acts as a label for each streak.
streak_anchor = date_sub(col("login_date"), col("rn"))
df = df.withColumn("diff", streak_anchor)

In [29]:
# Inspect the row numbers and streak labels (show()'s defaults spelled out).
df.show(n=20, truncate=True)

+-------+----------+---+----------+
|user_id|login_date| rn|      diff|
+-------+----------+---+----------+
|      1|2025-01-01|  1|2024-12-31|
|      1|2025-01-02|  2|2024-12-31|
|      1|2025-01-04|  3|2025-01-01|
|      2|2025-01-01|  1|2024-12-31|
|      2|2025-01-02|  2|2024-12-31|
|      2|2025-01-03|  3|2024-12-31|
|      2|2025-01-05|  4|2025-01-01|
+-------+----------+---+----------+



In [31]:
# One output row per (user, streak label); `streak` is the number of login
# rows that share that label.
by_user_and_streak = df.groupBy("user_id", "diff")
counts = by_user_and_streak.agg(count("*").alias("streak"))

In [33]:
# Inspect the per-streak row counts (show()'s defaults spelled out).
counts.show(n=20, truncate=True)

+-------+----------+------+
|user_id|      diff|streak|
+-------+----------+------+
|      1|2024-12-31|     2|
|      1|2025-01-01|     1|
|      2|2024-12-31|     3|
|      2|2025-01-01|     1|
+-------+----------+------+



In [32]:
# Longest streak per user: the largest `streak` value among that user's rows.
# `max` is the Spark aggregate from the star import, not the Python builtin.
by_user = counts.groupBy("user_id")
result = by_user.agg(max("streak").alias("longest_streak"))
result.show()

+-------+--------------+
|user_id|longest_streak|
+-------+--------------+
|      1|             2|
|      2|             3|
+-------+--------------+

