In [5]:
// 📊 Question:
// Table: (Column_Name, Type)
// Failed -- (fail_date, date)
// Succeeded -- (success_date, date)

// Primary keys are fail_date and success_date.
// Failed table contains the days of failed tasks.
// Succeeded table contains the days of succeeded tasks.

// A system is running one task every day. Every task is independent of the previous tasks. The tasks can fail or succeed.

// 👉 Write an SQL query to generate a report of period_state for each continuous interval of days.

// period_state is 'failed' if tasks in this interval failed or 'succeeded' if tasks in this interval succeeded.
// Interval of days are retrieved as start_date and end_date.

// Order result by start_date.

// The query result format is in the following example:
// Failed table:
// +-------------------+
// | fail_date     |
// +-------------------+
// | 2018-12-28    |
// | 2018-12-29    |
// | 2019-01-04    |
// | 2019-01-05    |
// +-------------------+

// Succeeded table:
// +-------------------+
// | success_date   |
// +-------------------+
// | 2018-12-30    |
// | 2018-12-31    |
// | 2019-01-01    |
// | 2019-01-02    |
// | 2019-01-03    |
// | 2019-01-06    |
// +-------------------+

// Result table:
// +--------------+--------------+--------------+
// | period_state | start_date  | end_date   |
// +--------------+--------------+--------------+
// | failed    | 2018-12-28  | 2018-12-29  |
// | succeeded  | 2019-12-30  | 2019-01-03  |
// | failed    | 2019-01-04  | 2019-01-05  |
// | succeeded  | 2019-01-06  | 2019-01-06  |
// +--------------+--------------+--------------+



val fail_date = Seq(
    ("2018-12-28"),
    ("2018-12-29"),
    ("2019-01-04"),
    ("2019-01-05"),
).toDF("date").withColumn("status", lit("fail"))

val success_date = Seq(
    ("2018-12-30"),
    ("2018-12-31"),
    ("2019-01-01"),
    ("2019-01-02"),
    ("2019-01-03"),
    ("2019-01-06"),
).toDF("date").withColumn("status", lit("success"))


fail_date.show(false)
success_date.show(false)


+----------+------+
|date      |status|
+----------+------+
|2018-12-28|fail  |
|2018-12-29|fail  |
|2019-01-04|fail  |
|2019-01-05|fail  |
+----------+------+

+----------+-------+
|date      |status |
+----------+-------+
|2018-12-30|success|
|2018-12-31|success|
|2019-01-01|success|
|2019-01-02|success|
|2019-01-03|success|
|2019-01-06|success|
+----------+-------+



fail_date: org.apache.spark.sql.DataFrame = [date: string, status: string]
success_date: org.apache.spark.sql.DataFrame = [date: string, status: string]


In [15]:
import org.apache.spark.sql.expressions._

val merged=fail_date.union(success_date)

// merged.sortWithinPartitions("date").withColumn("partition", spark_partition_id())show(false)

val continuous_dates=merged.withColumn(
"rn", to_date($"date","yyyy-MM-dd") - row_number().over(Window.partitionBy($"status").orderBy($"date")))

continuous_dates.show(false)

+----------+-------+----------+
|date      |status |rn        |
+----------+-------+----------+
|2018-12-28|fail   |2018-12-27|
|2018-12-29|fail   |2018-12-27|
|2019-01-04|fail   |2019-01-01|
|2019-01-05|fail   |2019-01-01|
|2018-12-30|success|2018-12-29|
|2018-12-31|success|2018-12-29|
|2019-01-01|success|2018-12-29|
|2019-01-02|success|2018-12-29|
|2019-01-03|success|2018-12-29|
|2019-01-06|success|2018-12-31|
+----------+-------+----------+



import org.apache.spark.sql.expressions._
merged: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [date: string, status: string]
continuous_dates: org.apache.spark.sql.DataFrame = [date: string, status: string ... 1 more field]


In [17]:
continuous_dates.groupBy($"status", $"rn").agg(min($"date").as("start_date"), max($"date").as("max_date")
                                              ).orderBy($"start_date").drop("rn").show(false)

+-------+----------+----------+
|status |start_date|max_date  |
+-------+----------+----------+
|fail   |2018-12-28|2018-12-29|
|success|2018-12-30|2019-01-03|
|fail   |2019-01-04|2019-01-05|
|success|2019-01-06|2019-01-06|
+-------+----------+----------+

