# Assumptions
1. Data is clearned and not errorneous
2. Timezone consideration is not required

# Setup

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

## Parition control based on core availability

In [2]:
val NUM_CORES = 4
val NUM_PARTITIONS = 3

lazy val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("flight")
    .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)

import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

In [3]:
val FLIGHTDATA_CSV_PATH = "../resources/flightData.csv"
val PASSENGER_CSV_PATH = "../resources/passengers.csv"

FLIGHTDATA_CSV_PATH = ../resources/flightData.csv
PASSENGER_CSV_PATH = ../resources/passengers.csv


../resources/passengers.csv

## Utilities

### Elapsed time profiler

In [4]:
val timing = new StringBuffer
def timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    result
}

timing = 


timed: [T](label: String, code: => T)T


In [5]:
// To flush out error: missing argument list for method timed
println("")

<console>:78: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


### UDF

In [6]:
//val BASE_TIMESTAMP = java.sql.Timestamp.valueOf("2017-01-01 00:00:00.0")
val BASE_LOCALDATE = LocalDate.parse("2017-01-01").withDayOfMonth(1)

def get_months_between(to: Timestamp): Short = {
    val monthsBetween = ChronoUnit.MONTHS.between(
        BASE_LOCALDATE,
        to.toLocalDateTime().toLocalDate().withDayOfMonth(1)
    )
    monthsBetween.toShort
}
val udf_months_between = udf((t:Timestamp) => get_months_between(t))

BASE_LOCALDATE = 2017-01-01
udf_months_between = UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))


get_months_between: (to: java.sql.Timestamp)Short


UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))

In [23]:
// Transformations, no action yet
val flightData = spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("dateFormat", "yyyy-MM-dd")
    .option("inferSchema", "true")
    .load("../resources/flightData.csv")
    .withColumn(
        "direction", 
        when(lower(col("from")) === "uk", 1)
        .when(lower(col("to"))   === "uk", -1)
        .otherwise(0)
    )
    .withColumn(
        "count", lit(1)
    )
    .orderBy(asc("passengerId"), asc("date"))

flightData.printSchema()

root
 |-- passengerId: integer (nullable = true)
 |-- flightId: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- to: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- direction: integer (nullable = false)
 |-- count: integer (nullable = false)



flightData = [passengerId: int, flightId: int ... 5 more fields]


[passengerId: int, flightId: int ... 5 more fields]

In [24]:
flightData.show()

+-----------+--------+----+---+-------------------+---------+-----+
|passengerId|flightId|from| to|               date|direction|count|
+-----------+--------+----+---+-------------------+---------+-----+
|          1|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|
|          1|     901|  ir| at|2017-11-29 00:00:00|        0|    1|
|          1|     940|  at| cn|2017-12-12 00:00:00|        0|    1|
|          1|     972|  cn| ch|2017-12-22 00:00:00|        0|    1|
|          1|     993|  ch| pk|2017-12-29 00:00:00|        0|    1|
|          2|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|
|          3|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|
|          3|      32|  ir| sg|2017-01-10 00:00:00|        0|    1|
|          3|     108|  sg| be|2017-02-06 00:00:00|        0|    1|
|          3|     176|  be| ir|2017-03-05 00:00:00|        0|    1|
|          4|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|
|          4|     200|  ir| no|2017-03-14 00:00:

In [28]:
flightData.createOrReplaceTempView("flightData")

lastException: Throwable = null


# Window

## Sum over partition & range
Number of flights per passenger

In [39]:
var query = """
SELECT 
    f.*,
    sum(count) OVER (
        PARTITION BY passengerId 
        ORDER BY 
            passengerId ASC, 
            date DESC
        ROWS BETWEEN 
            UNBOUNDED PRECEDING
            AND 
            1 PRECEDING
    ) as num_flights
FROM
    flightData f
ORDER BY 
    passengerId, date
"""

val countSum = spark.sql(query)
countSum.show()

+-----------+--------+----+---+-------------------+---------+-----+-----------+
|passengerId|flightId|from| to|               date|direction|count|num_flights|
+-----------+--------+----+---+-------------------+---------+-----+-----------+
|          1|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|          4|
|          1|     901|  ir| at|2017-11-29 00:00:00|        0|    1|          3|
|          1|     940|  at| cn|2017-12-12 00:00:00|        0|    1|          2|
|          1|     972|  cn| ch|2017-12-22 00:00:00|        0|    1|          1|
|          1|     993|  ch| pk|2017-12-29 00:00:00|        0|    1|       null|
|          2|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|       null|
|          3|       0|  cg| ir|2017-01-01 00:00:00|        0|    1|          3|
|          3|      32|  ir| sg|2017-01-10 00:00:00|        0|    1|          2|
|          3|     108|  sg| be|2017-02-06 00:00:00|        0|    1|          1|
|          3|     176|  be| ir|2017-03-0

query = 
countSum = [passengerId: int, flightId: int ... 6 more fields]


lastException: Throwable = null
"
SELECT
    f.*,
    sum(count) OVER (
        PARTITION BY passengerId
        ORDER BY
            passengerId ASC,
            date DESC
        ROWS BETWEEN
            UNBOUNDED PRECEDING
            AND
            1 PRECEDING
    ) as num_flights
FROM
    flightData f
ORDER BY
    passengerId, date
"


[passengerId: int, flightId: int ... 6 more fields]

## Lag / Lead

In [61]:
var query = """
SELECT
  lag(v) OVER (ORDER BY v) as LAG_FROM_PREVIOUS_ROW,
  v as ROW_VALUE,
  lead(v) OVER (ORDER BY v) as LEAD_FROM_NEXT_ROW
FROM (
  VALUES (1), (2), (3), (4)
) t(v)
"""
var result = spark.sql(query)
result.show()

+---------------------+---------+------------------+
|LAG_FROM_PREVIOUS_ROW|ROW_VALUE|LEAD_FROM_NEXT_ROW|
+---------------------+---------+------------------+
|                 null|        1|                 2|
|                    1|        2|                 3|
|                    2|        3|                 4|
|                    3|        4|              null|
+---------------------+---------+------------------+



query = 
result = [LAG_FROM_PREVIOUS_ROW: int, ROW_VALUE: int ... 1 more field]


"
SELECT
  lag(v) OVER (ORDER BY v) as LAG_FROM_PREVIOUS_ROW,
  v as ROW_VALUE,
  lead(v) OVER (ORDER BY v) as LEAD_FROM_NEXT_ROW
FROM (
  VALUES (1), (2), (3), (4)
) t(v)
"


[LAG_FROM_PREVIOUS_ROW: int, ROW_VALUE: int ... 1 more field]

## Row Number in Window Partition

In [2]:
var query = """
SELECT 
    f.*,
    ROW_NUMBER() OVER (PARTITION BY passengerId ORDER BY passengerId, date) as seq 
FROM
    flightData f
ORDER BY 
    passengerId, date
"""

val passageSequenced = spark.sql(query)
passageSequenced
    .filter($"passengerId" === 22)
    .show()
passageSequenced.createOrReplaceTempView("passageSequenced")

Name: org.apache.spark.sql.AnalysisException
Message: Table or view not found: flightData; line 6 pos 4
StackTrace:   at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:47)
  at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveRelations$$lookupTableFromCatalog(Analyzer.scala:731)
  at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.resolveRelation(Analyzer.scala:683)
  at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:713)
  at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:706)
  at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$apply$1.apply(AnalysisHelper.scala:90)
  at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$apply$1.apply(A

## LAG / LEAD 
Flights starting and ending at UK

In [145]:
val closedPassageQuery = """
SELECT 
    passengerId, 
    from, to, 
    direction, 
    seq,
    -------------------------------------------------------------------------------- 
    -- For a departure flight, take the the return flight, if there is, seq num
    -------------------------------------------------------------------------------- 
    CASE 
        WHEN direction == 1
        THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq)
    END AS return,
    -------------------------------------------------------------------------------- 
    -- For a departure flight, count the visiting countries, if returned.
    -------------------------------------------------------------------------------- 
    CASE 
        WHEN direction == 1
        THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq) - seq
    END AS countries
FROM passageSequenced p
WHERE 
    direction != 0
    AND EXISTS (  
        SELECT passengerId
        FROM
            passageSequenced
        WHERE 
            direction != 0 AND
            passengerId == p.passengerId
        GROUP BY
            passengerId
        Having count(DISTINCT direction) == 2
    )
ORDER BY 
    passengerId, seq
"""

val closedPassages = spark.sql(closedPassageQuery)
closedPassages.show()
closedPassages.createOrReplaceTempView("closedPassages")

+-----------+----+---+---------+---+------+---------+
|passengerId|from| to|direction|seq|return|countries|
+-----------+----+---+---------+---+------+---------+
|         16|  cg| uk|       -1|  5|  null|     null|
|         16|  uk| cg|        1|  6|  null|     null|
|         22|  iq| uk|       -1| 10|  null|     null|
|         22|  uk| nl|        1| 11|    15|        4|
|         22|  at| uk|       -1| 15|  null|     null|
|         22|  uk| bm|        1| 16|  null|     null|
|         52|  se| uk|       -1|  6|  null|     null|
|         52|  uk| cn|        1|  7|  null|     null|
|         53|  ch| uk|       -1|  6|  null|     null|
|         53|  uk| se|        1|  7|     8|        1|
|         53|  se| uk|       -1|  8|  null|     null|
|         53|  uk| tj|        1|  9|    13|        4|
|         53|  th| uk|       -1| 13|  null|     null|
|         72|  tj| uk|       -1|  9|  null|     null|
|         72|  uk| iq|        1| 10|  null|     null|
|         82|  iq| uk|      

closedPassageQuery = 


lastException: Throwable = null
"
SELECT
    passengerId,
    from, to,
    direction,
    seq,
    --------------------------------------------------------------------------------
    -- For a departure flight, take the the return flight, if there is, seq num
    --------------------------------------------------------------------------------
    CASE
        WHEN direction == 1
        THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq)
    END AS return,
    --------------------------------------------------------------------------------
    -- For a departure flight, count the visiting countries, if returned.
    --------------------------------------------------------------------------------
    CASE
        WHEN direction == 1
        THEN lead(seq) OVER (PARTITION BY passeng...


In [146]:
var queryVisitedCountries = """
SELECT 
    passengerId,
    max(countries) as countries
FROM closedPassages
WHERE 
    countries IS NOT NULL
GROUP BY 
    passengerId
ORDER BY 
    passengerId
"""

val visitedCountries = spark.sql(queryVisitedCountries)
visitedCountries.show()

+-----------+---------+
|passengerId|countries|
+-----------+---------+
|         22|        4|
|         53|        4|
|        167|        2|
|        204|        3|
|        227|        1|
|        258|        3|
|        281|        9|
|        305|        3|
|        309|        9|
|        313|        8|
|        315|        3|
|        334|        9|
|        340|        5|
|        348|        3|
|        386|        1|
|        478|        1|
|        494|        9|
|        529|        2|
|        615|        4|
|        652|        1|
+-----------+---------+
only showing top 20 rows



queryVisitedCountries = 
visitedCountries = [passengerId: int, countries: int]


"
SELECT
    passengerId,
    max(countries) as countries
FROM closedPassages
WHERE
    countries IS NOT NULL
GROUP BY
    passengerId
ORDER BY
    passengerId
"


[passengerId: int, countries: int]