# Question 03
Find the greatest number of countries a passenger has been in without being in the UK. For example, if the countries a passenger was in were: UK -> FR -> US -> CN -> UK -> DE -> UK, the correct answer would be 3 countries.

## Assumptions
1. Data is clearned and not errorneous
2. Timezone consideration is not required

# Setup

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [2]:
val NUM_CORES = 4
val NUM_PARTITIONS = 3

lazy val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("flight")
    .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)

import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

# Tools

### Elapsed time profiler

In [3]:
val timing = new StringBuffer
def timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    result
}

timing = 


timed: [T](label: String, code: => T)T


In [4]:
// To flush out error: missing argument list for method timed
println("")

<console>:46: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


# Main

## Constants

In [5]:
val FLIGHTDATA_CSV_PATH = "../resources/flightData.csv"
val PASSENGER_CSV_PATH = "../resources/passengers.csv"
val RESULT_DIR = "results/longestRun"

FLIGHTDATA_CSV_PATH = ../resources/flightData.csv
PASSENGER_CSV_PATH = ../resources/passengers.csv
RESULT_DIR = results/longestRun


results/longestRun

### UDF

In [6]:
//val BASE_TIMESTAMP = java.sql.Timestamp.valueOf("2017-01-01 00:00:00.0")
val BASE_LOCALDATE = LocalDate.parse("2017-01-01").withDayOfMonth(1)

def get_months_between(to: Timestamp): Short = {
    val monthsBetween = ChronoUnit.MONTHS.between(
        BASE_LOCALDATE,
        to.toLocalDateTime().toLocalDate().withDayOfMonth(1)
    )
    monthsBetween.toShort
}
val udf_months_between = udf((t:Timestamp) => get_months_between(t))

BASE_LOCALDATE = 2017-01-01
udf_months_between = UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))


get_months_between: (to: java.sql.Timestamp)Short


UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))

In [7]:
// Transformations, no action yet
val flightData = spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("dateFormat", "yyyy-MM-dd")
    .option("inferSchema", "true")
    .load("../resources/flightData.csv")
    .withColumn(
        "direction", 
        when(lower(col("from")) === "uk", 1)
        .when(lower(col("to"))   === "uk", -1)
        .otherwise(0)
    )
    .withColumn(
        "count", lit(1)
    )
    .orderBy(asc("passengerId"), asc("date"))

flightData.printSchema()

root
 |-- passengerId: integer (nullable = true)
 |-- flightId: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- to: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- direction: integer (nullable = false)
 |-- count: integer (nullable = false)



flightData = [passengerId: int, flightId: int ... 5 more fields]


[passengerId: int, flightId: int ... 5 more fields]

In [8]:
flightData.createOrReplaceTempView("flightData")

In [9]:
//flightData.show(5)

Name: Syntax Error.
Message: 
StackTrace: 

In [10]:
val querySequencedRun = """
SELECT 
    f.*,
    ROW_NUMBER() OVER (PARTITION BY passengerId ORDER BY passengerId, date) as seq 
FROM
    flightData f
ORDER BY 
    passengerId, date
"""

val sequencedRun = spark.sql(querySequencedRun)
sequencedRun.createOrReplaceTempView("sequencedRun")

/* For debug only
sequencedRun
    .filter(col("passengerId") === 53)
    .show()
*/

querySequencedRun = 
sequencedRun = [passengerId: int, flightId: int ... 6 more fields]


"
SELECT
    f.*,
    ROW_NUMBER() OVER (PARTITION BY passengerId ORDER BY passengerId, date) as seq
FROM
    flightData f
ORDER BY
    passengerId, date
"


[passengerId: int, flightId: int ... 6 more fields]

## Longest run per passenger

In [11]:
val queryLongestRun = """
WITH 
    closedRun AS (
        SELECT 
            passengerId, 
            from, to, 
            direction, 
            seq,
            -------------------------------------------------------------------------------- 
            -- For a departure flight, take the the return flight, if there is, seq num
            -------------------------------------------------------------------------------- 
            CASE 
                WHEN direction == 1
                THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq)
            END AS return,
            -------------------------------------------------------------------------------- 
            -- For a departure flight, count the visiting countries, if returned.
            -------------------------------------------------------------------------------- 
            CASE 
                WHEN direction == 1
                THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq) - seq
            END AS countries
        FROM sequencedRun s
        WHERE 
            direction != 0
            AND EXISTS (  
                SELECT passengerId
                FROM
                    sequencedRun
                WHERE 
                    direction != 0 AND
                    passengerId == s.passengerId
                GROUP BY
                    passengerId
                Having count(DISTINCT direction) == 2
            )
        ORDER BY 
            passengerId, seq
    )
    

SELECT 
    passengerId,
    max(countries) as longestRun
FROM closedRun
WHERE 
    countries IS NOT NULL
GROUP BY 
    passengerId
ORDER BY 
    passengerId
"""

queryLongestRun = 


"
WITH
    closedRun AS (
        SELECT
            passengerId,
            from, to,
            direction,
            seq,
            --------------------------------------------------------------------------------
            -- For a departure flight, take the the return flight, if there is, seq num
            --------------------------------------------------------------------------------
            CASE
                WHEN direction == 1
                THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq)
            END AS return,
            --------------------------------------------------------------------------------
            -- For a departure flight, count the visiting countries, if returned.
            --------------------------...


In [12]:
val longestRun = spark.sql(queryLongestRun)

timed(
    "Run longest closed run.",
    longestRun.show(5)
)
println(timing)
println(sequencedRun.rdd.toDebugString)
println(longestRun.rdd.toDebugString)

longestRun
    .coalesce(1)
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR)

+-----------+----------+
|passengerId|longestRun|
+-----------+----------+
|         22|         4|
|         53|         4|
|        167|         2|
|        204|         3|
|        227|         1|
+-----------+----------+
only showing top 5 rows

Processing Run longest closed run. took 5827 ms.

(12) MapPartitionsRDD[78] at rdd at <console>:59 []
 |   MapPartitionsRDD[77] at rdd at <console>:59 []
 |   MapPartitionsRDD[76] at rdd at <console>:59 []
 |   ShuffledRowRDD[75] at rdd at <console>:59 []
 +-(12) MapPartitionsRDD[74] at rdd at <console>:59 []
    |   MapPartitionsRDD[70] at rdd at <console>:59 []
    |   MapPartitionsRDD[69] at rdd at <console>:59 []
    |   ShuffledRowRDD[68] at rdd at <console>:59 []
    +-(12) MapPartitionsRDD[67] at rdd at <console>:59 []
       |   MapPartitionsRDD[66] at rdd at <console>:59 []
       |   ShuffledRowRDD[65] at rdd at <console>:59 []
       +-(1) MapPartitionsRDD[64] at rdd at <console>:59 []
          |  MapPartitionsRDD[60] at rdd at 

longestRun = [passengerId: int, longestRun: int]


[passengerId: int, longestRun: int]

# Tests
TBD