# Question 04
Find the passengers who have been on more than N flights together within the range (from,to).

## Assumptions
1. Data is clearned and not errorneous
2. Timezone consideration is not required

## TODO
Implement a matrix way. By creating M<sup>T</sup> * M, the diagonal represents the number of flghts of each passenger, and right top part represents how many flights (r/passenger 1, c/passenger2)


# Setup

In [1]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [2]:
val NUM_CORES = 4
val NUM_PARTITIONS = 3

lazy val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("flight")
    .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)

import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

## Constants

In [3]:
val CSV_DELIMITER = ","
val FLIGHTDATA_CSV_PATH = "../resources/flightData.csv"
val PASSENGER_CSV_PATH = "../resources/passengers.csv"

val DATE_FORMAT = "yyyy-MM-dd"
val FLIGHT_DATE_FROM = "2017-01-01"
val FLIGHT_DATE_TO   = "2017-12-31"
val NUM_FLIGHT_TOGETHER = 3

val RESULT_DIR = "results/flightsTogether"

CSV_DELIMITER = ,
FLIGHTDATA_CSV_PATH = ../resources/flightData.csv
PASSENGER_CSV_PATH = ../resources/passengers.csv
DATE_FORMAT = yyyy-MM-dd
FLIGHT_DATE_FROM = 2017-01-01
FLIGHT_DATE_TO = 2017-12-31
NUM_FLIGHT_TOGETHER = 3
RESULT_DIR = results/flightsTogether


results/flightsTogether

# Tools

### Elapsed time profiler

In [4]:
import scala.collection.mutable.ListBuffer

val timing = new StringBuffer
val times = new ListBuffer[Long]()

def clear(): Unit = {
    timing.setLength(0)
    times.clear
}
def average(): Long = {
    times.reduce(_+_) / times.length
}

/**
@param label Description about the run
@code code to execute
@return execution
*/
def timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    times.append(stop - start)
    result
}

timing = 
times = ListBuffer()


clear: ()Unit
average: ()Long
timed: [T](label: String, code: => T)T


ListBuffer()

In [5]:
// To flush out error: missing argument list for method timed
println("")

<console>:45: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


### Save to file

In [6]:
def save(df: DataFrame) = {
    df.coalesce(1)
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR)
}

save: (df: org.apache.spark.sql.DataFrame)Unit


### Spark SQL runner

In [7]:
/**
Run the SparkSQL 
@param label Lable to describe this run
@param query SQL 
@param repeats Number of run
@return Average execution time in msec
*/
def run(label: String, query: String, repeats: Int, toSave: Boolean = false): Long = {
    val result = spark.sql(query)

    clear()
    for (i <- (0 until repeats)){
        timed(
            label,
            result.show(5)
        )
        println(timing)
        println(s"Average time $average ms")
    }
    println(result.rdd.toDebugString)    

    if(toSave) save(result)
    average
}

run: (label: String, query: String, repeats: Int, toSave: Boolean)Long


# Main

## Base DataFrame

In [8]:
// Transformations, no action yet
val flightData = spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("dateFormat", "yyyy-MM-dd")
    .option("inferSchema", "true")
    .load("../resources/flightData.csv")
    .select(
        "passengerId",
        "flightId",
        "date"
    )

flightData = [passengerId: int, flightId: int ... 1 more field]


[passengerId: int, flightId: int ... 1 more field]

In [9]:
flightData.select(countDistinct("date")).show()
flightData.select(min("date")).show()
flightData.select(max("date")).show()

flightData.select(countDistinct("passengerId")).show()
flightData.select(min("passengerId")).show()
flightData.select(max("passengerId")).show()

flightData.select(countDistinct("flightId")).show()
flightData.select(min("flightId")).show()
flightData.select(max("flightId")).show()

+--------------------+
|count(DISTINCT date)|
+--------------------+
|                 342|
+--------------------+

+-------------------+
|          min(date)|
+-------------------+
|2017-01-01 00:00:00|
+-------------------+

+-------------------+
|          max(date)|
+-------------------+
|2017-12-31 00:00:00|
+-------------------+

+---------------------------+
|count(DISTINCT passengerId)|
+---------------------------+
|                      15500|
+---------------------------+

+----------------+
|min(passengerId)|
+----------------+
|               1|
+----------------+

+----------------+
|max(passengerId)|
+----------------+
|           15500|
+----------------+

+------------------------+
|count(DISTINCT flightId)|
+------------------------+
|                    1000|
+------------------------+

+-------------+
|min(flightId)|
+-------------+
|            0|
+-------------+

+-------------+
|max(flightId)|
+-------------+
|          999|
+-------------+



## SQL

In [10]:
val queryFlightsTogether = s"""
SELECT 
    f.passengerId AS `Passenger 1 ID`, 
    s.passengerId AS `Passenger 2 ID`, 
    count(s.flightId) AS `Number of flights together`,
    '$FLIGHT_DATE_FROM' as From,
    '$FLIGHT_DATE_TO' as To
FROM
    flightData f INNER JOIN flightData s
    ON f.flightId == s.flightId
WHERE 
    f.passengerId != s.passengerId
    AND f.date >= to_timestamp('$FLIGHT_DATE_FROM', '$DATE_FORMAT')
    AND f.date <= to_timestamp('$FLIGHT_DATE_TO',   '$DATE_FORMAT')
    AND s.date >= to_timestamp('$FLIGHT_DATE_FROM', '$DATE_FORMAT')
    AND s.date <= to_timestamp('$FLIGHT_DATE_TO',   '$DATE_FORMAT')
GROUP BY 
    f.passengerId, s.passengerId
HAVING 
    count(s.flightId) > $NUM_FLIGHT_TOGETHER
ORDER BY 
    f.passengerId, s.passengerId
    
"""

queryFlightsTogether = 


"
SELECT
    f.passengerId AS `Passenger 1 ID`,
    s.passengerId AS `Passenger 2 ID`,
    count(s.flightId) AS `Number of flights together`,
    '2017-01-01' as From,
    '2017-12-31' as To
FROM
    flightData f INNER JOIN flightData s
    ON f.flightId == s.flightId
WHERE
    f.passengerId != s.passengerId
    AND f.date >= to_timestamp('2017-01-01', 'yyyy-MM-dd')
    AND f.date <= to_timestamp('2017-12-31',   'yyyy-MM-dd')
    AND s.date >= to_timestamp('2017-01-01', 'yyyy-MM-dd')
    AND s.date <= to_timestamp('2017-12-31',   'yyyy-MM-dd')
GROUP BY
    f.passengerId, s.passengerId
HAVING
    count(s.flightId) > 3
ORDER BY
    f.passengerId, s.passengerId
"


## Order by (passengerId)

In [16]:
var df = flightData
    .orderBy("passengerId")
    .persist

df.createOrReplaceTempView("flightData")
val timeOrderPassenger = run(
    "Order by (passengerId)",
    queryFlightsTogether,
    3,
    true
)
spark.catalog.dropTempView("df")

df.unpersist

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (passengerId) took 4494 ms.

Average time 4494 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+-------------

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderPassenger = 3751


[passengerId: int, flightId: int ... 1 more field]

## Order by (flightId)

In [12]:
var df = flightData
    .orderBy("flightId")
    .persist

df.createOrReplaceTempView("flightData")
val timeOrderFlight = run(
    "Order by (flightId)",
    queryFlightsTogether,
    3
)
spark.catalog.dropTempView("df")

df.unpersist

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (flightId) took 7593 ms.

Average time 7593 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+-

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderFlight = 6656


[passengerId: int, flightId: int ... 1 more field]

## Order by ("passengerId", "flightId")

In [13]:
var df = flightData
    .orderBy("passengerId", "flightId")
    .persist

df.createOrReplaceTempView("flightData")
val timeOrderPassengerFlight= run(
    "Order by (passengerId, flightId)",
    queryFlightsTogether,
    3
)
spark.catalog.dropTempView("df")

df.unpersist

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (passengerId, flightId) took 6750 ms.

Average time 6750 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+---

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderPassengerFlight = 5734


[passengerId: int, flightId: int ... 1 more field]

## Order by ("passengerId", "date")¶

In [14]:
var df = flightData
    .orderBy("passengerId", "date")
    .persist

df.createOrReplaceTempView("flightData")
val timeOrderPassengerDate = run(
    "Order by (passengerId, date)",
    queryFlightsTogether,
    3
)
spark.catalog.dropTempView("df")

df.unpersist

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (passengerId, date) took 6491 ms.

Average time 6491 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+-------

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderPassengerDate = 6229


[passengerId: int, flightId: int ... 1 more field]

## Elepased Time

In [15]:
val report = s"""
Order by:
(passengerId)           is $timeOrderPassenger ms
(flightId)              is $timeOrderFlight ms
(passengerId, date)     is $timeOrderPassengerDate ms
(passengerId, flightId) is $timeOrderPassengerFlight ms
"""
println(report)


Order by:
(passengerId)           is 4257 ms
(flightId)              is 6656 ms
(passengerId, date)     is 6229 ms
(passengerId, flightId) is 5734 ms



report = 


"
Order by:
(passengerId)           is 4257 ms
(flightId)              is 6656 ms
(passengerId, date)     is 6229 ms
(passengerId, flightId) is 5734 ms
"
