# Question 04
Find the passengers who have been on more than N flights together within the range (from,to).

## Assumptions
1. Data is cleaned and not erroneous
2. Timezone consideration is not required

## Approaches

1. SQL self-join that pairs each passenger with every other passenger (a different passengerId) on the same flightId.
2. Matrix M<sup>T</sup> * M
With M the flight-by-passenger 0/1 matrix, the self-product matrix M<sup>T</sup> * M has on its diagonal the number of flights of the respective passenger, and its upper-right part (row = passenger 1, column = passenger 2) gives how many flights passenger 2 shares with passenger 1.

### TODO
Implement the matrix approach.

# Setup

In [1]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark partition control based on core availability

In [2]:
// Number of local worker threads to give Spark, and the number of partitions
// to schedule per thread.
val NUM_CORES = 4
val NUM_PARTITIONS = 3

// "local" runs Spark with a single worker thread; "local[N]" is required for
// NUM_CORES to actually be used.
// spark.default.parallelism is passed through the builder so it is part of the
// configuration the session is created with, instead of being set afterwards.
// NOTE(review): if a session already exists (e.g. one provided by the notebook
// kernel), getOrCreate() returns it and the master setting here is not applied.
lazy val spark: SparkSession = SparkSession.builder()
    .master(s"local[$NUM_CORES]")
    .appName("flight")
    .config("spark.default.parallelism", (NUM_CORES * NUM_PARTITIONS).toLong)
    .getOrCreate()

// Shuffle partition count is a SQL setting and can be changed at runtime.
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)

import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

## Constants

In [3]:
// --- Input files ------------------------------------------------------------
val CSV_DELIMITER: String = ","
val FLIGHTDATA_CSV_PATH: String = "../resources/flightData.csv"
val PASSENGER_CSV_PATH: String = "../resources/passengers.csv"

// --- Query parameters -------------------------------------------------------
// Pattern used to parse the date column and the range bounds below.
val DATE_FORMAT: String = "yyyy-MM-dd"
// Bounds of the flight date range that is searched.
val FLIGHT_DATE_FROM: String = "2017-01-01"
val FLIGHT_DATE_TO: String   = "2017-12-31"
// Threshold N: pairs must have shared MORE than this many flights.
val NUM_FLIGHT_TOGETHER: Int = 3

// --- Output -----------------------------------------------------------------
val RESULT_DIR: String = "results/flightsTogether"

CSV_DELIMITER = ,
FLIGHTDATA_CSV_PATH = ../resources/flightData.csv
PASSENGER_CSV_PATH = ../resources/passengers.csv
DATE_FORMAT = yyyy-MM-dd
FLIGHT_DATE_FROM = 2017-01-01
FLIGHT_DATE_TO = 2017-12-31
NUM_FLIGHT_TOGETHER = 3
RESULT_DIR = results/flightsTogether


results/flightsTogether

# Tools

### Elapsed time profiler

In [4]:
import scala.collection.mutable.ListBuffer

// Human-readable log line for every timed run since the last clear().
val timing = new StringBuffer
// Elapsed time in ms of every timed run since the last clear().
val times = new ListBuffer[Long]()

/**
Discards the timing log and all recorded durations.
*/
def clear(): Unit = {
    timing.setLength(0)
    times.clear()
}

/**
Mean of the recorded durations, using integer division.
@return Average elapsed time in ms, or 0 when nothing has been recorded yet
*/
def average(): Long = {
    // reduce on an empty buffer throws, so the no-data case is handled explicitly.
    if (times.isEmpty) 0L else times.sum / times.length
}

/**
Evaluates a block of code once and records how long it took.
The duration is appended to `times` and a log line is appended to `timing`.
@tparam T Result type of the code block
@param label Description about the run
@param code Code to execute (passed by name, evaluated exactly once here)
@return Result of the executed code
*/
def timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val elapsed = System.currentTimeMillis() - start
    timing.append(s"Processing $label took $elapsed ms.\n")
    times.append(elapsed)
    result
}

timing = 
times = ListBuffer()


clear: ()Unit
average: ()Long
timed: [T](label: String, code: => T)T


ListBuffer()

In [5]:
// Dummy statement: its only purpose is to flush out the spurious REPL error
// "missing argument list for method timed" (visible in this cell's output),
// presumably so that it does not show up in the output of a later cell.
println("")

<console>:45: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


### Save to file

In [6]:
/**
Writes a DataFrame to RESULT_DIR as CSV with a header row, overwriting any
output already there. The frame is coalesced to a single partition first.
@param df DataFrame to write
*/
def save(df: DataFrame): Unit = {
    val singlePartition = df.coalesce(1)
    val csvWriter = singlePartition.write
        .format("csv")
        .mode(SaveMode.Overwrite)
        .option("header", "true")
    csvWriter.save(RESULT_DIR)
}

save: (df: org.apache.spark.sql.DataFrame)Unit


### Spark SQL runner

In [7]:
/**
Run the SparkSQL query repeatedly and report how long each run takes.

Every run is a `show(5)` on the same DataFrame, wrapped in `timed`. After each
run the accumulated timing log and the running average are printed. Once all
runs are done the RDD lineage is printed and, optionally, the result is saved.

@param label Label to describe this run in the timing log
@param query SQL statement to execute
@param repeats Number of timed runs; if it is not positive nothing is timed and 0 is returned
@param toSave Whether to write the result with `save` after the timed runs (default false)
@return Average execution time in msec over the timed runs
*/
def run(label: String, query: String, repeats: Int, toSave: Boolean = false): Long = {
    val result = spark.sql(query)

    clear()
    (0 until repeats).foreach { _ =>
        timed(
            label,
            result.show(5)
        )
        println(timing)
        println(s"Average time ${average()} ms")
    }
    println(result.rdd.toDebugString)

    if (toSave) save(result)
    // No run was recorded when repeats <= 0, so there is nothing to average.
    if (repeats > 0) average() else 0L
}

run: (label: String, query: String, repeats: Int, toSave: Boolean)Long


# Main

## Base DataFrame

In [8]:
// Base DataFrame of flight records (passengerId, flightId, date) read from CSV.
// Only transformations are declared here; no action is called explicitly.
// NOTE(review): with inferSchema enabled Spark may already scan the file at
// load time to determine the column types.
// The delimiter, date format and path come from the shared constants so this
// reader cannot drift away from the values the query uses.
val flightData = spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", CSV_DELIMITER)
    .option("dateFormat", DATE_FORMAT)
    .option("inferSchema", "true")
    .load(FLIGHTDATA_CSV_PATH)
    .select(
        "passengerId",
        "flightId",
        "date"
    )

flightData = [passengerId: int, flightId: int ... 1 more field]


[passengerId: int, flightId: int ... 1 more field]

## SQL

In [9]:
// Spark SQL text that lists pairs of passengers who shared more than
// NUM_FLIGHT_TOGETHER flights between FLIGHT_DATE_FROM and FLIGHT_DATE_TO
// (both bounds inclusive, parsed with DATE_FORMAT). All four constants are
// interpolated into the text when this val is evaluated.
//
// Shape of the query:
//   * CTE more_than_n_flights: passengers with more than N flights in the whole
//     table (no date filter); joined to the left side `f` only.
//   * Self-join of flightData (f, s) on flightId, excluding rows where both
//     sides are the same passenger, with the date range applied to both sides.
//   * Grouped per (f.passengerId, s.passengerId); pairs with a count > N are
//     kept and returned ordered by both ids.
//
// NOTE(review): the passenger filter is `!=`, so each pair appears to be
// reported in both orders, (A, B) and (B, A) -- the sampled output contains a
// row with Passenger 1 = 56, Passenger 2 = 22. Use `<` instead if every pair
// should be listed once; confirm the intended output first.
// NOTE(review): the ORDER BY inside the CTE does not determine the order of the
// final result (the outer query has its own ORDER BY) and looks removable.
val queryFlightsTogether = s"""
WITH 
    --------------------------------------------------------------------------------
    -- Passengers flew more than NUM_FLIGHT_TOGETHER times.
    --------------------------------------------------------------------------------
    more_than_n_flights AS (
        SELECT passengerId
        FROM flightData
        GROUP BY passengerId
        HAVING count(flightId) > $NUM_FLIGHT_TOGETHER
        ORDER BY passengerId
    )

SELECT 
    f.passengerId AS `Passenger 1 ID`, 
    s.passengerId AS `Passenger 2 ID`, 
    count(s.flightId) AS `Number of flights together`,
    '$FLIGHT_DATE_FROM' as From,
    '$FLIGHT_DATE_TO' as To
FROM
    flightData f 
    --------------------------------------------------------------------------------
    -- Passengers more than NUM_FLIGHT_TOGETHER flights
    --------------------------------------------------------------------------------
    INNER JOIN more_than_n_flights m 
        ON f.passengerId == m.passengerId
    --------------------------------------------------------------------------------
    -- Passengers who shared same flights
    --------------------------------------------------------------------------------
    INNER JOIN flightData s 
        ON f.flightId == s.flightId
WHERE
    f.passengerId != s.passengerId AND
    f.date >= to_timestamp('$FLIGHT_DATE_FROM', '$DATE_FORMAT') AND
    f.date <= to_timestamp('$FLIGHT_DATE_TO',   '$DATE_FORMAT') AND
    s.date >= to_timestamp('$FLIGHT_DATE_FROM', '$DATE_FORMAT') AND
    s.date <= to_timestamp('$FLIGHT_DATE_TO',   '$DATE_FORMAT')
GROUP BY 
    f.passengerId, s.passengerId
HAVING 
    count(s.flightId) > $NUM_FLIGHT_TOGETHER
ORDER BY 
    f.passengerId, s.passengerId
"""

queryFlightsTogether = 


"
WITH
    --------------------------------------------------------------------------------
    -- Passengers flew more than NUM_FLIGHT_TOGETHER times.
    --------------------------------------------------------------------------------
    more_than_n_flights AS (
        SELECT passengerId
        FROM flightData
        GROUP BY passengerId
        HAVING count(flightId) > 3
        ORDER BY passengerId
    )
SELECT
    f.passengerId AS `Passenger 1 ID`,
    s.passengerId AS `Passenger 2 ID`,
    count(s.flightId) AS `Number of flights together`,
    '2017-01-01' as From,
    '2017-12-31' as To
FROM
    flightData f
    --------------------------------------------------------------------------------
    -- Passengers more than NUM_FLIGHT_TOGETHER fligh...


## Order by none

In [10]:
// Baseline: cache the base DataFrame as loaded, without any explicit ordering,
// expose it to SQL as "flightData", and time the query on it.
val df = flightData
    .persist()

df.createOrReplaceTempView("flightData")
val timeOrderNone = run(
    "Order by none",
    queryFlightsTogether,
    repeats = 3,
    toSave = true
)
// The view was registered as "flightData"; dropping "df" removed nothing.
spark.catalog.dropTempView("flightData")

df.unpersist()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by none took 12293 ms.

Average time 12293 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+-----

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderNone = 10312


[passengerId: int, flightId: int ... 1 more field]

## Order by (passengerId)

In [11]:
// Variant: cache the data sorted by passengerId, expose it to SQL as
// "flightData", and time the same query on it.
val df = flightData
    .orderBy("passengerId")
    .persist()

df.createOrReplaceTempView("flightData")
val timeOrderPassenger = run(
    "Order by (passengerId)",
    queryFlightsTogether,
    repeats = 3,
    toSave = true
)
// The view was registered as "flightData"; dropping "df" removed nothing.
spark.catalog.dropTempView("flightData")

df.unpersist()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (passengerId) took 4529 ms.

Average time 4529 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+-------------

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderPassenger = 3582


[passengerId: int, flightId: int ... 1 more field]

## Order by (flightId)

In [12]:
// Variant: cache the data sorted by flightId, expose it to SQL as
// "flightData", and time the same query on it.
val df = flightData
    .orderBy("flightId")
    .persist()

df.createOrReplaceTempView("flightData")
val timeOrderFlight = run(
    "Order by (flightId)",
    queryFlightsTogether,
    repeats = 3
)
// The view was registered as "flightData"; dropping "df" removed nothing.
spark.catalog.dropTempView("flightData")

df.unpersist()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (flightId) took 7048 ms.

Average time 7048 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+-

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderFlight = 6280


[passengerId: int, flightId: int ... 1 more field]

## Order by ("passengerId", "flightId")

In [13]:
// Variant: cache the data sorted by (passengerId, flightId), expose it to SQL
// as "flightData", and time the same query on it.
val df = flightData
    .orderBy("passengerId", "flightId")
    .persist()

df.createOrReplaceTempView("flightData")
val timeOrderPassengerFlight = run(
    "Order by (passengerId, flightId)",
    queryFlightsTogether,
    repeats = 3
)
// The view was registered as "flightData"; dropping "df" removed nothing.
spark.catalog.dropTempView("flightData")

df.unpersist()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (passengerId, flightId) took 6211 ms.

Average time 6211 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+---

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderPassengerFlight = 5458


[passengerId: int, flightId: int ... 1 more field]

## Order by ("passengerId", "date")

In [14]:
// Variant: cache the data sorted by (passengerId, date), expose it to SQL as
// "flightData", and time the same query on it.
val df = flightData
    .orderBy("passengerId", "date")
    .persist()

df.createOrReplaceTempView("flightData")
val timeOrderPassengerDate = run(
    "Order by (passengerId, date)",
    queryFlightsTogether,
    repeats = 3
)
// The "flightData" view is left registered on purpose: the validation queries
// further down read from it. The former dropTempView("df") call named a view
// that was never registered, so it removed nothing and has been taken out.

df.unpersist()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|            37|                         4|2017-01-01|2017-12-31|
|             1|            38|                         4|2017-01-01|2017-12-31|
|             1|            76|                         4|2017-01-01|2017-12-31|
|             1|           120|                         4|2017-01-01|2017-12-31|
|             1|          1694|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+
only showing top 5 rows

Processing Order by (passengerId, date) took 6103 ms.

Average time 6103 ms
+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+-------

df = [passengerId: int, flightId: int ... 1 more field]
timeOrderPassengerDate = 5463


[passengerId: int, flightId: int ... 1 more field]

## Elapsed Time

In [15]:
// Summary of the average query time measured for each ordering of the cached data.
val report =
    s"""
       |Order by:
       |Non                     is $timeOrderNone ms
       |(passengerId)           is $timeOrderPassenger ms
       |(flightId)              is $timeOrderFlight ms
       |(passengerId, date)     is $timeOrderPassengerDate ms
       |(passengerId, flightId) is $timeOrderPassengerFlight ms
       |""".stripMargin
println(report)


Order by:
Non                     is 10312 ms
(passengerId)           is 3582 ms
(flightId)              is 6280 ms
(passengerId, date)     is 5463 ms
(passengerId, flightId) is 5458 ms



report = 


"
Order by:
Non                     is 10312 ms
(passengerId)           is 3582 ms
(flightId)              is 6280 ms
(passengerId, date)     is 5463 ms
(passengerId, flightId) is 5458 ms
"


# Validations

In [16]:
// Validation for passengers 1, 37 and 38 (they appear in the first result rows).
// Both statements read from the "flightData" temp view.

// 1) Each of them must have more than NUM_FLIGHT_TOGETHER flights in total.
var query = s"""
SELECT passengerId, count(flightId) as count
FROM flightData
WHERE passengerId in (1, 37, 38)
GROUP BY passengerId
HAVING count(flightId) > $NUM_FLIGHT_TOGETHER
ORDER BY passengerId
"""
spark.sql(query).show()

// 2) List their flights so the shared flightIds can be compared by eye.
query = """
SELECT passengerId, flightId
FROM flightData
WHERE passengerId in (1, 37, 38)
ORDER BY passengerId
"""
spark.sql(query).show()

+-----------+-----+
|passengerId|count|
+-----------+-----+
|          1|    5|
|         37|    4|
|         38|    4|
+-----------+-----+

+-----------+--------+
|passengerId|flightId|
+-----------+--------+
|          1|       0|
|          1|     901|
|          1|     940|
|          1|     972|
|          1|     993|
|         37|       0|
|         37|     901|
|         37|     940|
|         37|     972|
|         38|       0|
|         38|     901|
|         38|     940|
|         38|     972|
+-----------+--------+



query = 
query = 


"
SELECT passengerId, flightId
FROM flightData
WHERE passengerId in (1, 37, 38)
ORDER BY passengerId
"
SELECT passengerId, flightId
FROM flightData
WHERE passengerId in (1, 37, 38)
ORDER BY passengerId


### Random samples

In [17]:
// Spot check: show a random sample (fraction 0.01, fixed seed, no replacement)
// of the pairs returned by the main query.
val seed = 42
val withReplacement = false
val fraction = 0.01
val sampledPairs = spark.sql(queryFlightsTogether)
    .sample(withReplacement, fraction, seed)
sampledPairs.show()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|             1|          2658|                         4|2017-01-01|2017-12-31|
|            18|          7544|                         5|2017-01-01|2017-12-31|
|            18|         13862|                         4|2017-01-01|2017-12-31|
|            53|          6769|                         5|2017-01-01|2017-12-31|
|            56|            22|                         4|2017-01-01|2017-12-31|
|            58|          1328|                         5|2017-01-01|2017-12-31|
|            64|          3087|                         6|2017-01-01|2017-12-31|
|            92|          9466|                         5|2017-01-01|2017-12-31|
|           117|          7551|                         6|2017-01-01|2017-12-31|
|           119|          36

seed = 42
withReplacement = false
fraction = 0.01


0.01

In [18]:
// Manual cross-check of two result rows against the raw CSV, done outside Spark
// with shell tools. These notes are kept as comments; they were previously a
// bare string literal, i.e. an expression statement whose value was discarded.
//
// Flight ids shared by passengers 1381 and 58 (7 flights):
//
// comm -12 \
// <(cat ../../../main/resources/flightData.csv | awk '{FS=","} /^1381,/{print $2}' | sort ) \
// <(cat ../../../main/resources/flightData.csv | awk '{FS=","} /^58,/{print $2}' | sort)
//
// 131
// 189
// 217
// 247
// 272
// 283
// 331
//
// Flight ids shared by passengers 58 and 2942 (4 flights):
//
// comm -12 \
// <(cat ../../../main/resources/flightData.csv | awk '{FS=","} /^58,/{print $2}' | sort ) \
// <(cat ../../../main/resources/flightData.csv | awk '{FS=","} /^2942,/{print $2}' | sort)
//
// 131
// 189
// 217
// 247

// The same two pairs taken from the query result, for comparison with the
// counts above.
val filter01 = col("Passenger 2 ID") === 2942
val filter02 = col("Passenger 2 ID") === 1381
spark.sql(queryFlightsTogether)
    .where(col("Passenger 1 ID") === 58 && (filter01 || filter02))
    .show()

+--------------+--------------+--------------------------+----------+----------+
|Passenger 1 ID|Passenger 2 ID|Number of flights together|      From|        To|
+--------------+--------------+--------------------------+----------+----------+
|            58|          1381|                         7|2017-01-01|2017-12-31|
|            58|          2942|                         4|2017-01-01|2017-12-31|
+--------------+--------------+--------------------------+----------+----------+



filter01 = (Passenger 2 ID = 2942)
filter02 = (Passenger 2 ID = 1381)


"""
^


(Passenger 2 ID = 1381)