# Question 02
Find the names of the 100 most frequent flyers.

## Assumptions
1. Data is clearned and not errorneous
2. Timezone consideration is not required

# Setup

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [2]:
val NUM_CORES = 4
val NUM_PARTITIONS = 3

lazy val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("flight")
    .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)

import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

# Tools

### Elapsed time profiler

In [3]:
val timing = new StringBuffer
def timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    result
}

timing = 


timed: [T](label: String, code: => T)T


# Main

## Constants

In [4]:
val DATE_FORMAT = "yyyy-MM-dd"

val FLIGHTDATA_CSV_PATH = "../resources/flightData.csv"
val PASSENGER_CSV_PATH = "../resources/passengers.csv"

val NUM_TOP_PASSENGER = 100
val RESULT_DIR = "results/topFrequentFlyers"

DATE_FORMAT = yyyy-MM-dd
FLIGHTDATA_CSV_PATH = ../resources/flightData.csv
PASSENGER_CSV_PATH = ../resources/passengers.csv
NUM_TOP_PASSENGER = 100
RESULT_DIR = results/topFrequentFlyers


<console>:46: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


results/topFrequentFlyers

In [5]:
// To flush out error: missing argument list for method timed
println("")

## Frequent Flyers
Top N flyers

In [6]:
val frequentFlyers = spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("dateFormat", DATE_FORMAT)
    .option("inferSchema", "true")
    .load(FLIGHTDATA_CSV_PATH)
    .select("passengerId", "flightId")
    .groupBy("passengerId")
    .count
    .orderBy(desc("count"))
    .withColumnRenamed("count", "numberOfFlights")
    //--------------------------------------------------------------------------------
    // TOP_N flyers 
    //--------------------------------------------------------------------------------
    .limit(NUM_TOP_PASSENGER)
    //--------------------------------------------------------------------------------
    // Re-sort for passengerId match
    //--------------------------------------------------------------------------------
    .orderBy(asc("passengerId"))
    .persist

frequentFlyers.printSchema()

root
 |-- passengerId: integer (nullable = true)
 |-- numberOfFlights: long (nullable = false)



frequentFlyers = [passengerId: int, numberOfFlights: bigint]


[passengerId: int, numberOfFlights: bigint]

##  Passenger lookup table

In [7]:
val LIMIT = 100
val passengers = spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .load(PASSENGER_CSV_PATH)
    //--------------------------------------------------------------------------------
    // Sort for passengerId match
    //--------------------------------------------------------------------------------
    .orderBy(asc("passengerId"))
    .persist

passengers.printSchema()

root
 |-- passengerId: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)



LIMIT = 100
passengers = [passengerId: int, firstName: string ... 1 more field]


[passengerId: int, firstName: string ... 1 more field]

## Frequent flyer listing

In [8]:
frequentFlyers.createOrReplaceTempView("frequentFlyers")
passengers.createOrReplaceTempView("passengers")

In [9]:
val queryTopFrequetFlyers = """
SELECT 
    f.passengerId AS passenger_id,
    f.numberOfFlights AS number_of_flights,
    p.firstName AS first_name,
    p.lastName as last_name
FROM
    frequentFlyers f
    INNER JOIN passengers p
    ON f.passengerId = p.passengerId
ORDER BY number_of_flights DESC
"""
val topFrequentFlyers = spark.sql(queryTopFrequetFlyers)

queryTopFrequetFlyers = 
topFrequentFlyers = [passenger_id: int, number_of_flights: bigint ... 2 more fields]


"
SELECT
    f.passengerId AS passenger_id,
    f.numberOfFlights AS number_of_flights,
    p.firstName AS first_name,
    p.lastName as last_name
FROM
    frequentFlyers f
    INNER JOIN passengers p
    ON f.passengerId = p.passengerId
ORDER BY number_of_flights DESC
"


[passenger_id: int, number_of_flights: bigint ... 2 more fields]

In [10]:
timed(
    "Query top frequent flyers from flight data.",
    topFrequentFlyers.show(5)
)
println(timing)
println(topFrequentFlyers.rdd.toDebugString)

topFrequentFlyers
    // Coalesce to save in the driver node as one file, otherwise no need
    .coalesce(1)   
    // .persist
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR) 

+------------+-----------------+----------+---------+
|passenger_id|number_of_flights|first_name|last_name|
+------------+-----------------+----------+---------+
|        2068|               32|   Yolande|     Pete|
|        4827|               27|     Jaime|    Renay|
|        1677|               27| Katherina| Vasiliki|
|        3173|               26|  Sunshine|    Scott|
|        8961|               26|     Ginny|    Clara|
+------------+-----------------+----------+---------+
only showing top 5 rows

Processing Query top frequent flyers from flight data. took 4777 ms.

(7) MapPartitionsRDD[64] at rdd at <console>:55 []
 |  MapPartitionsRDD[63] at rdd at <console>:55 []
 |  MapPartitionsRDD[62] at rdd at <console>:55 []
 |  ShuffledRowRDD[61] at rdd at <console>:55 []
 +-(12) MapPartitionsRDD[60] at rdd at <console>:55 []
    |   MapPartitionsRDD[56] at rdd at <console>:55 []
    |   MapPartitionsRDD[55] at rdd at <console>:55 []
    |   MapPartitionsRDD[54] at rdd at <console>:55 

# Test

TBD. Need to create test units.

### Simple bash test
```
$ cat flightData.csv | awk '/^139,/{ print }' | wc -w
20
```

In [11]:
val pids = Vector(20, 32)

var query = """
SELECT numberOfFlights
FROM frequentFlyers
WHERE passengerId IN (139, 2068)
"""

val flightOf139 = spark.sql(query)
for((value, index) <- flightOf139.collect().zipWithIndex){
    if (pids(index) != value.getLong(0).toInt) println("Frequent flyer ID %d has incorrect count %d".format(index, value))

}

pids = Vector(20, 32)
query = 
flightOf139 = [numberOfFlights: bigint]


"
SELECT numberOfFlights
FROM frequentFlyers
WHERE passengerId IN (139, 2068)
"


[numberOfFlights: bigint]

In [12]:
frequentFlyers.unpersist
passengers.unpersist

[passengerId: int, firstName: string ... 1 more field]