# Question 01
Find the total number of flights for each month.

## Assumptions
1. Data is cleaned and not erroneous
2. Timezone consideration is not required

# Setup

In [14]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Partition control based on core availability

In [15]:
// Number of local worker threads to run Spark with.
val NUM_CORES = 4
// Partitions (tasks) to schedule per core.
val NUM_PARTITIONS = 3

// Local session sized from the constants above.
// - "local" alone runs a single worker thread, so the master is "local[NUM_CORES]".
// - spark.default.parallelism is read when the SparkContext is created, so it is
//   supplied through the builder rather than via spark.conf.set afterwards.
// NOTE(review): getOrCreate() returns an already-running session if the kernel has
// one, in which case the master/parallelism settings here are not applied - confirm.
lazy val spark: SparkSession = SparkSession.builder()
    .master(s"local[$NUM_CORES]")
    .appName("flight")
    .config("spark.default.parallelism", (NUM_CORES * NUM_PARTITIONS).toString)
    .getOrCreate()

// Shuffle partition count is a runtime SQL setting, so it can be set on the live session.
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

# Tools

### Elapsed time profiler

In [16]:
// Accumulates one "Processing <label> took <n> ms." line per completed timed() call.
val timing = new StringBuffer

/**
 * Evaluates `code` exactly once, records how long it took in `timing`, and
 * returns its result. Nothing is recorded if `code` throws.
 *
 * @param label text identifying the measured step in the log line
 * @param code  by-name expression to evaluate and measure
 * @return the value produced by `code`
 */
def timed[T](label: String, code: => T): T = {
    // nanoTime is a monotonic clock, so the measurement is not skewed by
    // wall-clock adjustments the way currentTimeMillis differences can be.
    val startNanos = System.nanoTime()
    val result = code
    val elapsedMs = (System.nanoTime() - startNanos) / 1000000L
    timing.append(s"Processing $label took $elapsedMs ms.\n")
    result
}

timing = 


timed: [T](label: String, code: => T)T


In [17]:
// Workaround cell: a no-op statement run so that the pending console error
// "missing argument list for method timed" (visible in this cell's output) is
// reported here instead of in a later cell.
// NOTE(review): presumably a notebook-kernel quirk - confirm it is still needed.
println("")

<console>:54: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


# Main

### UDF

In [18]:
// Reference month (January 2017) from which month offsets are counted.
val BASE_LOCALDATE = LocalDate.of(2017, 1, 1)

/**
 * Calendar-month offset of `to` relative to BASE_LOCALDATE.
 *
 * Only the year and month of `to` matter: the date is snapped to the first day
 * of its month before the difference is taken.
 *
 * @param to timestamp whose month offset is wanted
 * @return whole months between the base month and the month of `to`
 *         (0 for the base month itself, negative for earlier months)
 */
def get_months_between(to: Timestamp): Short = {
    val firstOfTargetMonth = to.toLocalDateTime().toLocalDate().withDayOfMonth(1)
    BASE_LOCALDATE.until(firstOfTargetMonth, ChronoUnit.MONTHS).toShort
}
// Column-level wrapper around get_months_between. A null timestamp cell would
// otherwise throw a NullPointerException inside the task, so the input is wrapped
// in Option: null in -> null out, and the result column is still a (nullable) short.
val udf_months_between = udf((t: Timestamp) => Option(t).map(ts => get_months_between(ts)))

BASE_LOCALDATE = 2017-01-01
udf_months_between = UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))


get_months_between: (to: java.sql.Timestamp)Short


UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))

## Constants

In [19]:
// Input CSV holding the flight records read by this notebook.
val FLIGHTDATA_CSV_PATH: String = "../resources/flightData.csv"

// Input CSV holding passenger records.
val PASSENGER_CSV_PATH: String = "../resources/passengers.csv"

// Output directory for the flights-per-month result.
val RESULT_DIR: String = "results/flightsPerMonth"

FLIGHTDATA_CSV_PATH = ../resources/flightData.csv
PASSENGER_CSV_PATH = ../resources/passengers.csv
RESULT_DIR = results/flightsPerMonth


results/flightsPerMonth

## Base DataFrame

In [20]:
// Base DataFrame: the flight CSV, read with a header row, comma delimiter and
// column types inferred from the data.
val flightData = spark.read
    .options(Map(
        "header" -> "true",
        "delimiter" -> ",",
        "dateFormat" -> "yyyy-MM-dd",
        "inferSchema" -> "true"
    ))
    .csv(FLIGHTDATA_CSV_PATH)

flightData = [passengerId: int, flightId: int ... 3 more fields]


[passengerId: int, flightId: int ... 3 more fields]

##  Total flights per month
A month covers the dates from its first day (inclusive) up to the first day of the next month (exclusive).

In [21]:
// Flights per month: de-duplicate (flightId, date) pairs, bucket them by the first
// day of their month, count, sort chronologically, and finally replace the month
// date with its offset from the base month.
val flightsPerMonth = {
    // One row per distinct (flightId, date) pair.
    val uniqueFlights = flightData
        .select(col("flightId"), col("date"))
        .dropDuplicates()

    // First day of the month each flight date falls in.
    val monthStart = trunc(col("date"), "month").alias("Month")

    uniqueFlights
        .groupBy(monthStart)
        .agg(count(col("flightId")).alias("Number of Flights"))
        .sort(col("Month").asc)
        .withColumn("Month", udf_months_between(col("Month")))
}

//flightsPerMonth.printSchema()

flightsPerMonth = [Month: smallint, Number of Flights: bigint]


[Month: smallint, Number of Flights: bigint]

In [22]:
// Time the action that computes and prints the monthly counts, then print the
// accumulated timing log followed by the RDD lineage of the result.
timed(label = "Run flights per month", code = flightsPerMonth.show())
println(timing.toString)
println(flightsPerMonth.rdd.toDebugString)

// Save the result as CSV with a header row, collapsed to a single partition and
// overwriting whatever a previous run left in RESULT_DIR.
flightsPerMonth
    .coalesce(1)
    .write
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .csv(RESULT_DIR)

+-----+-----------------+
|Month|Number of Flights|
+-----+-----------------+
|    0|               97|
|    1|               73|
|    2|               82|
|    3|               92|
|    4|               92|
|    5|               71|
|    6|               87|
|    7|               76|
|    8|               85|
|    9|               76|
|   10|               75|
|   11|               94|
+-----+-----------------+

Processing Run flights per month took 3123 ms.

(12) MapPartitionsRDD[34] at rdd at <console>:63 []
 |   MapPartitionsRDD[33] at rdd at <console>:63 []
 |   MapPartitionsRDD[32] at rdd at <console>:63 []
 |   ShuffledRowRDD[31] at rdd at <console>:63 []
 +-(12) MapPartitionsRDD[30] at rdd at <console>:63 []
    |   MapPartitionsRDD[26] at rdd at <console>:63 []
    |   ShuffledRowRDD[25] at rdd at <console>:63 []
    +-(12) MapPartitionsRDD[24] at rdd at <console>:63 []
       |   MapPartitionsRDD[23] at rdd at <console>:63 []
       |   ShuffledRowRDD[22] at rdd at <console>:

# Validation

In [23]:
// Date coverage: number of distinct dates plus the earliest/latest date and the
// month offsets they map to.
flightData
    .select(
        countDistinct(col("date")),
        min(col("date")),
        min(udf_months_between(col("date"))),
        max(col("date")),
        max(udf_months_between(col("date")))
    )
    .show()

// Same distinct-count / min / max summary for each id column, in the original order.
Seq("passengerId", "flightId").foreach { idColumn =>
    flightData
        .select(countDistinct(idColumn), min(idColumn), max(idColumn))
        .show()
}

+--------------------+-------------------+--------------+-------------------+--------------+
|count(DISTINCT date)|          min(date)|min(UDF(date))|          max(date)|max(UDF(date))|
+--------------------+-------------------+--------------+-------------------+--------------+
|                 342|2017-01-01 00:00:00|             0|2017-12-31 00:00:00|            11|
+--------------------+-------------------+--------------+-------------------+--------------+

+---------------------------+----------------+----------------+
|count(DISTINCT passengerId)|min(passengerId)|max(passengerId)|
+---------------------------+----------------+----------------+
|                      15500|               1|           15500|
+---------------------------+----------------+----------------+

+------------------------+-------------+-------------+
|count(DISTINCT flightId)|min(flightId)|max(flightId)|
+------------------------+-------------+-------------+
|                    1000|            0|        

## Assertions

In [24]:
// Number of distinct flight ids in the raw data.
val distinctFlights = flightData
    .select(countDistinct("flightId"))
    .first()
    .getLong(0)

// Sum of the per-month counts produced above.
val totalFlights = flightsPerMonth
    .select(sum("Number of Flights"))
    .first()
    .getLong(0)

// The monthly counts must add up to the number of distinct flights.
println(s"Distincts $distinctFlights Total $totalFlights")
require(distinctFlights == totalFlights)

Distincts 1000 Total 1000


distinctFlights = 1000
totalFlights = 1000


1000