# Assumptions
1. Data is cleaned and not erroneous
2. Timezone consideration is not required

# Setup

In [68]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

## Partition control based on core availability

In [63]:
// Partition sizing: both settings below receive NUM_CORES * NUM_PARTITIONS.
val NUM_CORES = 4        // number of cores assumed available to this session
val NUM_PARTITIONS = 3   // multiplier applied per core

// Apply the same partition count to the SQL shuffle and default-parallelism settings.
// NOTE(review): spark.default.parallelism is usually read when the SparkContext is
// created; confirm that setting it on a running session has the intended effect.
Seq("spark.sql.shuffle.partitions", "spark.default.parallelism")
    .foreach(key => spark.conf.set(key, NUM_CORES * NUM_PARTITIONS))

NUM_CORES = 4
NUM_PARTITIONS = 3


3

## Utilities

### Elapsed time profiler

In [64]:
/** Accumulates one "Processing <label> took <n> ms." line per call to `timed`. */
val timing = new StringBuffer

/**
 * Evaluates `code`, appends its elapsed time to `timing`, and returns its result.
 *
 * The duration is measured with the monotonic `System.nanoTime()` clock, so it is not
 * distorted by wall-clock adjustments. The timing line is appended even when `code`
 * throws; the exception is then propagated to the caller unchanged.
 *
 * @param label name used for this measurement in the timing log
 * @param code  by-name expression to evaluate exactly once
 * @tparam T    result type of `code`
 * @return the value produced by `code`
 */
def timed[T](label: String, code: => T): T = {
    val start = System.nanoTime()
    try code
    finally {
        // Convert nanoseconds to whole milliseconds to keep the original log format.
        val elapsedMs = (System.nanoTime() - start) / 1000000L
        timing.append(s"Processing $label took $elapsedMs ms.\n")
    }
}

timing = 


timed: [T](label: String, code: => T)T


In [65]:
// Workaround cell: evaluating this trivial statement surfaces the notebook kernel's
// spurious "error: missing argument list for method timed" (see this cell's output),
// presumably so that the error is not reported against a later, real cell.
println("")

<console>:93: error: missing argument list for method timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `timed _` or `timed(_,_)` instead of `timed`.
       timed
       ^
lastException: Throwable = null


### UDF

In [71]:
/** First day of the reference month (January 2017); month index 0 refers to this month. */
val BASE_LOCALDATE = LocalDate.parse("2017-01-01").withDayOfMonth(1)

/**
 * Returns the number of calendar months between the reference month and the month
 * that contains `to`. The time-of-day and day-of-month of `to` are ignored; dates
 * before the reference month yield negative values.
 *
 * @param to timestamp whose month is being indexed
 * @return month offset from BASE_LOCALDATE, narrowed to Short
 */
def get_months_between(to: Timestamp): Short = {
    // Snap the timestamp to the first day of its month so only whole months are counted.
    val firstOfTargetMonth = to.toLocalDateTime().toLocalDate().withDayOfMonth(1)
    BASE_LOCALDATE.until(firstOfTargetMonth, ChronoUnit.MONTHS).toShort
}
// Column-level wrapper so get_months_between can be applied to a timestamp column.
val udf_months_between = udf(get_months_between _)

BASE_DATE = 2017-01-01
BASE_LOCALDATE = 2017-01-01
udf_months_between = UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))


get_months_between: (to: java.sql.Timestamp)Short


UserDefinedFunction(<function1>,ShortType,Some(List(TimestampType)))

# Total flights per month
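A month spans from the first day of the month (inclusive) to the first day of the next month (exclusive), and is reported as the number of months since January 2017 (0 = January 2017).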

In [83]:
// Builds the per-month flight count as a chain of transformations; the aggregation
// itself only runs when an action is invoked on flightsPerMonth.
// NOTE(review): with inferSchema enabled the CSV may already be scanned at load time.
val flightsPerMonth = {
    // Source records, with column types inferred from the file contents.
    val flightRecords = spark.read
        .format("csv")
        .options(Map(
            "header" -> "true",
            "delimiter" -> ",",
            "dateFormat" -> "yyyy-MM-dd",
            "inferSchema" -> "true"
        ))
        .load("../resources/flightData.csv")

    // One row per distinct (flightId, month) pair, so repeated rows for the same
    // flight are counted once.
    val flightMonths = flightRecords
        .withColumn("month", udf_months_between(col("date")))
        .select("flightId", "month")
        .distinct()

    // Count flights in each month and present the months in ascending order.
    flightMonths
        .groupBy("month")
        .agg(count("flightId"))
        .orderBy(asc("month"))
        .withColumnRenamed("count(flightId)", "Flights")
}

flightsPerMonth.printSchema()

root
 |-- month: short (nullable = false)
 |-- Flights: long (nullable = false)



flightsPerMonth = [month: smallint, Flights: bigint]


[month: smallint, Flights: bigint]

In [84]:
// Action: the write below executes the flightsPerMonth pipeline.
// The result is collapsed to a single partition so the output directory holds one CSV
// part file; drop coalesce(1) when a single file is not required.
flightsPerMonth
    .coalesce(1)
    .write
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .format("csv")
    .save("flightsPerMonth")

// Optional inspection: flightsPerMonth.show(5)

# Frequent flyers
Top 100 frequent flyers

In [88]:
// Number of top passengers to keep.
val LIMIT = 100

// Ranks passengers by the number of rows they appear in and keeps the top LIMIT.
val frequentFlyers = {
    // Source records, with column types inferred from the file contents.
    val flightRecords = spark.read
        .format("csv")
        .options(Map(
            "header" -> "true",
            "delimiter" -> ",",
            "dateFormat" -> "yyyy-MM-dd",
            "inferSchema" -> "true"
        ))
        .load("../resources/flightData.csv")

    // Row count per passenger; the generated column is named "count".
    val flightsByPassenger = flightRecords
        .select("passengerId")
        .groupBy("passengerId")
        .count()

    // Highest counts first, truncated to LIMIT rows.
    flightsByPassenger
        .orderBy(desc("count"))
        .limit(LIMIT)
        .withColumnRenamed("count", "numberOfFlights")
}

frequentFlyers.printSchema()

root
 |-- passengerId: integer (nullable = true)
 |-- numberOfFlights: long (nullable = false)



LIMIT = 100
frequentFlyers = [passengerId: int, numberOfFlights: bigint]


[passengerId: int, numberOfFlights: bigint]

In [89]:
// Action: the write below executes the frequentFlyers pipeline.
// The result is collapsed to a single partition so the output directory holds one CSV
// part file; drop coalesce(1) when a single file is not required.
frequentFlyers
    .coalesce(1)
    .write
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .format("csv")
    .save("frequentFlyers")

// Optional inspection: frequentFlyers.show(5)