# Chapter 6 - Date/Time

# Setup

In [1]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [2]:
val NUM_CORES = 2
val NUM_PARTITIONS = 2

lazy val spark: SparkSession = SparkSession.builder()
    .appName("dataframe-datatime")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", 8)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
import spark.implicits._

NUM_CORES = 2
NUM_PARTITIONS = 2
spark = <lazy>


<lazy>

In [4]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,192.168.1.116)
(spark.eventLog.enabled,true)
(spark.driver.port,34465)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://192.168.1.116:34465/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-8747b854-8bc9-4b2b-a0d3-59be4e5e8749/repl-ccfa7214-cdcf-436b-a0f8-b9bb6a095c57)
(spark.app.name,flight)
(spark.driver.memory,2g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://localhost:8020/logs_spark)
(spark.default.parallelism,8)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,local)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://localhost:8020/logs_spark)
(spark.executor.cores,4)
(spark.app.id,local-1575786019867)
(spark.sql.shuffle.partitions,4)


configMap: Unit = ()


## Constants

# Tools

### Elapsed time profiler

In [10]:
import scala.collection.mutable.ListBuffer

val timing = new StringBuffer
val times = new ListBuffer[Long]()

def clear(): Unit = {
    timing.setLength(0)
    times.clear
}
def average(): Long = {
    times.reduce(_+_) / times.length
}

/**
@param label Description about the run
@code code to execute
@return execution
*/
def _timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    times.append(stop - start)
    result
}

val timed = _timed _

timing = 
times = ListBuffer()
timed = > Nothing) => Nothing = <function2>


clear: ()Unit
average: ()Long
_timed: [T](label: String, code: => T)T


> Nothing) => Nothing = <function2>

In [11]:
// To flush out error: missing argument list for method timed
println("")

<console>:54: error: missing argument list for method _timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `_timed _` or `_timed(_,_)` instead of `_timed`.
       _timed
       ^
lastException: Throwable = null


### Save to file

In [8]:
def save(df: DataFrame) = {
    df.coalesce(1)
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR)
}

save: (df: org.apache.spark.sql.DataFrame)Unit


# Dataframe

In [1]:
val dateDF = spark.range(10)
  .withColumn("today", current_date())
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")

dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



dateDF = [id: bigint, today: date ... 1 more field]


[id: bigint, today: date ... 1 more field]

# Null

Spark can optimize working with null values more than empty strings or other values. The primary way of interacting with null values, at DataFrame scale, is to use the .na subpackage on a DataFrame.

When we declare a column as not having a null time, that is **NOT actually enforced**. To reiterate, when you define a schema in which all columns are declared to not have null values, Spark will not enforce that and will happily let null values into that column. **The nullable signal is simply to help Spark SQL optimize** for handling that column.

## Handling Null

* Coalesce
* ifnull
* nullIf 
* nvl
* nvl2

```
SELECT
  ifnull(null, 'return_value'),
  nullif('value', 'value'),
  nvl(null, 'return_value'),
  nvl2('not_null', 'return_value', "else_value")
FROM dfTable LIMIT 1

+------------+----+------------+------------+
|           a|   b|           c|           d|
+------------+----+------------+------------+
|return_value|null|return_value|return_value|
+------------+----+------------+------------+
```

In [24]:
df.na.drop()
df.na.drop("any")  // Same with drop() removing any row that includes a null column
df.na.drop("all")  // Only when all columns are null
df.na.drop("all", Seq("StockCode", "InvoiceNo")) // Only when specified columns are null

// Fill 5 for the specified columns if they are null
df.na.fill("All Null values become this string")
df.na.fill(5, Seq("StockCode", "InvoiceNo"))

// Specify value for each column if they are null
val fillColValues = Map("StockCode" -> 5, "Description" -> "No Value")
df.na.fill(fillColValues)

fillColValues = Map(StockCode -> 5, Description -> No Value)


[InvoiceNo: string, StockCode: string ... 6 more fields]

# Add/Subtract/Difference

In [7]:
dateDF.select(
    col("today"),
    date_sub(col("today"), 5).alias("behind"), 
    date_add(col("today"), 5).alias("ahead")
).withColumn(
    "date_between",
    datediff(
        col("ahead"),
        col("behind")
    )
).show(3)

+----------+----------+----------+------------+
|     today|    behind|     ahead|date_between|
+----------+----------+----------+------------+
|2019-12-09|2019-12-04|2019-12-14|          10|
|2019-12-09|2019-12-04|2019-12-14|          10|
|2019-12-09|2019-12-04|2019-12-14|          10|
+----------+----------+----------+------------+
only showing top 3 rows



## to_date
Optionally specify format in the Java SimpleDateFormat. 
Spark will not throw an error if it cannot parse the date; rather, it will just return null. 

Spark is working with Java dates and timestamps and therefore conforms to those standards.

In [17]:
dateDF.select(
    to_date(lit("2016-01-01 14:00"), "yyyy-MM-dd HH:mm").alias("start"),
    to_date(lit("2017-05-22")).alias("end"),
    to_date(lit("2017-22-11")).alias("error_case")  // Should specify yyyy-dd-MM format
).select(
    col("start"),
    col("end"),
    months_between(col("start"), col("end")),
    col("error_case")
).show(1)

+----------+----------+--------------------------------+----------+
|     start|       end|months_between(start, end, true)|error_case|
+----------+----------+--------------------------------+----------+
|2016-01-01|2017-05-22|                    -16.67741935|      null|
+----------+----------+--------------------------------+----------+
only showing top 1 row



# Timestamp

## Precision
Spark’s TimestampType class supports only second-level precision, which means that if you’re going to be working with milliseconds or microseconds, you’ll need to work around this problem by potentially operating on them as longs. Any more precision when coercing to a TimestampType will be removed.

In [18]:
val dateTimeDF = spark.range(10)
  .withColumn("today", current_date())
  .withColumn("now", current_timestamp())
dateTimeDF.createOrReplaceTempView("dateTable")

dateTimeDF = [id: bigint, today: date ... 1 more field]


[id: bigint, today: date ... 1 more field]

In [21]:
dateTimeDF.show(3, false)

+---+----------+-----------------------+
|id |today     |now                    |
+---+----------+-----------------------+
|0  |2019-12-09|2019-12-09 17:46:10.859|
|1  |2019-12-09|2019-12-09 17:46:10.859|
|2  |2019-12-09|2019-12-09 17:46:10.859|
+---+----------+-----------------------+
only showing top 3 rows

