# Chapter 6 - Data Types

# Setup

In [1]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [2]:
val NUM_CORES = 2
val NUM_PARTITIONS = 2

lazy val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("flight")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", 8)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
import spark.implicits._

NUM_CORES = 2
NUM_PARTITIONS = 2
spark = <lazy>


<lazy>

In [4]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,192.168.1.116)
(spark.eventLog.enabled,true)
(spark.driver.port,34465)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://192.168.1.116:34465/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-8747b854-8bc9-4b2b-a0d3-59be4e5e8749/repl-ccfa7214-cdcf-436b-a0f8-b9bb6a095c57)
(spark.app.name,flight)
(spark.driver.memory,2g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://localhost:8020/logs_spark)
(spark.default.parallelism,8)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,local)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://localhost:8020/logs_spark)
(spark.executor.cores,4)
(spark.app.id,local-1575786019867)
(spark.sql.shuffle.partitions,4)


configMap: Unit = ()


## Constants

# Tools

### Elapsed time profiler

In [10]:
import scala.collection.mutable.ListBuffer

val timing = new StringBuffer
val times = new ListBuffer[Long]()

def clear(): Unit = {
    timing.setLength(0)
    times.clear
}
def average(): Long = {
    times.reduce(_+_) / times.length
}

/**
@param label Description about the run
@code code to execute
@return execution
*/
def _timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    times.append(stop - start)
    result
}

val timed = _timed _

timing = 
times = ListBuffer()
timed = > Nothing) => Nothing = <function2>


clear: ()Unit
average: ()Long
_timed: [T](label: String, code: => T)T


> Nothing) => Nothing = <function2>

In [11]:
// To flush out error: missing argument list for method timed
println("")

<console>:54: error: missing argument list for method _timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `_timed _` or `_timed(_,_)` instead of `_timed`.
       _timed
       ^
lastException: Throwable = null


### Save to file

In [8]:
def save(df: DataFrame) = {
    df.coalesce(1)
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR)
}

save: (df: org.apache.spark.sql.DataFrame)Unit


# Dataframe

In [12]:
val df = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("../data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



df = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [20]:
df.show(5, true)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



# Literal

SQL can select literals:
```
SELECT CAST(5 AS int), "five", 5.0 FROM DUAL
```

SparkSQL lit function provides the equivalent. 

In [16]:
df.select(lit(5), lit("five"), lit(5.0))
    .limit(5)
    .show

+---+----+---+
|  5|five|5.0|
+---+----+---+
|  5|five|5.0|
|  5|five|5.0|
|  5|five|5.0|
|  5|five|5.0|
|  5|five|5.0|
+---+----+---+



# Select

## Row number

In [48]:
df.select(
        monotonically_increasing_id(),
        col("customerId"),
        col("UnitPrice"),
        col("Quantity")
    )
    .show(5)

+-----------------------------+----------+---------+--------+
|monotonically_increasing_id()|customerId|UnitPrice|Quantity|
+-----------------------------+----------+---------+--------+
|                            0|   17850.0|     2.55|       6|
|                            1|   17850.0|     3.39|       6|
|                            2|   17850.0|     2.75|       8|
|                            3|   17850.0|     3.39|       6|
|                            4|   17850.0|     3.39|       6|
+-----------------------------+----------+---------+--------+
only showing top 5 rows



# Numerical

In [23]:
/*
SELECT 
    customerId, 
    (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
FROM dfTable
*/
df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity"
).show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [34]:
df.select(
    col("UnitPrice"),
    round(
        col("UnitPrice"), 
        1
    ).alias("rounded"), 
    col("Quantity"),
    round(
        (col("UnitPrice") * col("Quantity")),
        5
    ).alias("total")
)
.show(5)

+---------+-------+--------+-----+
|UnitPrice|rounded|Quantity|total|
+---------+-------+--------+-----+
|     2.55|    2.6|       6| 15.3|
|     3.39|    3.4|       6|20.34|
|     2.75|    2.8|       8| 22.0|
|     3.39|    3.4|       6|20.34|
|     3.39|    3.4|       6|20.34|
+---------+-------+--------+-----+
only showing top 5 rows



# Statistics

In [38]:
df.selectExpr(
        "UnitPrice",
        "Quantity"
    )
    .describe()
    .show(5, true)

+-------+------------------+------------------+
|summary|         UnitPrice|          Quantity|
+-------+------------------+------------------+
|  count|              3108|              3108|
|   mean| 4.151946589446603| 8.627413127413128|
| stddev|15.638659854603892|26.371821677029203|
|    min|               0.0|               -24|
|    max|            607.49|               600|
+-------+------------------+------------------+



In [45]:
df
    .select(
        stddev(col("UnitPrice")),
        max(col("UnitPrice")),
        count(col("UnitPrice"))
    )
    .show

+----------------------+--------------+----------------+
|stddev_samp(UnitPrice)|max(UnitPrice)|count(UnitPrice)|
+----------------------+--------------+----------------+
|    15.638659854603892|        607.49|            3108|
+----------------------+--------------+----------------+



## Correlation

In [39]:
df.stat.corr("UnitPrice", "Quantity")

-0.04112314436835552

# String
* padding
* lower/upper

In [73]:
df
    .select(
        upper(col("Description")),
        //--------------------------------------------------------------------------------
        // lpad(str, len, pad) - left-padded with pad to a length of len. 
        // If str is longer than len, the return value is shortened to len characters.
        //--------------------------------------------------------------------------------
        lpad(
            lower(col("Description")),
            35,
            " "
        ).alias("leftpadded")
    )
    .show(5, false)

+-----------------------------------+-----------------------------------+
|upper(Description)                 |leftpadded                         |
+-----------------------------------+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER | white hanging heart t-light holder|
|WHITE METAL LANTERN                |                white metal lantern|
|CREAM CUPID HEARTS COAT HANGER     |     cream cupid hearts coat hanger|
|KNITTED UNION FLAG HOT WATER BOTTLE|knitted union flag hot water bottle|
|RED WOOLLY HOTTIE WHITE HEART.     |     red woolly hottie white heart.|
+-----------------------------------+-----------------------------------+
only showing top 5 rows



## Regexp

In [79]:
df.select(
  regexp_replace(col("Description"), "W(.*)E| METAL", "Replaced").alias("regexp_replaced"),
  col("Description")).show(3)

+--------------------+--------------------+
|     regexp_replaced|         Description|
+--------------------+--------------------+
|           ReplacedR|WHITE HANGING HEA...|
|          ReplacedRN| WHITE METAL LANTERN|
|CREAM CUPID HEART...|CREAM CUPID HEART...|
+--------------------+--------------------+
only showing top 3 rows



## Replace maching characters

In [81]:
df.select(translate(col("Description"), "LEHT", "leht"), col("Description"))
  .show(2)

+----------------------------------+--------------------+
|translate(Description, LEHT, leht)|         Description|
+----------------------------------+--------------------+
|              WhIte hANGING heA...|WHITE HANGING HEA...|
|               WhIte MetAl lANteRN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows

