# Chapter 6 - Complex Structure - Array

# Setup

In [1]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [2]:
val NUM_CORES = 2
val NUM_PARTITIONS = 2

lazy val spark: SparkSession = SparkSession.builder()
    .appName("dataframe-array")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", 8)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
import spark.implicits._

NUM_CORES = 2
NUM_PARTITIONS = 2
spark = <lazy>


<lazy>

In [4]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,192.168.1.116)
(spark.eventLog.enabled,true)
(spark.driver.port,34465)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://192.168.1.116:34465/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-8747b854-8bc9-4b2b-a0d3-59be4e5e8749/repl-ccfa7214-cdcf-436b-a0f8-b9bb6a095c57)
(spark.app.name,flight)
(spark.driver.memory,2g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://localhost:8020/logs_spark)
(spark.default.parallelism,8)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,local)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://localhost:8020/logs_spark)
(spark.executor.cores,4)
(spark.app.id,local-1575786019867)
(spark.sql.shuffle.partitions,4)


configMap: Unit = ()


# Dataframe

In [1]:
val dateDF = spark.range(10)
  .withColumn("today", current_date())
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")

dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



dateDF = [id: bigint, today: date ... 1 more field]


[id: bigint, today: date ... 1 more field]

# Array

In [86]:
val complexDF = df.selectExpr("(Description, InvoiceNo) as complex", "InvoiceDate")
complexDF.show(3, false)

+--------------------------------------------+-------------------+
|complex                                     |InvoiceDate        |
+--------------------------------------------+-------------------+
|[WHITE HANGING HEART T-LIGHT HOLDER, 536365]|2010-12-01 08:26:00|
|[WHITE METAL LANTERN, 536365]               |2010-12-01 08:26:00|
|[CREAM CUPID HEARTS COAT HANGER, 536365]    |2010-12-01 08:26:00|
+--------------------------------------------+-------------------+
only showing top 3 rows



complexDF = [complex: struct<Description: string, InvoiceNo: string>, InvoiceDate: timestamp]


[complex: struct<Description: string, InvoiceNo: string>, InvoiceDate: timestamp]

In [87]:
complexDF.select("complex.Description").show(3, false)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|CREAM CUPID HEARTS COAT HANGER    |
+----------------------------------+
only showing top 3 rows



In [38]:
complexDF.select(col("complex").getField("InvoiceNo")).show(3)

+-----------------+
|complex.InvoiceNo|
+-----------------+
|           536365|
|           536365|
|           536365|
+-----------------+
only showing top 3 rows



# Split a column into array

In [70]:
val arrayDF = df.select(split(col("Description"), " ").alias("array_col"))
.select(
    col("array_col"),
    size(col("array_col"))
)
.withColumn(
    "contains_WHITE",
    array_contains(col("array_col"), "WHITE")
)
arrayDF.show(3, false)

+----------------------------------------+---------------+--------------+
|array_col                               |size(array_col)|contains_WHITE|
+----------------------------------------+---------------+--------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|5              |true          |
|[WHITE, METAL, LANTERN]                 |3              |true          |
|[CREAM, CUPID, HEARTS, COAT, HANGER]    |5              |false         |
+----------------------------------------+---------------+--------------+
only showing top 3 rows



arrayDF = [array_col: array<string>, size(array_col): int ... 1 more field]


[array_col: array<string>, size(array_col): int ... 1 more field]

# Array elements into rows
Create a row for each array value

In [76]:
val arrayRowDF = arrayDF.selectExpr(
    "array_col"
)
.limit(1)

arrayRowDF.show(1, false)

+----------------------------------------+
|array_col                               |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
+----------------------------------------+



arrayRowDF = [array_col: array<string>]


[array_col: array<string>]

In [84]:
// For each value in [WHITE, HANGING, HEART, T-LIGHT, HOLDER], a row is created
arrayRowDF
  .withColumn("exploded", explode(col("array_col")))
  .selectExpr(
      "array_col", "exploded"
  )
  .show(false)

+----------------------------------------+--------+
|array_col                               |exploded|
+----------------------------------------+--------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|WHITE   |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HANGING |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HEART   |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|T-LIGHT |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HOLDER  |
+----------------------------------------+--------+

