# Chapter 6 - Complex Structure - Map
Create (key, value) pair from dataframe columns

# Setup

In [2]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

### Spark parition control based on core availability

In [3]:
val NUM_CORES = 2
val NUM_PARTITIONS = 2

lazy val spark: SparkSession = SparkSession.builder()
    .appName("dataframe-map")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", 8)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
import spark.implicits._

NUM_CORES = 2
NUM_PARTITIONS = 2
spark = <lazy>


<lazy>

In [4]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,192.168.1.116)
(spark.eventLog.enabled,true)
(spark.driver.port,43377)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://192.168.1.116:43377/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-ee7db58f-554b-46c0-b5d8-3be1b666c151/repl-6d42b196-e318-4058-a217-5e26e818ce1b)
(spark.app.name,flight)
(spark.driver.memory,2g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://localhost:8020/logs_spark)
(spark.default.parallelism,8)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,local)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://localhost:8020/logs_spark)
(spark.executor.cores,4)
(spark.app.id,local-1575884632753)
(spark.sql.shuffle.partitions,4)


configMap: Unit = ()


# Dataframe

In [8]:
val df = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("../data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



df = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

# Map


In [24]:
val mapDF = df.select(
    map(col("CustomerId"), col("InvoiceNo")).alias("customer_to_invoice"),
    col("CustomerId"),
    col("InvoiceNo")
)
mapDF.show(2, false)

+-------------------+----------+---------+
|customer_to_invoice|CustomerId|InvoiceNo|
+-------------------+----------+---------+
|[17850.0 -> 536365]|17850.0   |536365   |
|[17850.0 -> 536365]|17850.0   |536365   |
+-------------------+----------+---------+
only showing top 2 rows



mapDF = [customer_to_invoice: map<double,string>, CustomerId: double ... 1 more field]


[customer_to_invoice: map<double,string>, CustomerId: double ... 1 more field]

In [27]:
mapDF.selectExpr(
    "customer_to_invoice[17850.0]"
).show(2,false)

+--------------------------------------------+
|customer_to_invoice[CAST(17850.0 AS DOUBLE)]|
+--------------------------------------------+
|536365                                      |
|536365                                      |
+--------------------------------------------+
only showing top 2 rows



In [28]:
mapDF.select(
    explode(col("customer_to_invoice"))
).show(3, false)

+-------+------+
|key    |value |
+-------+------+
|17850.0|536365|
|17850.0|536365|
|17850.0|536365|
+-------+------+
only showing top 3 rows

