In [1]:
import org.apache.spark.sql.SparkSession

// Executor-related settings applied to the session builder, in this order.
// NOTE(review): newer Spark releases document `spark.executor.memoryOverhead`
// as the replacement for the `spark.yarn.*` key below — confirm against the
// cluster's Spark version before changing it.
val executorSettings = Seq(
  "spark.executor.instances" -> "2",
  "spark.executor.cores" -> "2",
  "spark.executor.memory" -> "4g",
  "spark.yarn.executor.memoryOverhead" -> "1g"
)

// Create (or reuse) the session named "scala_rdd" with the settings above.
val sessionBuilder = executorSettings.foldLeft(SparkSession.builder().appName("scala_rdd")) {
  case (builder, (key, value)) => builder.config(key, value)
}
val spark = sessionBuilder.getOrCreate()

// Print the proxy URL for this application, built from its application id.
val applicationId = spark.sparkContext.applicationId
println(s"http://10.8.2.1:8089/proxy/$applicationId")

http://10.8.2.1:8089/proxy/application_1515394405830_3960


# Transformations

In [2]:
// Basic element-wise transformations on a small RDD of integers.
// `rdd` is a `var` on purpose: later cells reassign it to new data.
var rdd = spark.sparkContext.parallelize(Array(1, 2, 3, 4))
val initialValues = rdd.collect().mkString(",")
println(s"Before: $initialValues")

// map: apply a function to every element.
val doubledValues = rdd.map(n => n * 2).collect().mkString(",")
println(s"After * 2: $doubledValues")

// filter: keep only the elements that satisfy the predicate.
val evenValues = rdd.filter(n => n % 2 == 0).collect().mkString(",")
println(s"Filter even: $evenValues")

Before: 1,2,3,4
After * 2: 2,4,6,8
Filter even: 2,4


In [3]:
// distinct: drop duplicate elements (the input contains 2 twice).
rdd = spark.sparkContext.parallelize(Array(1, 2, 2, 3, 4))
val valuesWithDuplicate = rdd.collect().mkString(",")
println(s"Before: $valuesWithDuplicate")

// The recorded run shows the distinct values coming back in a different
// order than the input, so do not rely on the order of this result.
val uniqueValues = rdd.distinct().collect().mkString(",")
println(s"Distinct: $uniqueValues")

Before: 1,2,2,3,4
Distinct: 4,1,2,3


In [4]:
// map producing a collection per element: the result is an array of arrays,
// one inner Array(n, n + 5) for each input element.
rdd = spark.sparkContext.parallelize(Array(1, 2, 3))
val sourceValues = rdd.collect().mkString(",")
println(s"Before: $sourceValues")
println("To array:")
val pairedWithOffset = rdd.map(n => Array(n, n + 5))
// Last expression of the cell: its value is what the notebook displays.
pairedWithOffset.collect()

Before: 1,2,3
To array:


Array(Array(1, 6), Array(2, 7), Array(3, 8))

In [5]:
// flatMap: same per-element function as the map example, but the produced
// arrays are concatenated into one flat result.
println("To flat array:")
val flattenedWithOffset = rdd.flatMap(n => Array(n, n + 5))
// Last expression of the cell: its value is what the notebook displays.
flattenedWithOffset.collect()

To flat array:


Array(1, 6, 2, 7, 3, 8)

# Actions

In [6]:
// Common actions on a small RDD.
// The reduce below corresponds to Python's: rdd.reduce(lambda a, b: a * b)
rdd = spark.sparkContext.parallelize(Array(1, 2, 3))
val elementsBefore = rdd.collect().mkString(",")
println(s"Before: $elementsBefore")

// reduce: combine all elements with a binary function (here, their product).
val product = rdd.reduce(_ * _)
println(s"Reduce: $product")

// take(2): fetch two elements.
val firstTwo = rdd.take(2).mkString(",")
println(s"Take 2: $firstTwo")

// collect: bring every element back as a local array.
val allElements = rdd.collect().mkString(",")
println(s"Collect: $allElements")

// count: number of elements in the RDD.
val elementCount = rdd.count()
println(s"Count: $elementCount")

Before: 1,2,3
Reduce: 6
Take 2: 1,2
Collect: 1,2,3
Count: 3


# Key-Value RDDs

In [7]:
// reduceByKey: merge the values of each key with the given function (a sum),
// so the two entries for key 3 are combined into one.
val keyValReduceByKey = spark.sparkContext.parallelize(Seq(1 -> 2, 3 -> 4, 3 -> 6))
keyValReduceByKey.reduceByKey(_ + _).collect()

Array((1,2), (3,10))

In [8]:
// sortByKey: order the pairs by their key, ascending.
val keyValSortByKey = spark.sparkContext.parallelize(Seq(1 -> "a", 2 -> "c", 1 -> "b"))
val sortedByKey = keyValSortByKey.sortByKey(ascending = true)
sortedByKey.collect()

Array((1,a), (1,b), (2,c))

In [9]:
// groupByKey: gather all values that share a key into one collection per key.
val keyValGroupByKey = spark.sparkContext.parallelize(Seq(1 -> "a", 2 -> "c", 1 -> "b"))
val groupedByKey = keyValGroupByKey.groupByKey()
groupedByKey.collect()

Array((1,CompactBuffer(a, b)), (2,CompactBuffer(c)))

In [10]:
// Two key-value RDDs used by the join examples: key "a" appears in both,
// key "b" only in x.
val x = spark.sparkContext.parallelize(Seq("a" -> 1, "b" -> 4))
val y = spark.sparkContext.parallelize(Seq("a" -> 2, "a" -> 3))

// Inner join: only keys present on both sides, one output row per matching pair.
val innerJoined = x.join(y)
innerJoined.collect()

Array((a,(1,2)), (a,(1,3)))

In [11]:
// Left outer join: every key of x is kept; the y side is an Option that is
// None when y has no entry for that key.
val leftJoined = x.leftOuterJoin(y)
leftJoined.collect()

Array((a,(1,Some(2))), (a,(1,Some(3))), (b,(4,None)))

In [12]:
// Right outer join: every key of y is kept; the x side is wrapped in an Option.
val rightJoined = x.rightOuterJoin(y)
rightJoined.collect()

Array((a,(Some(1),2)), (a,(Some(1),3)))

In [13]:
// Full outer join: keys from either side are kept; both sides are Options,
// with None marking the side that has no entry for the key.
val fullJoined = x.fullOuterJoin(y)
fullJoined.collect()

Array((a,(Some(1),Some(2))), (a,(Some(1),Some(3))), (b,(Some(4),None)))