# Chapter 3: Programming with RDDs

###### Creating RDDs
> - file을 로드
> - 직접 데이터 선언

In [1]:
// Load the Spark README as an RDD of strings, one element per line of the file.
val lines = sc.textFile("/home/sparkuser/spark-1.6.2-bin-hadoop2.6/README.md")
// Bring the first five lines back to the driver so the notebook can display them.
lines.take(5)

Array(# Apache Spark, "", Spark is a fast and general cluster computing system for Big Data. It provides, high-level APIs in Scala, Java, Python, and R, and an optimized engine that, supports general computation graphs for data analysis. It also supports a)

In [2]:
// Build an RDD directly from an in-memory collection; the element type (String) is inferred.
val test = sc.parallelize(List("pandas", "I like pandas"))
// take(5) asks for up to five elements; the RDD only holds two, so both are returned.
test.take(5)

Array(pandas, I like pandas)

### Transformations on a single normal RDD
> - 하나의 normal RDD에 대한 Transformation에 대해 알아본다.

###### map()
> - Transformation for a RDD

In [7]:
// map keeps one output element per input line: each line becomes an Array of its
// space-separated tokens, so the result is an RDD of arrays.
lines.map(_.split(" ")).take(5)

Array(Array(#, Apache, Spark), Array(""), Array(Spark, is, a, fast, and, general, cluster, computing, system, for, Big, Data., It, provides), Array(high-level, APIs, in, Scala,, Java,, Python,, and, R,, and, an, optimized, engine, that), Array(supports, general, computation, graphs, for, data, analysis., It, also, supports, a))

###### flatMap()
> - Transformation for a RDD

In [1]:
// flatMap splits every line on spaces and flattens the pieces into a single RDD of tokens
// (empty strings from blank lines are kept).
val words = lines.flatMap(_.split(" "))
// Print the first five tokens on the driver.
words.take(5).foreach(println)
// Total number of tokens in the README.
words.count()

#
Apache
Spark

Spark


507

###### filter()
> - Transformation for a RDD

In [44]:
// Keep only the tokens longer than three characters and show the first five of them.
words.filter(_.length > 3).take(5)

Array(Apache, Spark, Spark, fast, general)

###### distinct()
> - Transformation for a RDD

In [40]:
// Number of unique tokens after removing duplicates (260 here, versus 507 tokens in total).
words.distinct().count()

260

###### sample()
> - Transformation for a RDD

In [43]:
// Draw a random ~10% sample without replacement and count it. No seed is supplied,
// so the sample (and therefore the count) varies from run to run.
words.sample(withReplacement = false, fraction = 0.1).count()

49

### Transformations on two normal RDDs
> - 두개의 normal RDD에 대한 Transformation에 대해 알아본다.
> - 테스트를 위해 short_words와 long_words란 RDD를 만들어 보자.

In [79]:
// Tokens that are 3 or 4 characters long.
// Uses the short-circuit `&&`, the idiomatic operator for Boolean conditions,
// instead of the eager `&`, which always evaluates both operands.
val short_words = words.filter(w => w.length >= 3 && w.length <= 4)
// Preview the first five matches on the driver.
short_words.take(5).foreach(println)
// Report how many tokens matched (duplicates included).
println("-- Total count: " + short_words.count() + "--")

fast
and
for
Big
APIs
-- Total count: 129--


In [80]:
// Tokens with 4 <= length < 5, i.e. exactly the 4-character tokens. This overlaps with
// short_words (3-4 characters) on the 4-character tokens.
// Uses the short-circuit `&&`, the idiomatic operator for Boolean conditions,
// instead of the eager `&`, which always evaluates both operands.
val long_words = words.filter(w => w.length >= 4 && w.length < 5)
// Preview the first five matches on the driver.
long_words.take(5).foreach(println)
// Report how many tokens matched (duplicates included).
println("-- Total count: " + long_words.count() + "--")

fast
APIs
that
data
also
-- Total count: 43--


###### union()

In [88]:
// union concatenates the two RDDs and keeps duplicates:
// the count below is 172 = 129 (short_words) + 43 (long_words).
val union_words = short_words.union(long_words)
union_words.take(5).foreach(println)
union_words.count()

fast
and
for
Big
APIs


172

###### intersection()

In [87]:
// intersection keeps the elements present in both RDDs. The result holds 35 elements,
// fewer than the 43 entries of long_words, i.e. duplicates are removed.
val inter_words = short_words.intersection(long_words)
inter_words.take(5).foreach(println)
inter_words.count()

this
URL,
[run
APIs
same


35

###### subtract()

In [90]:
// subtract removes from short_words every element that also occurs in long_words.
// Duplicates of the remaining elements are kept (see the repeated "use"/"run" in the output);
// 86 of the 129 short words remain.
val sub_words = short_words.subtract(long_words)
sub_words.take(5).foreach(println)
sub_words.count()

use
use
use
run
run


86

###### cartesian()

In [92]:
// cartesian pairs every element of short_words with every element of long_words,
// producing 129 * 43 = 5547 tuples.
val cart_words = short_words.cartesian(long_words)
cart_words.take(5).foreach(println)
cart_words.count()

(fast,fast)
(fast,APIs)
(fast,that)
(fast,data)
(fast,also)


5547

### Actions

###### collect()
> Return all elements from the RDD.

In [8]:
// Rebuild the base RDDs for the Actions section: README lines, then space-separated tokens.
val lines = sc.textFile("/home/sparkuser/spark-1.6.2-bin-hadoop2.6/README.md")
val words = lines.flatMap(_.split(" "))
// collect() returns every element of the RDD to the driver as an Array.
words.collect()

Array(#, Apache, Spark, "", Spark, is, a, fast, and, general, cluster, computing, system, for, Big, Data., It, provides, high-level, APIs, in, Scala,, Java,, Python,, and, R,, and, an, optimized, engine, that, supports, general, computation, graphs, for, data, analysis., It, also, supports, a, rich, set, of, higher-level, tools, including, Spark, SQL, for, SQL, and, DataFrames,, MLlib, for, machine, learning,, GraphX, for, graph, processing,, and, Spark, Streaming, for, stream, processing., "", <http://spark.apache.org/>, "", "", ##, Online, Documentation, "", You, can, find, the, latest, Spark, documentation,, including, a, programming, guide,, on, the, [project, web, page](http://spark.apache.org/documentation.html), and, [project, wiki](https://cwiki.apache.or...

###### count()

In [9]:
// Number of elements in the RDD (507 tokens).
words.count()

507

###### countByValue()

In [11]:
// Returns a Map from each distinct token to the number of times it occurs in the RDD.
words.countByValue()

Map(site, -> 1, Please -> 3, GraphX -> 1, "" -> 67, for -> 11, find -> 1, Apache -> 1, package -> 1, Hadoop, -> 2, Once -> 1, For -> 2, name -> 1, this -> 1, protocols -> 1, Hive -> 2, in -> 5, "local[N]" -> 1, MASTER=spark://host:7077 -> 1, have -> 1, your -> 1, are -> 1, is -> 6, HDFS -> 1, Data. -> 1, built -> 1, thread, -> 1, examples -> 2, using -> 2, system -> 1, Shell -> 2, mesos:// -> 1, easiest -> 1, This -> 2, [Apache -> 1, N -> 1, <class> -> 1, different -> 1, "local" -> 1, README -> 1, online -> 1, spark:// -> 1, return -> 2, Note -> 1, if -> 4, project -> 1, Scala -> 2, You -> 3, running -> 1, usage -> 1, versions -> 1, uses -> 1, must -> 1, do -> 2, programming -> 1, runs. -> 1, R, -> 1, distribution -> 1, print -> 1, About -> 1,...

###### take(num)

In [12]:
// Returns the first five elements of the RDD as an Array.
words.take(5)

Array(#, Apache, Spark, "", Spark)

###### top(num)

In [33]:
// Returns the five largest elements in descending order; for Strings the output shows
// this is the default String ordering ("your" sorts ahead of "you").
words.top(5)

Array(your, you, you, you, you)

###### takeOrdered(num)(ordering)
> - ordering을 이용하는 다양한 방법이 있음..

In [53]:
// From an unseeded ~10% sample, take the first five tokens under the reversed String
// ordering, i.e. the five largest. The result changes between runs because the sample does.
words.sample(withReplacement = false, fraction = 0.1).takeOrdered(5)(Ordering[String].reverse)

Array(way, using:, them,, the, the)

###### takeSample(withReplacement, num, [seed])

In [56]:
// Pick exactly one random token, without replacement, and return it in an Array.
// No seed is given, so the chosen token differs between runs.
words.takeSample(withReplacement = false, num = 1)

Array([run)

###### reduce(func)

In [68]:
// Join all tokens into one comma-separated String by combining elements pairwise.
// NOTE(review): Spark documents reduce as requiring a commutative and associative function;
// string concatenation is not commutative, so the token order may not be guaranteed - confirm.
words.reduce(_ + ", " + _)

#, Apache, Spark, , Spark, is, a, fast, and, general, cluster, computing, system, for, Big, Data., It, provides, high-level, APIs, in, Scala,, Java,, Python,, and, R,, and, an, optimized, engine, that, supports, general, computation, graphs, for, data, analysis., It, also, supports, a, rich, set, of, higher-level, tools, including, Spark, SQL, for, SQL, and, DataFrames,, MLlib, for, machine, learning,, GraphX, for, graph, processing,, and, Spark, Streaming, for, stream, processing., , <http://spark.apache.org/>, , , ##, Online, Documentation, , You, can, find, the, latest, Spark, documentation,, including, a, programming, guide,, on, the, [project, web, page](http://spark.apache.org/documentation.html), and, [project, wiki](https://cwiki.apache.org/confluence/display/SP...

###### fold(zero)(func)

In [69]:
// Same joining as reduce, but starting from the zero value "|".
// The output begins with "|, |, ...": the zero value is used more than once
// (presumably once per partition plus once for the final merge - confirm).
words.fold("|")(_ + ", " + _)

|, |, #, Apache, Spark, , Spark, is, a, fast, and, general, cluster, computing, system, for, Big, Data., It, provides, high-level, APIs, in, Scala,, Java,, Python,, and, R,, and, an, optimized, engine, that, supports, general, computation, graphs, for, data, analysis., It, also, supports, a, rich, set, of, higher-level, tools, including, Spark, SQL, for, SQL, and, DataFrames,, MLlib, for, machine, learning,, GraphX, for, graph, processing,, and, Spark, Streaming, for, stream, processing., , <http://spark.apache.org/>, , , ##, Online, Documentation, , You, can, find, the, latest, Spark, documentation,, including, a, programming, guide,, on, the, [project, web, page](http://spark.apache.org/documentation.html), and, [project, wiki](https://cwiki.apache.org/confluence/disp...

###### aggregate(zeroValue)(seqOp, combOp)
> - zeroValue는 연산 시작값
> - seqOp는 accumulator와 value와의 연산을 정의
> - combOp는 accumulator간의 연산을 정의
> - pair RDD에 사용하는 combineByKey()와 마찬가지로 input data와 다른 type의 값을 반환 받을 수 있다.

In [73]:
// Pair each token with its length and print the first five (token, length) tuples.
words.map(w => (w, w.length)).take(5).foreach(println)

(#,1)
(Apache,6)
(Spark,5)
(,0)
(Spark,5)


In [77]:
// Compute (sum of token lengths, number of tokens) in a single pass.
// The accumulator is an (Int, Int) tuple starting at (0, 0):
//   - first function: folds one length into an accumulator
//   - second function: merges two accumulators
words.map(_.length).aggregate((0, 0))(
  (acc, len) => (acc._1 + len, acc._2 + 1),
  (left, right) => (left._1 + right._1, left._2 + right._2)
)

(2852,507)

###### foreach(func)
> - RDD의 각 element에 대하여 어떤 동작을 취하고, 그 결과를 반환하지는 않는다.
> - Scala Array의 foreach와 구분해야 한다.

In [84]:
// RDD.foreach: applies println to every element of the RDD and returns nothing.
// NOTE(review): no output was captured for this cell in the notebook - presumably the
// println calls run on the executors rather than in the driver; confirm.
words.foreach(println)

In [85]:
// Array.foreach: take(5) first returns a plain Scala Array to the driver, so this foreach
// is the Scala collection method and the five tokens are printed in the notebook.
words.take(5).foreach(println)

#
Apache
Spark

Spark


### Persistence (Caching)
> - 사실 persist() 명령을 내리는 순간에는 아무 일도 일어나지 않는다. 이후 실행되는 Transformation 이나 Action이 있을 때 Persistence가 진행된다. 

> - unpersist()로 uncaching할 수 있다.

Before persist

In [212]:
/** Evaluates the by-name block `f` exactly once, prints the elapsed wall-clock time
  * in milliseconds as "time: <ms>ms", and returns the block's result.
  *
  * @param f the computation to measure; evaluated lazily, inside the timed region
  * @tparam A result type of the computation
  * @return whatever `f` evaluates to
  */
def time[A](f: => A) = {
  val startedAt = System.nanoTime
  val result = f
  // Nanoseconds to milliseconds; dividing by a Double keeps the fractional part.
  val elapsedMs = (System.nanoTime - startedAt) / 1e6
  println(s"time: ${elapsedMs}ms")
  result
}

In [211]:
// Baseline timing without persistence: build every (word, word) pair, keep the pairs whose
// combined length exceeds 10, and count them.
// NOTE(review): the persisted runs later in this notebook count the unfiltered cartesian
// product, so the timings are not a like-for-like comparison.
time {
  val longPairs = words.cartesian(words).filter { case (left, right) =>
    left.length + right.length > 10
  }
  longPairs.count()
}

time: 47.50313ms


99913

After persist

In [213]:
import org.apache.spark.storage.StorageLevel
// Mark the cartesian product for in-memory persistence. Per the note above, nothing is
// cached yet at this point; caching happens when an action runs.
val cart_RDD = words.cartesian(words).persist(StorageLevel.MEMORY_ONLY)

// The first action after persist() both computes the RDD and populates the cache,
// so this run is not expected to be faster (125.78ms in the recorded output).
time {
  cart_RDD.count()
}

// Drop the cached data again; the call returns the RDD itself (shown as the cell result).
cart_RDD.unpersist()

time: 125.779017ms


CartesianRDD[280] at cartesian at <console>:115

In [214]:
import org.apache.spark.storage.StorageLevel
// Re-create and persist the cartesian product, this time leaving it cached afterwards.
val cart_RDD = words.cartesian(words).persist(StorageLevel.MEMORY_ONLY)

// First count: computes all 507 * 507 = 257049 pairs and fills the cache.
time {
  cart_RDD.count()
}


time: 84.495402ms


257049

In [215]:
// Second count on the already-persisted RDD: this run was noticeably faster in the
// recorded output (25.36ms versus 84.50ms for the first count).
time {
  cart_RDD.count()
}

// Release the cached data now that the comparison is done.
cart_RDD.unpersist()

time: 25.36117ms


CartesianRDD[281] at cartesian at <console>:117

###### end of chapter 3