# RDD

* Original Spark API
* Rows of data
* Uses Spark Context object (sc)

## Transformations
* map (applies a funcion to every row of the RDD)
* flatmap
* filter
* distinct
* sample
* union, intersection, subtract, cartesian

Many RDD methods accept a function as a parameter (functional programming).

## Actions

Returns a value back to the driver program.

* collect
* count
* countByValue
* take
* top
* redeuce 
* ...

## Lazy evaluation

Nothing actually happens in your driver program until an action is called!

## Key-value pairs RDDs
* reduceByKey
* groupByKey
* sortByKey
* keys, values
* join, rightOuterJoin, leftOuterJoin
* mapValues, flatMapValues

In [1]:
// Ratings historiogram example: print how many times each rating appear
//sc

val lines = sc.textFile("data/ml-100k/u.data")
//lines.collect()

val ratings = lines.map(x => x.toString().split("\t")(2))
//ratings.collect()

val results = ratings.countByValue()

val sortedResults = results.toSeq.sortBy(_._1)

sortedResults.foreach(println)

Intitializing Scala interpreter ...

Spark Web UI available at http://315284ee4037:4040
SparkContext available as 'sc' (version = 3.1.1, master = local[*], app id = local-1625058675256)
SparkSession available as 'spark'


(1,6110)
(2,11370)
(3,27145)
(4,34174)
(5,21201)


lines: org.apache.spark.rdd.RDD[String] = data/ml-100k/u.data MapPartitionsRDD[1] at textFile at <console>:28
ratings: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at map at <console>:31
results: scala.collection.Map[String,Long] = Map(4 -> 34174, 5 -> 21201, 1 -> 6110, 2 -> 11370, 3 -> 27145)
sortedResults: Seq[(String, Long)] = Vector((1,6110), (2,11370), (3,27145), (4,34174), (5,21201))


In [2]:
// Key-value RDD example: average # of friends per age

val lines = sc.parallelize(List(
    "0,Will,33,385",
    "1,Jean-Luc,33,2",
    "2,Hugh,55,221",
    "3,Deanna,40,465",
    "4,Quark,68,21"
))
//lines.collect()

def parseLine(line: String) = {
    val fields = line.split(",")
    val age = fields(2).toInt
    val numFriends = fields(3).toInt
    (age, numFriends)
}

val rdd = lines.map(parseLine)
//rdd.collect()

// Sum # of friends and # of people by age (age is the key)
val totalsByAge = rdd.mapValues(x => (x, 1)).reduceByKey( (x , y) => (x._1 + y._1, x._2 + y._2) )
//totalsByAge.collect()

val results = totalsByAge.mapValues(x => x._1 / x._2).collect()

results.foreach(println)

(55,221)
(68,21)
(33,193)
(40,465)


lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[6] at parallelize at <console>:30
parseLine: (line: String)(Int, Int)
rdd: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[7] at map at <console>:46
totalsByAge: org.apache.spark.rdd.RDD[(Int, (Int, Int))] = ShuffledRDD[9] at reduceByKey at <console>:50
results: Array[(Int, Int)] = Array((55,221), (68,21), (33,193), (40,465))


In [33]:
// Filtering example

val rawFile = sc.textFile("data/1800.csv")
//rawFile.foreach(println)

def parseLine(line: String) = {
    val fields = line.split(",")
    val id = fields(0)
    val typ = fields(2)
    val temp = fields(3).toFloat / 10
    (id, typ, temp)
}

val extractedFields = rawFile.map(parseLine)
val minTemps = extractedFields.filter(_._2 == "TMIN")
val minTempMap = minTemps.map(x => (x._1, x._3))
val results = minTempMap.reduceByKey( (acc, el) => List(acc,el).min )
results.foreach(println)

(EZE00100082,-13.5)
(ITE00100554,-14.8)


rawFile: org.apache.spark.rdd.RDD[String] = data/1800.csv MapPartitionsRDD[75] at textFile at <console>:34
parseLine: (line: String)(String, String, Float)
extractedFields: org.apache.spark.rdd.RDD[(String, String, Float)] = MapPartitionsRDD[76] at map at <console>:45
minTemps: org.apache.spark.rdd.RDD[(String, String, Float)] = MapPartitionsRDD[77] at filter at <console>:46
minTempMap: org.apache.spark.rdd.RDD[(String, Float)] = MapPartitionsRDD[78] at map at <console>:47
results: org.apache.spark.rdd.RDD[(String, Float)] = ShuffledRDD[79] at reduceByKey at <console>:48


In [44]:
// Counting words with flatMap

val book = sc.textFile("data/book.txt")

val words = book.flatMap(x => x.split("\\W+"))

val loweCaseWords = words.map(x => x.toLowerCase())

//val wordCount = loweCaseWords.countByValue()

val wordCount = loweCaseWords.map(x => (x, 1)).reduceByKey((x,y) => x + y)

// Flip key and value to sort based on the value
val wordCountFlip = wordCount.map(x => (x._2, x._1))

val wordCountSorted = wordCountFlip.sortByKey()

wordCountSorted.foreach(println)

(3,behind)
(1,transitions)
(3,tips)
(1,intimately)
(1,312)
(3,clarity)
(1,conjure)
(1,requiring)
(3,planned)
(1,reserve)
(3,lived)
(3,button)
(1,wallets)
(3,eliminate)
(1,file)
(1,afterward)
(1,customizing)
(3,download)
(3,proper)
(1,tune)
(3,corporation)
(3,viewed)
(1,optional)
(1,stern)
(1,vietnam)
(3,resource)
(1,monitoring)
(3,workplace)
(1,completing)
(3,alternatives)
(1,rebel)
(3,field)
(1,divorced)
(3,encouraging)
(3,ensured)
(1,scam)
(1,dead)
(3,senior)
(3,seattle)
(1,unimaginably)
(3,participate)
(1,screw)
(3,statements)
(3,intellectual)
(1,competes)
(1,pursue)
(3,michal)
(3,freelancing)
(1,technically)
(3,risks)
(3,inside)
(1,offsets)
(3,peoples)
(3,funded)
(1,subscriptions)
(3,tempting)
(3,beauty)
(1,pivoting)
(3,abilities)
(1,backed)
(3,constant)
(3,professionally)
(3,covers)
(3,commercial)
(1,massachusetts)
(1,finalized)
(3,trademarks)
(3,moment)
(1,cdc)
(1,converted)
(3,waters)
(3,school)
(1,deductibles)
(3,improve)
(3,type)
(1,overwhelmingly)
(1,hits)
(3,vesting)
(3,conn

(5,details)
(5,worse)
(5,review)
(5,coffee)
(5,capital)
(5,importance)
(5,paragraph)
(5,earning)
(5,buying)
(5,total)
(5,honestly)
(5,rank)
(5,near)
(5,county)
(5,placements)
(5,farm)
(5,step)
(5,waste)
(5,space)
(5,lunch)
(5,developer)
(5,areas)
(5,prototype)
(5,his)
(5,2)
(5,creatively)
(5,launched)
(5,competing)
(5,raise)
(5,lucky)
(5,forums)
(5,regardless)
(5,storefront)
(5,meetup)
(5,post)
(5,trolls)
(5,ends)
(5,program)
(5,travel)
(5,quite)
(5,locally)
(5,backup)
(5,relies)
(5,numbers)
(5,minimize)
(5,fast)
(5,clouds)
(5,grows)
(5,focusing)
(5,our)
(5,endeavor)
(5,fixed)
(5,evaluation)
(5,allows)
(5,sounds)
(5,20)
(5,states)
(5,6)
(5,letter)
(5,interesting)
(5,word)
(5,optimizing)
(5,gives)
(5,hundred)
(5,five)
(5,english)
(5,mass)
(5,vary)
(5,hope)
(5,communication)
(5,seek)
(5,lets)
(5,profits)
(5,ten)
(5,bottom)
(5,respond)
(5,project)
(5,appropriate)
(5,placement)
(5,saying)
(5,devices)
(5,surprised)
(5,age)
(5,discover)
(5,lists)
(5,factor)
(5,tools)
(5,seems)
(5,conversatio

(1,upgrade)
(1,traveling)
(1,vote)
(1,immersed)
(1,pricing)
(1,fault)
(1,braces)
(1,caf)
(1,dividends)
(1,bundle)
(1,energetic)
(1,whirlwind)
(1,complaints)
(1,preserved)
(1,remarkably)
(1,weather)
(1,friendlier)
(1,bare)
(1,suite)
(1,receipt)
(1,hospital)
(1,rewarded)
(1,disease)
(1,possesses)
(1,burned)
(1,fiji)
(1,municipality)
(1,relative)
(1,reliever)
(1,canada)
(1,manufacturers)
(1,delivered)
(1,counts)
(1,landscapers)
(1,marriage)
(1,drove)
(1,conversations)
(1,billed)
(1,subset)
(1,thousand)
(1,child)
(1,unsure)
(1,door)
(1,characters)
(1,himself)
(1,disposing)
(1,surprising)
(1,device)
(1,approaches)
(1,bing)
(1,organizations)
(1,express)
(1,reporting)
(1,presumption)
(1,zoho)
(1,preserve)
(1,pitch)
(1,maker)
(1,advocates)
(1,sanity)
(1,history)
(1,achieves)
(1,journey)
(1,discovers)
(1,anticipate)
(1,window)
(1,insurmountably)
(1,steers)
(1,survival)
(1,secondary)
(1,batch)
(1,heavily)
(1,road)
(1,versus)
(1,alibaba)
(1,smarter)
(1,maritime)
(1,convey)
(1,newest)
(1,harper)
(

(1,hot)
(1,css)
(1,teachers)
(1,buys)
(1,suggestions)
(1,notion)
(1,react)
(1,linked)
(1,smallest)
(1,goodbye)
(1,manufacturer)
(1,occasionally)
(1,uncomfortable)
(1,zealand)
(1,habits)
(1,worlds)
(1,wasteful)
(1,recommendation)
(1,stamp)
(1,relationships)
(1,deteriorates)
(1,deviation)
(1,tenure)
(1,worries)
(1,custom)
(1,startlingly)
(1,promoting)
(1,u)
(1,volunteered)
(1,frame)
(1,attaining)
(1,fell)
(1,crafted)
(1,restaurateur)
(1,optimistic)
(1,frankly)
(1,alerts)
(1,overall)
(1,deborah)
(1,tricking)
(1,inbound)
(1,forum)
(1,cheapest)
(1,visiting)
(1,losses)
(1,delay)
(1,formed)
(1,alexa)
(1,millionaire)
(1,unsubscribe)
(1,watching)
(1,bridge)
(1,outliers)
(1,impressive)
(1,lives)
(1,handedly)
(1,skyscraper)
(1,participating)
(1,beast)
(1,reader)
(1,applied)
(1,brought)
(1,reaction)
(1,intended)
(1,average6)
(1,guard)
(1,educational)
(1,validation)
(1,congratulate)
(1,cram)
(1,similarity)
(1,tapping)
(1,dimensions)
(1,repeating)
(1,architectural)
(1,dream)
(1,perfect)
(1,immortali

(1,entrepreneur)
(1,paranoia)
(1,lasts)
(1,spared)
(1,rays)
(1,1099)
(1,chief)
(1,summit)
(1,solicit)
(1,dries)
(1,collected)
(1,swat)
(1,treated)
(1,manufactured)
(1,flows)
(1,trip)
(1,strategies)
(1,foam)
(1,reclaiming)
(1,reinforce)
(1,increasing)
(1,slim)
(1,improvements)
(1,random)
(1,chump)
(1,barely)
(1,dilemma)
(1,willingness)
(1,durable)
(1,preferred)
(1,copywriting)
(1,mined)
(1,reveal)
(1,decades)
(1,assumes)
(1,extrapolate)
(1,bankrupt)
(1,artist)
(1,arises)
(1,skillset)
(1,square)
(1,marketer)
(1,hyphenation)
(1,cloud)
(1,panic)
(1,explicitly)
(1,represents)
(1,clever)
(1,extensible)
(1,placed)
(1,popularity)
(1,race)
(1,47)
(1,damage)
(1,graduated)
(1,healthy)
(1,notable)
(1,directors)
(1,ctr)
(1,oh)
(1,admitting)
(1,deserves)
(1,happiness)
(1,retire)
(1,promise)
(1,pursuing)
(1,gallup)
(1,k)
(1,arranging)
(1,admittedly)
(1,storefronts)
(1,moleskine)
(1,shocked)
(1,betting)
(1,mentality)
(1,unproven)
(1,aggregate)
(1,notifies)
(1,suit)
(1,contingencies)
(1,chasing)
(1,com

book: org.apache.spark.rdd.RDD[String] = data/book.txt MapPartitionsRDD[122] at textFile at <console>:33
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[123] at flatMap at <console>:35
loweCaseWords: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[124] at map at <console>:37
wordCount: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[126] at reduceByKey at <console>:41
wordCountFlip: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[127] at map at <console>:44
wordCountSorted: org.apache.spark.rdd.RDD[(Int, String)] = ShuffledRDD[130] at sortByKey at <console>:46


In [60]:
// Amount spent by customer
val rawOrders = sc.textFile("data/customer-orders.csv")

def parseLine(line: String) = {
    val l = line.split(",")
    (l(0).toInt, l(2).toFloat)
}

val custOrders = rawOrders.map(parseLine)

val totalPerCustomer = custOrders.reduceByKey((acc, el) => acc + el).collect()

totalPerCustomer.foreach(println)

(34,5330.7993)
(52,5245.0605)
(96,3924.23)
(4,4815.05)
(16,4979.0605)
(82,4812.49)
(66,4681.92)
(28,5000.7104)
(54,6065.39)
(80,4727.86)
(98,4297.26)
(30,4990.72)
(14,4735.0303)
(50,4517.2695)
(36,4278.05)
(24,5259.92)
(64,5288.69)
(92,5379.281)
(74,4647.1304)
(90,5290.41)
(72,5337.4395)
(70,5368.2505)
(18,4921.27)
(12,4664.59)
(38,4898.461)
(20,4836.86)
(78,4524.51)
(10,4819.6997)
(94,4475.5703)
(84,4652.9395)
(56,4701.02)
(76,4904.2104)
(22,5019.449)
(46,5963.111)
(48,4384.3296)
(32,5496.0503)
(0,5524.9497)
(62,5253.3213)
(42,5696.8403)
(40,5186.4297)
(6,5397.8794)
(8,5517.24)
(86,4908.809)
(58,5437.7305)
(44,4756.8906)
(88,4830.55)
(60,5040.7095)
(26,5250.4004)
(68,6375.45)
(2,5994.591)
(13,4367.62)
(19,5059.4307)
(39,6193.1104)
(81,5112.71)
(71,5995.66)
(55,5298.09)
(29,5032.5303)
(79,3790.5698)
(65,5140.3496)
(11,5152.29)
(35,5155.42)
(57,4628.3994)
(51,4975.2197)
(37,4735.2)
(75,4178.5)
(45,3309.3804)
(1,4958.5996)
(89,4851.4795)
(63,5415.15)
(83,4635.8003)
(17,5032.6797)
(9,5322

rawOrders: org.apache.spark.rdd.RDD[String] = data/customer-orders.csv MapPartitionsRDD[214] at textFile at <console>:31
parseLine: (line: String)(Int, Float)
custOrders: org.apache.spark.rdd.RDD[(Int, Float)] = MapPartitionsRDD[215] at map at <console>:38
totalPerCustomer: Array[(Int, Float)] = Array((34,5330.7993), (52,5245.0605), (96,3924.23), (4,4815.05), (16,4979.0605), (82,4812.49), (66,4681.92), (28,5000.7104), (54,6065.39), (80,4727.86), (98,4297.26), (30,4990.72), (14,4735.0303), (50,4517.2695), (36,4278.05), (24,5259.92), (64,5288.69), (92,5379.281), (74,4647.1304), (90,5290.41), (72,5337.4395), (70,5368.2505), (18,4921.27), (12,4664.59), (38,4898.461), (20,4836.86), (78,4524.51), (10,4819.6997), (94,4475.5703), (84,4652.9395), (56,4701.02), (76,4904.2104), (22,5019.449), (46,...
