# Lecture 12 Spark RDD APIs

In [1]:
import findspark

# Tell findspark that Spark is installed under /opt/spark; this runs before
# the pyspark import in the next cell.
findspark.init("/opt/spark")

In [2]:
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [3]:
# Reuse the running SparkContext when this cell is executed again. Calling the
# SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The master/app-name settings only take effect when a new context is created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('lec 12 RDD moynihanl')
)

## Example 1: Number arrays

In [5]:
# Turn the Python list 1..10 into an RDD.
rdd = sc.parallelize([1,2,3,4,5,6,7,8,9,10])

In [6]:
# Transformation: square every element of the number RDD.
squares = rdd.map(lambda n: n * n)

In [7]:
# Action: return all squared values as a Python list.
squares.collect()

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [9]:
# Keep only the squares that are divisible by two.
even = squares.filter(lambda n: n % 2 == 0)

In [10]:
# Show every even square.
even.collect()

[4, 16, 36, 64, 100]

In [11]:
# Fetch just the first two elements instead of the whole RDD.
even.take(2)

[4, 16]

In [12]:
# Number of elements in the filtered RDD.
even.count()

5

In [13]:
# Fold the elements together pairwise with addition to get their sum.
even.reduce(lambda a, b: a + b)

220

## Example 2: Key value pairs as input data

In [14]:
# RDD of (pet, count) key/value pairs; the key 'cat' appears twice.
pets = sc.parallelize([('cat',1),('dog',1),('cat',2)])

In [16]:
# Add up the counts per pet: values that share a key are combined pairwise.
pet_count = pets.reduceByKey(lambda a, b: a + b)
pet_count.collect()

[('cat', 3), ('dog', 1)]

In [18]:
# Group the values by key. Each key's values come back as a ResultIterable,
# so collect() displays object reprs rather than the numbers themselves.
pet_group = pets.groupByKey()
pet_group.collect()

[('cat', <pyspark.resultiterable.ResultIterable at 0x7faaa2002f50>),
 ('dog', <pyspark.resultiterable.ResultIterable at 0x7faaa2002f10>)]

In [22]:
# Unpack each (key, iterable) pair and turn the iterable into a list so the
# grouped values can be read.
for animal, counts in pet_group.collect():
    print(animal, list(counts))

cat [1, 2]
dog [1]


In [23]:
# Sort the pairs by key (alphabetical order for these string keys).
pet_sorted = pets.sortByKey()

pet_sorted.collect()

[('cat', 1), ('cat', 2), ('dog', 1)]

In [24]:
# mapValues applies the function to the value of each pair and leaves the key
# untouched; here every count is doubled.
pet_duplicate = pets.mapValues(lambda x: x*2)  # only the value (second element) is mapped
pet_duplicate.collect()

[('cat', 2), ('dog', 2), ('cat', 4)]

## Exercise 2: Average Friends

In [25]:
# Location of the friends CSV on the local filesystem.
# NOTE(review): absolute path tied to one user's home directory — adjust it to
# run this notebook on another machine.
file_url = 'file:////home/moynihanl/ds420/lec12_friends.csv'

In [27]:
# Load the file as an RDD of strings, one per line of text.
lines = sc.textFile(file_url)

In [28]:
# Peek at the first five raw lines to check the column layout.
lines.take(5)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [32]:
def parseLine(line):
    """Extract an (age, number of friends) pair from one comma-separated record.

    The record is split on commas; the third and fourth fields are converted
    to integers and returned as a tuple.

    Raises:
        IndexError: if the record has fewer than four comma-separated fields.
        ValueError: if either of the two fields is not a valid integer.
    """
    columns = line.split(',')
    return (int(columns[2]), int(columns[3]))

In [33]:
# Convert every CSV line into an (age, numFriends) pair.
# NOTE(review): this rebinds `rdd`, the name the Example 1 cells also use, so
# re-running those cells afterwards would operate on this data instead.
rdd = lines.map(parseLine)
rdd.take(5)

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [34]:
# Pair each friend count with a 1 so that friend totals and row counts can be
# summed in a single reduceByKey pass.
rdd_addcount = rdd.mapValues(lambda friends: (friends, 1))
rdd_addcount.take(5)

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]

In [36]:
# Per age, add the tuples element-wise: (friend total, number of rows).
totalsByAge = rdd_addcount.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)

In [37]:
# Peek at five (age, (friend total, row count)) entries.
totalsByAge.take(5)

[(33, (3904, 12)),
 (26, (4115, 17)),
 (55, (3842, 13)),
 (40, (4264, 17)),
 (68, (2696, 10))]

In [39]:
# Average friends per age = friend total divided by the number of rows.
avgByAge = totalsByAge.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
avgByAge.take(5)

[(33, 325.3333333333333),
 (26, 242.05882352941177),
 (55, 295.53846153846155),
 (40, 250.8235294117647),
 (68, 269.6)]

In [40]:
# Collect every (age, average friends) pair into a Python list and display it
# in full.
result = avgByAge.collect()
result

[(33, 325.3333333333333),
 (26, 242.05882352941177),
 (55, 295.53846153846155),
 (40, 250.8235294117647),
 (68, 269.6),
 (59, 220.0),
 (37, 249.33333333333334),
 (54, 278.0769230769231),
 (38, 193.53333333333333),
 (27, 228.125),
 (53, 222.85714285714286),
 (57, 258.8333333333333),
 (56, 306.6666666666667),
 (43, 230.57142857142858),
 (36, 246.6),
 (22, 206.42857142857142),
 (35, 211.625),
 (45, 309.53846153846155),
 (60, 202.71428571428572),
 (67, 214.625),
 (19, 213.27272727272728),
 (30, 235.8181818181818),
 (51, 302.14285714285717),
 (25, 197.45454545454547),
 (21, 350.875),
 (42, 303.5),
 (49, 184.66666666666666),
 (48, 281.4),
 (50, 254.6),
 (39, 169.28571428571428),
 (32, 207.9090909090909),
 (58, 116.54545454545455),
 (64, 281.3333333333333),
 (31, 267.25),
 (52, 340.6363636363636),
 (24, 233.8),
 (20, 165.0),
 (62, 220.76923076923077),
 (41, 268.55555555555554),
 (44, 282.1666666666667),
 (69, 235.2),
 (65, 298.2),
 (61, 256.22222222222223),
 (28, 209.1),
 (66, 276.44444444444