## RDD Action

In [19]:
# 1. collect
# Action: collect() returns every element of the RDD as a Python list
# (here the integers 1 through 10).
rdd = sc.parallelize(range(1, 11))
result = rdd.collect()
print(result)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [20]:
# 2. count
# Action: count() returns the number of elements in the RDD.
# NOTE: relies on `rdd` (1..10) created in the collect cell above, so that
# cell must have been executed first.
result = rdd.count()
print(result)

10


In [21]:
# 3. take
# Action: take(n) returns the first n elements of the RDD as a list.
# `rdd` is rebound here to the integers 1 through 99.
rdd = sc.parallelize(range(1, 100))
result = rdd.take(5)
print(result)

[1, 2, 3, 4, 5]


In [22]:
# 4. first
# Action: first() returns the first element in RDD order (5 here),
# not the smallest value.
rdd = sc.parallelize([5, 4, 1])
result = rdd.first()
print(result)

5


In [23]:
# 5. countByValue
# Action: countByValue() returns a dict-like object (a defaultdict) that maps
# each distinct element to the number of times it occurs.
rdd = sc.parallelize((1, 1, 2, 3, 3))
result = rdd.countByValue()
print(result)

defaultdict(<class 'int'>, {1: 2, 2: 1, 3: 2})


In [24]:
# 6. reduce
# Action: reduce(f) combines the elements pairwise with the binary function f.
# With addition this yields the total of 1..10 (55).
# NOTE: the PySpark docs require f to be commutative and associative.
rdd = sc.parallelize(list(range(1, 11)))
result = rdd.reduce(lambda x, y: x + y)
print(result)

55


In [25]:
# 7. sum
# Action: sum() adds up all elements of the RDD.
# NOTE: relies on `rdd` (1..10) created in the reduce cell above, so the
# result matches that cell's total (55).
result = rdd.sum()
print(result)

55


## RDD Transformation
Transformation 연산은 새로운 RDD를 반환한다.  
지연 평가(lazy evaluation)되므로, action 연산을 호출하는 시점에 실제로 계산되어 결과 값을 확인할 수 있다.

In [26]:
# 1. map
# Transformation: map(f) applies f to every element and returns a new RDD.
# Printing `result` shows only the RDD object, not its elements; an action
# such as collect() is needed to see the values.
rdd = sc.parallelize(list(range(1, 6)))
result = rdd.map(lambda x: x + 1)
print(result)

PythonRDD[49] at RDD at PythonRDD.scala:53


In [27]:
# Run the collect() action on the mapped RDD from the previous cell to see
# the actual elements (each input value plus 1).
print(result.collect())

[2, 3, 4, 5, 6]


In [28]:
# 2. flatMap
# Transformation: flatMap(f) applies f to every element and flattens the
# returned sequences into a single RDD of individual items.
# NOTE(review): "bluberry" looks like a typo for "blueberry"; the literal is
# left unchanged because it is runtime data that appears in the recorded output.
fruits = ("apple,orange", "grape,apple,mango", "bluberry,tomato,banana")
rdd1 = sc.parallelize(fruits)
rdd2 = rdd1.flatMap(lambda x: x.split(","))
print(rdd2.collect())

['apple', 'orange', 'grape', 'apple', 'mango', 'bluberry', 'tomato', 'banana']


In [29]:
# Contrast with flatMap: map() keeps one result per input element, so each
# split produces its own nested list.
# NOTE: relies on `rdd1` from the flatMap cell above.
rdd2 = rdd1.map(lambda x: x.split(","))
print(rdd2.collect())

[['apple', 'orange'], ['grape', 'apple', 'mango'], ['bluberry', 'tomato', 'banana']]


In [30]:
# 3. mapValues
# Transformation on (key, value) pairs: mapValues(f) applies f to the value
# of each pair and leaves the key as it is.
rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = rdd1.map(lambda v: (v, 1))  # [('a', 1), ('b', 1), ('c', 1)]
rdd3 = rdd2.mapValues(lambda i: i+1)  # add 1 to every value
print(rdd3.collect())

[('a', 2), ('b', 2), ('c', 2)]


In [31]:
# 4. flatMapValues
# Transformation on (key, value) pairs: flatMapValues(f) applies f to each
# value and emits one (key, item) pair per item that f returns.
rdd1 = sc.parallelize( [(1, "a,b"), (2, "a,c"), (1, "d,e")] )
rdd2 = rdd1.flatMapValues(lambda s: s.split(","))
print(rdd2.collect())  # the result is returned as a flat (one-level) list of pairs

[(1, 'a'), (1, 'b'), (2, 'a'), (2, 'c'), (1, 'd'), (1, 'e')]


In [32]:
# 5. zip
# Transformation: zip(other) pairs the elements of two RDDs by position,
# producing (element_of_rdd1, element_of_rdd2) tuples.
# NOTE: per the PySpark docs, both RDDs are expected to have the same number
# of partitions and the same number of elements in each partition.
rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = sc.parallelize([1, 2, 3])
result = rdd1.zip(rdd2)
print(result.collect())

[('a', 1), ('b', 2), ('c', 3)]


In [33]:
# 6. groupBy
# Transformation: groupBy(f) groups elements by the key that f returns
# ("even" or "odd" here).
# Each group is a ResultIterable, so printing the collected result shows
# object reprs rather than the grouped values; convert each group with list()
# to inspect its members.
rdd1 = sc.parallelize(range(1, 11))
rdd2 = rdd1.groupBy(lambda x: "even" if x % 2 == 0 else "odd")
print(rdd2.collect())

[('even', <pyspark.resultiterable.ResultIterable object at 0x7f037d1087d0>), ('odd', <pyspark.resultiterable.ResultIterable object at 0x7f037d108d50>)]


In [34]:
# Show the members of each group by turning its ResultIterable into a list.
# NOTE: relies on `rdd2` from the groupBy cell above.
for label, members in rdd2.collect():
    print(label, list(members))

even [2, 4, 6, 8, 10]
odd [1, 3, 5, 7, 9]


In [35]:
# 7. groupByKey
# Transformation on (key, value) pairs: groupByKey() gathers all values that
# share a key into one iterable per key.
rdd1 = sc.parallelize(["a", "b", "c", "b", "c"]).map(lambda letter: (letter, 1))
rdd2 = rdd1.groupByKey()
for key, ones in rdd2.collect():
    # each group is an iterable; list() makes its contents printable
    print(key, list(ones))

b [1, 1]
c [1, 1]
a [1]


In [37]:
# 8. distinct
# Transformation: distinct() returns a new RDD with duplicate elements removed.
# The recorded output is not in input order, so do not rely on the ordering.
rdd = sc.parallelize([1, 2, 3, 1, 2, 3, 1, 2, 3])
result = rdd.distinct()
print(result.collect())

[2, 1, 3]


In [38]:
# 9. filter
# Transformation: filter(f) keeps only the elements for which f returns True
# (values greater than 2 here).
rdd1 = sc.parallelize(range(1, 6))
rdd2 = rdd1.filter(lambda i: i > 2)
print(rdd2.collect())

[3, 4, 5]


In [41]:
# 10. sortByKey
# Transformation on (key, value) pairs: sortByKey() orders the pairs by key
# (ascending when called with no arguments, as the recorded output shows).
rdd = sc.parallelize( [("q", 1), ("z", 1), ("a", 1)] )
result = rdd.sortByKey()
print(result.collect())

[('a', 1), ('q', 1), ('z', 1)]


In [42]:
# 11. keys, values
# Transformations on (key, value) pairs: keys() returns an RDD of only the
# keys, values() an RDD of only the values.
rdd = sc.parallelize( [("k1", "v1"), ("k2", "v2"), ("k3", "v3")] )
print(rdd.keys().collect())
print(rdd.values().collect())

['k1', 'k2', 'k3']
['v1', 'v2', 'v3']


In [43]:
# 12. sample
# Transformation: sample(withReplacement, fraction, seed) returns a random
# subset of the RDD. The seed (100) is fixed in both calls.
rdd = sc.parallelize(range(1, 101))
# Without replacement: each element appears at most once; fraction=0.5.
result1 = rdd.sample(False, 0.5, 100)
# With replacement: the same element may be drawn more than once (the recorded
# output contains 2 twice), and the fraction may exceed 1.
result2 = rdd.sample(True, 1.5, 100)
print(result1.take(5))
print(result2.take(5))

[2, 5, 6, 10, 12]
[1, 2, 2, 3, 4]


In [44]:
# 13. cartesian
# Transformation: cartesian(other) returns every (a, b) pair with a taken
# from this RDD and b from the other (3 x 3 = 9 pairs here).
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize(["a", "b", "c"])
result = rdd1.cartesian(rdd2)
print(result.collect())

[(1, 'a'), (1, 'b'), (1, 'c'), (2, 'a'), (3, 'a'), (2, 'b'), (2, 'c'), (3, 'b'), (3, 'c')]


In [45]:
# 14. subtract
# Transformation: subtract(other) returns the elements of rdd1 that do not
# appear in rdd2. The recorded output is not in input order.
rdd1 = sc.parallelize(["a", "b", "c", "d", "e"])
rdd2 = sc.parallelize(["d", "e"])
result = rdd1.subtract(rdd2)
print(result.collect())

['b', 'c', 'a']


In [46]:
# 15. union
# Transformation: union(other) returns an RDD containing the elements of
# rdd1 followed by the elements of rdd2.
rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = sc.parallelize(["d", "e", "f"])
result = rdd1.union(rdd2)
print(result.collect())

['a', 'b', 'c', 'd', 'e', 'f']


In [47]:
# 16. intersection
# Transformation: intersection(other) returns the elements present in both
# RDDs. The result contains each common element once even though the inputs
# contain duplicates (see the recorded output).
rdd1 = sc.parallelize(["a", "a", "b", "c"])
rdd2 = sc.parallelize(["a", "a", "c", "c"])
result = rdd1.intersection(rdd2)
print(result.collect())

['c', 'a']


In [48]:
# 17. join
# Transformation on (key, value) pairs: join(other) is an inner join by key.
# Only keys present in both RDDs are kept, and the result value is the tuple
# (value_from_rdd1, value_from_rdd2).
rdd1 = sc.parallelize(["a", "b", "c", "d", "e"]).map(lambda v: (v,1))
rdd2 = sc.parallelize(["b", "c"]).map(lambda v: (v, 2))
result = rdd1.join(rdd2)
print(result.collect())

[('b', (1, 2)), ('c', (1, 2))]


In [49]:
# 18. leftOuterJoin, rightOuterJoin
# Transformations on (key, value) pairs.
# leftOuterJoin keeps every key of rdd1 and uses None where rdd2 has no match
# (key 'a' in the recorded output).
# rightOuterJoin keeps every key of rdd2. Every rdd2 key also exists in rdd1
# here, so its result has no None and matches an inner join.
rdd1 = sc.parallelize(["a", "b", "c"]).map(lambda v: (v, 1))
rdd2 = sc.parallelize(["b", "c"]).map(lambda v: (v, 2))

result1 = rdd1.leftOuterJoin(rdd2)
result2 = rdd1.rightOuterJoin(rdd2)

print("Left: %s" % result1.collect())
print("Right: %s" % result2.collect())

Left: [('b', (1, 2)), ('c', (1, 2)), ('a', (1, None))]
Right: [('b', (1, 2)), ('c', (1, 2))]
