In [0]:
# Linking with Spark

from pyspark import SparkContext, SparkConf

In [0]:
print(sc)

<SparkContext master=local[8] appName=Databricks Shell>


In [0]:
# Generate random data:
import random
randomlist = random.sample(range(0,40),10)
print(randomlist)

[17, 7, 14, 33, 38, 34, 22, 9, 28, 27]


In [0]:
# Create RDD:
rdd1 = sc.parallelize(randomlist, 4)

rdd1.collect()

Out[4]: [17, 7, 14, 33, 38, 34, 22, 9, 28, 27]

In [0]:
# union
print(rdd1.collect())
rdd2 = sc.parallelize([1,14,3,20,92,13,7,19],2)
print(rdd2.collect())

rdd_union = rdd1.union(rdd2)
rdd_union.collect()


[17, 7, 14, 33, 38, 34, 22, 9, 28, 27]
[1, 14, 3, 20, 92, 13, 7, 19]
Out[5]: [17, 7, 14, 33, 38, 34, 22, 9, 28, 27, 1, 14, 3, 20, 92, 13, 7, 19]

In [0]:
# intersection
rdd_intersection = rdd1.intersection(rdd2)
print(rdd_intersection.collect())
rdd_intersection.glom().collect()

[7, 14]
Out[8]: [[], [7], [14], [], [], []]

In [0]:
# find empty partitions
counter = 0
for item in rdd_intersection.glom().collect():
    if len(item)==0:
        counter += 1
counter

Out[9]: 4

In [0]:
# coalesce(numPartitions):
rdd_intersection.coalesce(1).glom().collect()

Out[10]: [[7, 14]]

In [0]:
# action
# takeSample(withReplacement, num, [seed]):
rdd1.takeSample(False, 3)

Out[12]: [34, 9, 28]

In [0]:
# takeOrdered(n, [ordering]):
rdd1.takeOrdered(5)

Out[13]: [7, 9, 14, 17, 22]

In [0]:
# reduce():
rdd1.reduce(lambda x,y: x-y)

Out[14]: 67

In [0]:
# reduceByKey():
rdd_rdbk = sc.parallelize([(1,2),(2,3),(3,5),(4,7),(1,2),(2,3),(3,5),(4,7)],2)
print(rdd_rdbk.glom().collect())

rdd_rdbk.reduceByKey(lambda x,y:x+y).collect()

[[(1, 2), (2, 3), (3, 5), (4, 7)], [(1, 2), (2, 3), (3, 5), (4, 7)]]
Out[18]: [(2, 6), (4, 14), (1, 4), (3, 10)]

In [0]:
# sortBykey():
rdd_rdbk.reduceByKey(lambda x,y: x+y).sortByKey(True).collect()

Out[20]: [(1, 4), (2, 6), (3, 10), (4, 14)]

In [0]:
# countByKey():
rdd_rdbk.countByKey()

Out[21]: defaultdict(int, {1: 2, 2: 2, 3: 2, 4: 2})

In [0]:
# groupByKey(): goes to driver
rdd_group = rdd_rdbk.groupByKey()
rdd_group.getNumPartitions()

for item in rdd_group.collect():
    print(item[0], [values for values in item[1]])

2 [3, 3]
4 [7, 7]
1 [2, 2]
3 [5, 5]


In [0]:
# loolup(key):
rdd_rdbk.lookup(2)

Out[24]: [3, 3]

In [0]:
# cache:
rdd_rdbk.persist()

Out[25]: ParallelCollectionRDD[58] at readRDDFromInputStream at PythonRDD.scala:435

In [0]:
# Peristence (https://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence)

from pyspark import StorageLevel
rdd1.persist(StorageLevel.MEMORY_AND_DISK)

Out[28]: ParallelCollectionRDD[0] at readRDDFromInputStream at PythonRDD.scala:435