Working with RDDs

In [4]:
# SparkContext is one of the two entry points for Spark functionality.

# A SparkContext represents the connection to a Spark cluster, and can be used to create RDDs,
# accumulators and broadcast variables on that cluster.

from pyspark import SparkContext

# getOrCreate() reuses the context that is already active in this kernel (if there is one) and only
# creates a new one otherwise. A plain SparkContext() raises an error when this cell is executed a
# second time while a context is still running.
sc = SparkContext.getOrCreate()

# Alternatively, we could use SparkSession:

#from pyspark.sql import SparkSession

#spark = SparkSession\
#        .builder\
#        .appName("Tutorial1")\
#        .getOrCreate()

# In order to work with RDDs we would then access the SparkContext through the SparkSession:
#sc = spark.sparkContext

In [3]:
# Shuts down the current context.
# Left disabled on purpose: this cell comes right after the one that creates `sc`, so executing the
# notebook from top to bottom (e.g. "Restart & Run All") would stop the context here and every
# cell below would fail. Uncomment and run it manually once you are done with the notebook.
#sc.stop()

In [5]:
# Building an RDD from an in-memory Python list, split into 3 partitions.

rdd = sc.parallelize([2, 3, 4, 5, 6, 7], numSlices=3)
rdd  # a bare last expression displays the RDD object itself (its type and id), not its elements

# One important parameter for parallel collections is the number of partitions to cut the dataset into.
# Spark runs one task for each partition.
# Typically you want 2-4 partitions for each CPU in your cluster.
# Normally, Spark tries to set the number of partitions automatically based on your cluster.
# However, you can also set it manually by passing it as the second parameter (numSlices) of parallelize, e.g. sc.parallelize(data, 10).

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [9]:
# PySpark can create distributed datasets from any storage source supported by Hadoop, including your local file system, HDFS, Cassandra, HBase, Amazon S3, etc. Spark supports text files, SequenceFiles, and any other Hadoop InputFormat.
# Text file RDDs can be created using SparkContext's textFile() method.
# This method takes a URI for the file (either a local path on the machine, or a hdfs://, s3a://, etc. URI) and reads it as a collection of lines.

rdd = sc.textFile("book.txt")       # textFile(): reads a text file and returns it as an RDD of strings.
#rdd.collect()                       # collect(): returns a list that contains all of the elements in this RDD.
#rdd.foreach(print)                  # foreach(): applies a function to all elements of this RDD.
#rdd.count()                         # count(): returns the number of elements in this RDD.
#rdd.first()                         # first(): returns the first element of this RDD.

# The assignment above displays nothing by itself; uncomment one of the actions to see a result.

# Note: foreach(print) should be used only in local mode (when we are using only one machine). In cluster mode, print is called by the executors and writes to each executor's stdout, not to the driver's, so the driver's stdout won't show the output.

'This is a book'

In [12]:
# Examples of how to load more than one file into the same RDD

#rdd = sc.textFile("book.txt,hello.txt") # comma-separated list of files (note: no space after the comma)
#rdd = sc.textFile("data")               # a directory name instead of a single file
rdd = sc.textFile("data/book*.txt")      # wildcard pattern
rdd.foreach(print)                       # prints every line (see the local-mode caveat about foreach(print))

This is a nice book
One book
This is a book
A book
This is a bad book
A bad book


In [14]:
rdd = sc.parallelize([2, 3, 4, 5, 6, 7])

# collect(): returns a list that contains all of the elements in this RDD.
# The result is deliberately not named `list`: that would shadow Python's built-in list type for
# every cell executed afterwards in the same kernel.
elements = rdd.collect()
#elements

# Another way of showing the elements of a list
#for num in elements:
#    print('%i ' % (num))

# collect() brings the whole RDD back to the driver, so it should be used on small datasets only,
# usually after filter(), groupBy(), etc. Collecting a large dataset can run the driver out of memory.

2 
3 
4 
5 
6 
7 


In [15]:
# take(n): returns a list with the first n elements of the RDD.
# Only the uncommented call below runs; the other two are variations to try.

rdd = sc.parallelize([2, 3, 4, 5, 6, 7])

#rdd.take(1)
rdd.take(2)
#rdd.take(10)

[2, 3]

In [16]:
# map(): builds a new RDD by applying a function to every element of this RDD.

rdd1 = sc.parallelize([2, 3, 4, 5, 6, 7])

# Square each element and bring the results back to the driver.
rdd1.map(lambda value: value ** 2).collect()

# Alternative: keep the transformed RDD in a variable before collecting it
#rdd2 = rdd1.map(lambda x: x * x)
#rdd2.collect()

                                                                                

[4, 9, 16, 25, 36, 49]

In [18]:
# filter(): builds a new RDD that keeps only the elements for which the predicate returns True.

rdd1 = sc.parallelize(range(100))

# Keep the values greater than 90.
rdd2 = rdd1.filter(lambda value: 90 < value)
rdd2.collect()  # not the last expression of the cell, so its result is not displayed

# Same filter, but fetching only the first three matches.
rdd1.filter(lambda value: 90 < value).take(3)

[91, 92, 93]

In [21]:
# Lines with some word

rdd1 = sc.textFile("book.txt")

# What is the difference between the two alternatives below?
# Hint: `"This" in line` is a substring test on the whole line, so it also matches lines where "This"
# is only part of a longer word; `"This" in line.split()` splits the line on whitespace first and
# then looks for "This" as a complete word.
#rdd2 = rdd1.filter(lambda line: "This" in line)
rdd2 = rdd1.filter(lambda line: "This" in line.split())

rdd2.collect()

['This is a book']

In [22]:
# We can also use our own (named) functions inside a transformation

# Lines that contain a given word

rdd1 = sc.textFile("book.txt")

def wordInLine(word, line):
    """Tell whether ``word`` is one of the whitespace-separated words of ``line``.

    Args:
        word: the word to look for; it must match a complete word exactly (case-sensitive).
        line: the text line to inspect.

    Returns:
        True if ``word`` appears in ``line.split()``, False otherwise.
    """
    tokens = line.split()
    return word in tokens

# filter() calls its predicate with one argument (the line), so a lambda is used to fix the word
# argument of wordInLine to "This".
rdd2 = rdd1.filter(lambda line: wordInLine("This", line))

rdd2.collect()

['This is a book']

In [23]:
# Number of lines that contain some word (here "This", matched as a complete word)

rdd = sc.textFile("book.txt")

# count() returns the number of elements left in the RDD after the filter.
rdd.filter(lambda line: "This" in line.split()).count()

1

In [25]:
# reduce(): reduces the elements of this RDD using the specified commutative and associative binary operator.

rdd = sc.parallelize([2, 3, 4, 5, 6, 7])

# Adding all the elements in the RDD

# Alternative 1: an explicit lambda
#rdd.reduce(lambda a, b: a + b)

# Alternative 2: operator.add is the function form of the + operator
from operator import add
rdd.reduce(add)

27

In [27]:
# Adding up the lengths of all the lines of a text file using the map and reduce operations:

rdd1 = sc.textFile("book.txt")

rdd2 = rdd1.map(len)  # one length (number of characters) per line
#rdd2.collect()

rdd2.reduce(lambda total, size: total + size)

# Transformations laziness explained:
# - The first line defines a base RDD from an external file. This dataset is not loaded in memory or otherwise acted on: rdd1 is merely a pointer to the file.
# - The second line defines rdd2 as the result of a map transformation. Again, rdd2 is not immediately computed, due to laziness.
# - Finally, we run reduce, which is an action. At this point Spark breaks the computation into tasks to run on separate machines, and each machine runs both its part of the map and a local reduction, returning only its answer to the driver program. If laziness were not applied, all the intermediate results would have been returned to the driver program.

# In one line...
#rdd1.map(lambda line: len(line)).reduce(lambda a, b: a + b)

24

In [28]:
# Number of words of the line that has the most words

rdd = sc.textFile("book.txt")

# max() is an action that returns a plain Python value (here an int), not an RDD, so the result
# gets a descriptive name instead of `rdd1`.
max_words = rdd.map(lambda line: len(line.split())).max()
max_words  # bare last expression, so that the value is displayed as the cell output

4

In [30]:
# Just another example: pairing every word with its first letter

rdd1 = sc.parallelize(['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'])
#rdd1.collect()
rdd2 = rdd1.map(lambda word: (word[0], word))  # (first letter, word) tuples; 'T' and 't' stay distinct
rdd2.collect()

[('T', 'The'),
 ('q', 'quick'),
 ('b', 'brown'),
 ('f', 'fox'),
 ('j', 'jumps'),
 ('o', 'over'),
 ('t', 'the'),
 ('l', 'lazy'),
 ('d', 'dog')]

In [32]:
# glom(): returns an RDD created by coalescing all elements within each partition into a list.

rdd = sc.parallelize([0, 2, 3, 4, 6, 7], 3)  # 6 elements spread over 3 partitions
#rdd.collect()
rdd.glom().collect()                          # one inner list per partition

[[0, 2], [3, 4], [6, 7]]

In [37]:
# Computing the maximum value

rdd = sc.parallelize([50.0, 40.0, 40.0, 70.0], 2)
#rdd.glom().collect()

# Alternative 1: let reduce() do all the work
rdd.reduce(max)

# reduce() already works in two steps: it reduces every partition locally and then combines the
# per-partition results, so it does not shuffle data between partitions.

# The same two steps can also be written explicitly:
#   1. First find the maximum in each partition
#   2. Compare the per-partition maxima to get the final max value

# This can be done with glom(), which turns every partition into a list. Note that max() of an
# empty list raises ValueError, so this version needs at least one element in every partition.

# Alternative 2

rdd2 = rdd.glom().map(max)  # largest element of each partition
rdd2.collect()
rdd2.reduce(max)

# In one line...
#rdd.glom().map(lambda v: max(v)).reduce(max)

# Source: https://blog.madhukaraphatak.com/glom-in-spark

70.0

In [29]:
# Caching and persisting

# Adding up the lengths of all the lines of a text file using the map and reduce operations:

rdd1 = sc.textFile("book.txt")

rdd1.persist() #rdd1.cache()

rdd2 = rdd1.map(len)

rdd2.reduce(lambda total, size: total + size)

# Spark's RDDs are by default recomputed each time we run an action on them. If we want to reuse an RDD in multiple actions, we can ask Spark to persist it using RDD.persist(). After computing it the first time, Spark will store the RDD contents in memory (partitioned across the machines in our cluster), and reuse them in future actions.

# RDD.persist() stores this RDD with the storage level passed by the user.
# RDD.cache() persists this RDD with the default storage level (MEMORY_ONLY).

# The two functions are also available for DataFrames (covered in tutorial2_DataFrames.ipynb)

                                                                                

20

Word Count Example Start

In [46]:
# Word count - complete solution

import shutil

lines = sc.textFile("book.txt")

counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Word count ignoring case
#counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word.lower(), 1)).reduceByKey(lambda a, b: a + b)

# saveAsTextFile(path) -> The path is considered as a directory, and multiple outputs will be produced in that directory.
# Spark refuses to write into an output directory that already exists, so the output of a previous
# run is removed first; otherwise this cell fails the second time it is executed.
# NOTE(review): this assumes "word_count" resolves to the local file system (local mode) - confirm
# before running against a cluster file system such as HDFS.
shutil.rmtree("word_count", ignore_errors=True)
counts.saveAsTextFile("word_count")

counts.collect()

# Alternative:
#from operator import add
#counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(add)

                                                                                

[('is', 1), ('book', 2), ('This', 1), ('a', 1), ('A', 1)]

In [42]:
# Word count steps explained - Step 1: computing an RDD with all the words of the text file

lines = sc.textFile("book.txt")

words = lines.flatMap(lambda line: line.split(" "))
words.collect()

# Just for curiosity, in order to see the difference between map() and flatMap():
#words = lines.map(lambda line: line.split(" "))
#words.collect()

# flatMap() applies the function to every element and then flattens the results, so each input element can produce zero, one or several output elements. The resulting RDD can therefore hold a different number of elements than the original one. It is also referred to as a one-to-many transformation function. This is one of the major differences between flatMap() and map(), which produces exactly one output element per input element.

# In this case, flatMap() is used because applying line.split(" ") to each line (element) may produce more than one output (a list with all the words in each line)

['This', 'is', 'a', 'book', 'A', 'book']

In [43]:
# Word count steps explained - Step 2: computing an RDD with a tuple (word, 1) per word

lines = sc.textFile("book.txt")

single_word_pairs = (
    lines
    .flatMap(lambda line: line.split(" "))  # one element per word
    .map(lambda word: (word, 1))            # pair every word with an initial count of 1
)

single_word_pairs.collect()

[('This', 1), ('is', 1), ('a', 1), ('book', 1), ('A', 1), ('book', 1)]

In [44]:
# Word count steps explained - Step 3: finally, merging (adding, in this case) the pairs resulting from the map operation...

text_file = sc.textFile("book.txt")

counts = (
    text_file
    .flatMap(lambda line: line.split(" "))  # one element per word
    .map(lambda word: (word, 1))            # (word, 1) pairs
    .reduceByKey(lambda a, b: a + b)        # add up the 1s of every distinct word
)

counts.collect()

# reduceByKey(): merges the values for each key using an associative and commutative reduce function.
# This will also perform the merging locally on each mapper before sending results to a reducer, similarly to a "combiner" in MapReduce.

[('is', 1), ('book', 2), ('This', 1), ('a', 1), ('A', 1)]

Word Count Example End

In [50]:
# Computing PI: This code estimates π by "throwing darts" at a circle. We pick random points in the square that goes from (-1, -1) to (1, 1) and see how many fall inside the unit circle (radius 1, centred at the origin). The fraction should be π / 4 (circle area π divided by square area 4), so we use this to get our estimate.

import random

NUM_SAMPLES = 10000  # number of random points ("darts") to throw

def inside(p):
    """Throw one random dart and report whether it lands inside the unit circle.

    Args:
        p: not used by the function; it only makes the signature fit a one-argument
            predicate such as the one expected by RDD.filter().

    Returns:
        True if a point whose coordinates are each drawn from [-1, 1) lies at a distance
        of less than 1 from the origin, False otherwise.
    """
    x = random.random() * 2 - 1
    y = random.random() * 2 - 1
    squared_distance = x * x + y * y
    return squared_distance < 1

# Call the dart-throwing predicate once per sample and count how many darts hit the circle.
count = (
    sc.parallelize(range(NUM_SAMPLES))
    .filter(inside)
    .count()
)

# hits / samples approximates π / 4, hence the factor 4.
print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))

Pi is roughly 3.146000


Bibliography:
- https://spark.apache.org/docs/latest/rdd-programming-guide.html#resilient-distributed-datasets-rdds
- https://spark.apache.org/docs/3.1.3/api/python/
- Learning Spark: Lightning-Fast Big Data Analysis, Holden Karau, Andy Konwinski, Patrick Wendell, Matei Zaharia. O'Reilly, 2015.
- https://sparkbyexamples.com/