# Parallelized collection (parallelizing)

In [1]:
from pyspark import SparkContext

In [2]:
# Reuse the SparkContext that is already running, or start one if none exists
sc = SparkContext.getOrCreate()

23/05/02 17:59:43 WARN Utils: Your hostname, rig resolves to a loopback address: 127.0.1.1; using 192.168.0.102 instead (on interface enp6s0)
23/05/02 17:59:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/02 17:59:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Distribute a small Python list as an RDD and inspect the type of the result
numRDD = sc.parallelize([1, 2, 3, 4])
type(numRDD)

pyspark.rdd.RDD

In [4]:
# NOTE(review): a str is iterated character by character, so this presumably
# yields one element per character rather than per word -- verify with
# helloRDD.collect()
helloRDD = sc.parallelize('Hello world')
type(helloRDD)

pyspark.rdd.RDD

# From external datasets

In [5]:
# Read the text file into an RDD and inspect the type of the result
fileRDD = sc.textFile('Complete_Shakespeare.txt')
type(fileRDD)

pyspark.rdd.RDD

# Understanding Partitioning in PySpark

* A partition is a logical division of a large distributed data set
* parallelize() method

In [6]:
# Spread the numbers 0-9 over 6 partitions by passing numSlices explicitly
# (this rebinds numRDD from the earlier cell)
numRDD = sc.parallelize(range(10), numSlices=6)
type(numRDD)

pyspark.rdd.PipelinedRDD

In [7]:
# Confirm how many partitions numRDD actually has
numRDD.getNumPartitions()

6

* textFile() method

In [8]:
# Re-read the file, this time requesting a minimum of 6 partitions
fileRDD = sc.textFile("Complete_Shakespeare.txt", minPartitions=6)
type(fileRDD)

pyspark.rdd.RDD

In [9]:
# Confirm how many partitions fileRDD actually has
fileRDD.getNumPartitions()

6

The number of partitions in an RDD can be found by using the getNumPartitions() method

# RDDs from Parallelized collections

In [10]:
# Build an RDD out of a short list of strings
RDD = sc.parallelize(
    ["Spark", "is", "a", "framework", "for", "Big Data processing"]
)
# Report which class the new object belongs to
print(f"The type of RDD is {type(RDD)}")
# Partition count when no numSlices is given
RDD.getNumPartitions()

The type of RDD is <class 'pyspark.rdd.RDD'>


16

# RDDs from External Datasets

In [11]:
# Load the Shakespeare text file as an RDD
fileRDD = sc.textFile('Complete_Shakespeare.txt')
# Report the class of the object returned by textFile()
print(f"The file type of fileRDD is {type(fileRDD)}")
# Partition count when no minPartitions is requested
fileRDD.getNumPartitions()

The file type of fileRDD is <class 'pyspark.rdd.RDD'>


2

# Partitions in your data

In [12]:
# Partition count of the existing fileRDD
print(f"Number of partitions in fileRDD is {fileRDD.getNumPartitions()}")
# Read the same file again, this time asking for a minimum of 5 partitions
fileRDD_part = sc.textFile('Complete_Shakespeare.txt', minPartitions=5)
# Partition count of the newly read RDD
print(f"Number of partitions in fileRDD_part is {fileRDD_part.getNumPartitions()}")

Number of partitions in fileRDD is 2
Number of partitions in fileRDD_part is 5


# Map and Collect

In [13]:
# RDD holding the integers 1 through 10
numbRDD = sc.parallelize(range(1, 11))
# Transformation: raise every element to the third power
cubedRDD = numbRDD.map(lambda value: value ** 3)
# Action: bring the cubed values back to the driver as a list
numbers_all = cubedRDD.collect()
# Show each cubed value on its own line
for cube in numbers_all:
    print(cube)

1
8
27
64
125
216
343
512
729
1000


# Filter and Count

In [14]:
# Load test.md as an RDD of lines (this rebinds fileRDD, which previously held
# the Shakespeare text)
fileRDD = sc.textFile('test.md')
# Keep only the lines that contain the substring 'Spark' (case-sensitive)
fileRDD_filter = fileRDD.filter(lambda line: 'Spark' in line)
# How many lines of fileRDD contain the keyword?
print("The total number of lines with the keyword Spark is", fileRDD_filter.count())
# Print the first four matching lines (taken from fileRDD_filter, not fileRDD)
for line in fileRDD_filter.take(4):
    print(line)

The total number of lines with the keyword Spark is 7
Examples for Learning Spark
Examples for the Learning Spark book. These examples require a number of libraries and as such have long build files. We have also added a stand alone example with minimal dependencies and a small build file
These examples have been updated to run against Spark 1.3 so they may
be slightly different than the versions in your copy of "Learning Spark".


# Creating pair RDDs

In [15]:
# A list of 2-tuples becomes a pair RDD directly: each tuple is one
# (key, value) element
my_tuple = [('Sam', 23), ('Mary', 34), ('Peter', 25)]
pairRDD_tuple = sc.parallelize(my_tuple)
pairRDD_tuple.collect()

[('Sam', 23), ('Mary', 34), ('Peter', 25)]

In [16]:
# Start from plain strings and split each one on the space to form a
# (key, value) tuple. The value is left as a string (e.g. '23'); it is not
# converted to int.
my_list = ['Sam 23', 'Mary 34', 'Peter 25']
regularRDD = sc.parallelize(my_list)
pairRDD_RDD = regularRDD.map(lambda s: tuple(s.split(' ')))
pairRDD_RDD.collect()

[('Sam', '23'), ('Mary', '34'), ('Peter', '25')]

# Transformations on pair RDDs
* All regular transformations work on pair RDDs
* You have to pass functions that operate on key-value pairs rather than on individual elements
* Examples of paired RDD Transformations
    * reduceByKey(func): Combine values with the same key
    * groupByKey(): Group values with the same key
    * sortByKey(): Return an RDD sorted by the key
    * join(): Join two pair RDDs based on their key

## reduceByKey() transformation
* reduceByKey() transformation combines values with the same key
* It runs parallel operations for each key in the dataset
* It is a transformation and not an action

In [17]:
# Sum the values that share the same key
regularRDD = sc.parallelize([
    ("Messi", 23),
    ("Ronaldo", 34),
    ("Neymar", 22),
    ("Messi", 24),
])
pairRDD_reducebykey = regularRDD.reduceByKey(lambda acc, value: acc + value)
pairRDD_reducebykey.collect()

[('Neymar', 22), ('Ronaldo', 34), ('Messi', 47)]

## sortByKey() transformation
* sortByKey() operation orders pair RDD by key
* It returns an RDD sorted by key in ascending or descending order

In [18]:
# Flip each (name, total) pair to (total, name) so the totals become the keys,
# then sort on those keys from largest to smallest
pairRDD_reducebykey_rev = pairRDD_reducebykey.map(lambda pair: pair[::-1])
pairRDD_reducebykey_rev.sortByKey(ascending=False).collect()

[(47, 'Messi'), (34, 'Ronaldo'), (22, 'Neymar')]

## groupByKey() transformation
* groupByKey() groups all the values with the same key in the pair RDD

In [19]:
# (key, airport code) pairs; "US" appears twice
airports = [("US", "JFK"), ("UK", "LHR"), ("FR", "CDG"), ("US", "SFO")]
regularRDD = sc.parallelize(airports)
# After collect() this is a plain list of (key, grouped values) pairs
pairRDD_group = regularRDD.groupByKey().collect()
for key, codes in pairRDD_group:
    # Materialise the grouped values as a list so they print readably
    print(key, list(codes))

FR ['CDG']
UK ['LHR']
US ['JFK', 'SFO']


## join() transformation
join() transformation joins the two pair RDDs based on their key

In [20]:
# Two pair RDDs that share the same keys; join() pairs up their values, giving
# elements of the form (key, (value_from_RDD1, value_from_RDD2))
RDD1 = sc.parallelize([("Messi", 34),("Ronaldo", 32),("Neymar", 24)])
RDD2 = sc.parallelize([("Ronaldo", 80),("Neymar", 120),("Messi", 100)])
RDD1.join(RDD2).collect()

[('Neymar', (24, 120)), ('Ronaldo', (32, 80)), ('Messi', (34, 100))]

# ReduceByKey and Collect

In [21]:
# Pair RDD of (key, value) tuples; key 3 appears twice
Rdd = sc.parallelize([(1, 2), (3, 4), (3, 6), (4, 5)])
# Add together the values that belong to the same key
Rdd_Reduced = Rdd.reduceByKey(lambda a, b: a + b)
# Print one line per key with its summed value
for key, summed in Rdd_Reduced.collect():
    print(f"Key {key} has {summed} Counts")

Key 1 has 2 Counts
Key 3 has 10 Counts
Key 4 has 5 Counts


# SortByKey and Collect

In [22]:
# Order the reduced pairs by key, largest key first
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)
# Collect the sorted pairs and print one line per key
for key, summed in Rdd_Reduced_Sort.collect():
    print(f"Key {key} has {summed} Counts")

Key 4 has 5 Counts
Key 3 has 10 Counts
Key 1 has 2 Counts


# reduce() action
* reduce(func) action is used for aggregating the elements of a regular RDD
* The function should be commutative (changing the order of the operands does not change
the result) and associative
* An example of reduce() action in PySpark

In [23]:
# Add up all elements of the RDD by pairwise addition
x = [1, 3, 4, 6]
RDD = sc.parallelize(x)
RDD.reduce(lambda left, right: left + right)

14

# saveAsTextFile() action
The saveAsTextFile() action saves an RDD as text files inside a directory, with each partition written as
a separate file

In [24]:
# Remove any output directory left over from a previous run so the cell can be
# re-run (saveAsTextFile() fails if the target directory already exists)
! rm -rf tempFile/
RDD.saveAsTextFile("tempFile")

The coalesce() method can be used to save an RDD as a single text file

In [25]:
# Remove any output directory left over from a previous run, then shrink the
# RDD to one partition before saving so that a single part file is written
! rm -rf tempFile2/
RDD.coalesce(1).saveAsTextFile("tempFile2")

# Action Operations on pair RDDs
* RDD actions available for PySpark pair RDDs
* Pair RDD actions leverage the key-value data
* A few examples of pair RDD actions include
    * countByKey()
    * collectAsMap()

## countByKey() action
* countByKey() is only available for RDDs of type (K, V)
* countByKey() action counts the number of elements for each key
* Example of countByKey() on a simple list

In [26]:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
# countByKey() returns a mapping from each key to its number of elements
key_counts = rdd.countByKey()
for key, count in key_counts.items():
    print(key, count)

a 2
b 1


> countByKey should only be used on a dataset whose size is small enough to fit in memory.

## collectAsMap() action
* collectAsMap() returns the key-value pairs in the RDD as a dictionary
* Example of collectAsMap() on a simple tuple

In [27]:
# Collect the (key, value) pairs into a Python dict
sc.parallelize([(1, 2), (3, 4)]).collectAsMap()

{1: 2, 3: 4}

> collectAsMap should only be used on a dataset whose size is small enough to fit in memory.

# Exercises

## CountingBykeys

In [28]:
# Count how many elements each key has
total = Rdd.countByKey()
# Show the class of the returned object
print(f"The type of total is {type(total)}")
# Print every key together with its element count
for key, count in total.items():
    print(f"key {key} has {count} counts")

The type of total is <class 'collections.defaultdict'>
key 1 has 1 counts
key 3 has 2 counts
key 4 has 1 counts


In [29]:
# Display the mapping returned by countByKey()
total

defaultdict(int, {1: 1, 3: 2, 4: 1})

## Create a base RDD and transform it

In [30]:
# Line count of the file as reported by wc (wc -l counts newline characters)
! wc -l Complete_Shakespeare.txt
# Word count of the file as reported by wc
! wc -w Complete_Shakespeare.txt

18013 Complete_Shakespeare.txt
128576 Complete_Shakespeare.txt


In [31]:
# Create a baseRDD from the file path
baseRDD = sc.textFile('Complete_Shakespeare.txt')
# Number of lines (RDD elements).
# NOTE(review): this is one more than `wc -l` reports; presumably the file's
# last line has no trailing newline -- confirm.
baseRDD.count()

18014

In [32]:
# take(n) returns the first n elements as a list; here just the first line
baseRDD.take(1)

['The Project Gutenberg EBook of The Complete Works of William Shakespeare, by']

In [33]:
# Break every line into whitespace-separated words and flatten the result into
# a single RDD of words
splitRDD = baseRDD.flatMap(lambda line: line.split())
# Report how many words there are overall
print(f"Total number of words in splitRDD: {splitRDD.count()}")

Total number of words in splitRDD: 128576


In [34]:
# Peek at the first 12 words
print(splitRDD.take(12))

['The', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Complete', 'Works', 'of', 'William', 'Shakespeare,', 'by']


## Remove stop words and reduce the dataset

In [35]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don', 'should', 'now']
# Frozen-set copy of the stop words: each membership test is a hash lookup
# instead of a linear scan of the list, and the filter runs once per word.
# stop_words itself is kept as a list for anything else that uses it.
stop_words_set = frozenset(stop_words)
# Drop stop words. The word is lower-cased only for the comparison; the words
# that are kept retain their original casing.
splitRDD_no_stop = splitRDD.filter(lambda x: x.lower() not in stop_words_set)
# Create a tuple of the word and 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (w, 1))
# Count the number of occurrences of each word
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda x, y: x + y)
# Number of distinct words that remain
resultRDD.count()

19279

In [36]:
# Peek at the first 5 (word, count) pairs
resultRDD.take(5)

[('Project', 9), ('EBook', 1), ('Shakespeare', 12), ('use', 38), ('anyone', 1)]

# Print word frequencies

In [37]:
# Show the first 10 (word, frequency) pairs of resultRDD, one per line
for pair in resultRDD.take(10):
    print(pair)

('Project', 9)
('EBook', 1)
('Shakespeare', 12)
('use', 38)
('anyone', 1)
('anywhere', 1)
('restrictions', 1)
('whatsoever.', 1)
('may', 162)
('it,', 74)


In [38]:
# Line count of the source file as reported by wc, for reference
! wc -l Complete_Shakespeare.txt

18013 Complete_Shakespeare.txt


In [39]:
# Turn each (word, count) pair into (count, word) so the count becomes the key
resultRDD_swap = resultRDD.map(lambda pair: (pair[1], pair[0]))
# Order by count, highest first
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)
# Print the 10 most frequent words, flipped back to (word, count)
for count, word in resultRDD_swap_sort.take(10):
    print((word, count))

('thou', 650)
('thy', 574)
('shall', 393)
('would', 311)
('good', 295)
('thee', 286)
('love', 273)
('Enter', 269)
("th'", 254)
('make', 225)
