# Introduction to the Map, Reduce and Filter Abstractions

## The Map Abstraction

In [1]:
import pyspark
# Reuse the running SparkContext when this cell is re-executed; constructing a
# second SparkContext in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

# Smoke test: distribute 0..999 and pull five random elements (sampled without
# replacement) back to the driver. The sample differs from run to run.
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)



[443, 817, 815, 861, 871]

In [2]:
# Initialize the data: in Python 3 this is a range object, not a list.
nums = range(10)
print(nums, type(nums), sep="\n")

range(0, 10)
<class 'range'>


In [3]:
# Python only: in Python 3, map() returns a lazy iterator, so the cell displays
# a map object rather than the squared values.
map(lambda n: n * n, nums)

<map at 0x7f8fac11c128>

In [4]:
# Use a named function definition instead of an inline lambda.
def square(x):
    """Return x multiplied by itself."""
    squared = x * x
    return squared

In [5]:
# Python only: map() is lazy, so each square is computed as the loop consumes it.
results = map(square, nums)
for squared in results:
    print(squared)


0
1
4
9
16
25
36
49
64
81


## Reduce Abstraction


In [5]:
# Rebind nums to 0..99 for the examples that follow.
nums = range(100)
print(nums)

range(0, 100)


In [6]:
# Python only: fold the sequence into a single total with a two-argument function.
from functools import reduce
reduce(lambda total, value: total + value, nums)

4950

## Filter Abstraction


In [7]:
# Keep only the strings that contain the substring 'ab', using a list comprehension.
lst = ['a', 'ab', 'abc', 'bac']
res = [word for word in lst if 'ab' in word]
res


['ab', 'abc']

In [8]:
# The same selection using the built-in filter() with a predicate; list()
# materializes the lazy filter object for display.
res = filter(lambda word: 'ab' in word, lst)
list(res)

['ab', 'abc']

## Spark


In [9]:
# Spark: distribute nums as an RDD split into five partitions.
sparkNums = sc.parallelize(nums, 5)
# map is a transformation: it describes a new RDD from the existing one.
squares = sparkNums.map(lambda n: n * n)
# collect is an action: it returns the results to the driver, where they are printed.
print(squares.collect())


[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801]


In [10]:
# Same RDD map, passing the named function square instead of a lambda.
squares = sparkNums.map(square)
print(squares.collect())

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801]


In [11]:
# Spark: distribute nums as an RDD split into five partitions.
sparkNums = sc.parallelize(nums, 5)
# reduce is an action: it combines the elements with the given two-argument
# function and returns a single value to the driver.
summ = sparkNums.reduce(lambda left, right: left + right)
print(summ)

4950


In [12]:
def add_num(x1, x2):
    """Return the sum of x1 and x2."""
    total = x1 + x2
    return total
# Python: fold nums with the named function.
print(reduce(add_num, nums))

# Spark: the same function is passed to the RDD's reduce action.
print(sparkNums.reduce(add_num))

4950
4950


## Exercise 1


In [13]:
# Create a Python range covering the numbers 0-999.
nums = range(1000)
print(nums)

range(0, 1000)


In [14]:
#Spark - push nums list onto ten executors
#Fill in <list> and <numExecutors>
sparkNums = sc.parallelize(<list>, <numExecutors>)
sparkNums.take(10)

SyntaxError: invalid syntax (<ipython-input-14-f33dfccc3ae6>, line 3)

In [15]:
# Solution: distribute nums as an RDD split into ten partitions,
# then show its first ten elements.
sparkNums = sc.parallelize(nums, 10)
sparkNums.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [16]:
#multiply each element of sparkNums by 0.5
multipliedNums = sparkNums.map(lambda x: <function>)

SyntaxError: invalid syntax (<ipython-input-16-4b966c0c1d5a>, line 2)

In [17]:
# Solution: multiply each element of sparkNums by 0.5 and show the first ten results.
multipliedNums = sparkNums.map(lambda value: 0.5 * value)
multipliedNums.take(10)

[0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5]

In [18]:
#sum the multiplied numbers
reducedNums = <domain>.map(lambda x,y: <function>)

SyntaxError: invalid syntax (<ipython-input-18-542a384613c0>, line 2)

In [None]:
# Solution: sum the multiplied numbers with the reduce action and print the total.
reducedNums = multipliedNums.reduce(lambda left, right: left + right)
print(reducedNums)