# MIDS-Lecture-10-Spark-IntroductoryExamples

In [1]:
import os
import sys #current as of 9/26/2015
spark_home = os.environ['SPARK_HOME'] = \
   '/Users/jshanahan/Dropbox/Lectures-UC-Berkeley-ML-Class-2015/spark-1.5.0-bin-hadoop2.6/'

if not spark_home:
    raise ValueError('SPARK_HOME enviroment variable is not set')
sys.path.insert(0,os.path.join(spark_home,'python'))
sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home,'python/pyspark/shell.py'))

IOError: [Errno 2] No such file or directory: '/Users/jshanahan/Dropbox/Lectures-UC-Berkeley-ML-Class-2015/spark-1.5.0-bin-hadoop2.6/python/pyspark/shell.py'

In [1]:
import os
import sys #current as of 9/26/2015

import pyspark
from pyspark.sql import SQLContext

# We can give a name to our app (to find it in Spark WebUI) and configure execution mode
# In this case, it is local multicore execution with "local[*]"
app_name = "example-logs"
master = "local[*]"
conf = pyspark.SparkConf().setAppName(app_name).setMaster(master)
sc.stop()
sc = pyspark.SparkContext(conf=conf)
sqlContext = SQLContext(sc)


print sc
print sqlContext

<pyspark.context.SparkContext object at 0x7fbdf9d20990>
<pyspark.sql.context.SQLContext object at 0x7fbdf9d209d0>


In [3]:
import numpy as np
 

dataRDD = sc.parallelize(np.random.random_sample(1000))   
data2X= dataRDD.map(lambda x: x*2)
dataGreaterThan1 = data2X.filter(lambda x: x > 1.0)
cachedRDD = dataGreaterThan1.cache()

In [4]:
cachedRDD.filter(lambda x: x<1).count()

0

In [5]:
cachedRDD.filter(lambda x: x>1).count()

511

In [6]:
cachedRDD.filter(lambda x: x>1).count()

511

In [7]:
for line in cachedRDD.take(10):
    print line

1.32234061912
1.35723072479
1.07000646572
1.14585523013
1.40633864931
1.92258980773
1.07018380684
1.63640864383
1.30737870066
1.98217019246


In [8]:
#RDD mapper
rdd = sc.parallelize([1,2,3,3])
rdd.map(lambda x:  x + 1).collect()
#returns [2, 3, 4, 4]

[2, 3, 4, 4]

In [9]:
#reduce ACTION
rdd = sc.parallelize([1,2,3,3])
sum = rdd.reduce(lambda x, y: x + y)
# sum.collect()
print sum    #returns 9

9


In [10]:
#RDD flatmap ranges of integer ranges
rdd = sc.parallelize([1,2,3,3])
rdd.flatMap(lambda x:  range(x, 8)).collect()
rdd.flatMap(lambda x:  range(x, 10)).distinct().collect()
#returns [8, 1, 9, 2, 3, 4, 5, 6, 7]

[8, 4, 1, 5, 9, 2, 6, 3, 7]

In [11]:
#Example 4-1. Creating a pair RDD using the first word as the key in Python

lines = sc.parallelize(["Data line 1", "Mining line 2", "data line 3", "Data line 4", "Data Mining line 5"])
pairs = lines.map(lambda x: (x.split(" ")[0], x))  #first word and the original line
pairs.collect()


[('Data', 'Data line 1'),
 ('Mining', 'Mining line 2'),
 ('data', 'data line 3'),
 ('Data', 'Data line 4'),
 ('Data', 'Data Mining line 5')]

In [12]:
def filterTerm(line):
    if 'data' in line[1]:
        return (line[0])
    
result = pairs.filter(filterTerm)
result.collect()

[('data', 'data line 3')]

In [13]:
result = pairs.filter(lambda keyValue: len(keyValue[1]) < 12)
result.collect()

[('Data', 'Data line 1'), ('data', 'data line 3'), ('Data', 'Data line 4')]

## PAIRED RDD with super efficient word count example

In [14]:
lines = sc.parallelize(["Data line 1", "Mining line 2", "data line 3", "Data line 4", "Data Mining line 5"])
counts = lines.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
        
counts.collect()

[('1', 1),
 ('data', 1),
 ('5', 1),
 ('Data', 3),
 ('4', 1),
 ('line', 5),
 ('Mining', 2),
 ('3', 1),
 ('2', 1)]

In [15]:
# a more efficent word counter (less network communication)
#could hash and do an in-memory combiner at the record level
lines = sc.parallelize(["Data line 1", "Mining line 2", "data line 3", "Data line 4", "Data Mining line 5"])
def emitWordCounts(line):
    wordCounts = []
    for w in line.split(" "):
        wordCounts.append((w, 1))  #could hash and do an in-memory combiner to be more efficient
    return (wordCounts)
    
counts = lines.flatMap(emitWordCounts) #\
             #.reduceByKey(lambda a, b: a + b)
#counts.first()  
counts.collect()

#Level 1 DEBUG Get the mapper to work in isolation  first
#emitWordCounts("Mining line 2")   

#Level 2 debug check the mapper is working
counts = lines.flatMap(emitWordCounts) #\
#             .reduceByKey(lambda a, b: a + b)
counts.collect()

#Level 3 debug check the mapper is working
counts = lines.flatMap(emitWordCounts) \
            .reduceByKey(lambda a, b: a + b)
counts.collect()


[('1', 1),
 ('data', 1),
 ('5', 1),
 ('Data', 3),
 ('4', 1),
 ('line', 5),
 ('Mining', 2),
 ('3', 1),
 ('2', 1)]

In [16]:
# a more efficent word counter (less network communication)
lines = sc.parallelize(["Data line 1", "Mining line 2", "data line 3", "Data line 4", "Data Mining line 5"])
def emitWordCounts(line):
    wordCounts = []
    for w in line.split(" "):
        wordCounts.append((w, 1))  #could hash and do an in-memory combiner to be more efficient
    return (wordCounts)
    
counts = lines.flatMap(emitWordCounts) \
             .reduceByKey(lambda a, b: a + b)
counts.collect()

[('1', 1),
 ('data', 1),
 ('5', 1),
 ('Data', 3),
 ('4', 1),
 ('line', 5),
 ('Mining', 2),
 ('3', 1),
 ('2', 1)]

In [17]:
# a more efficent word counter (less network communication)
# hash and do an in-memory combiner at the record level
lines = sc.parallelize(["Data line 1", "Mining line 2", "data line 3", "Data line 4", "Data Mining line 5"])
def emitWordCounts(line):
    wordCounts = {}
    for w in line.split(" "):
        if wordCounts.has_key(w):
            wordCounts[w]=wordCounts[w]+1
        else:
            wordCounts[w]=1
    for key, value in  wordCounts.items():
        print key,":", value      
    return (wordCounts.items())  

#Level 1 DEBUG Get the mapper to work in isolation  first
#emitWordCounts("Mining line 2")   

#Level 2 debug check the mapper is working
counts = lines.flatMap(emitWordCounts) #\
#             .reduceByKey(lambda a, b: a + b)
counts.collect()

#Level 3 debug check the mapper is working
counts = lines.flatMap(emitWordCounts) \
            .reduceByKey(lambda a, b: a + b)
counts.collect()

#map partitions

[('1', 1),
 ('data', 1),
 ('5', 1),
 ('Data', 3),
 ('4', 1),
 ('line', 5),
 ('Mining', 2),
 ('3', 1),
 ('2', 1)]

In [18]:
rdd1 = sc.parallelize({(1, 2), (3, 4), (3, 6)})
rdd2 = sc.parallelize({(3, 9), (3, 6)})
joinedRdd=rdd1.join(rdd2)
#RDD1 = {(1, 2), (3, 4), (3, 6)}
#RDD2 = {(3, 9) (3, 6)}
joinedRdd.collect()

[(3, (4, 9)), (3, (4, 6)), (3, (6, 9)), (3, (6, 6))]

## Example 6-14 in Learn Spark Book. Average with mapPartitions() in Python

In [19]:
def combineCtrs(c1, c2):
    return (c1[0] + c2[0], c1[1] + c2[1])
def basicAvg(nums):
    """Compute the average"""
    nums.map(lambda num: (num, 1)).reduce(combineCtrs)

def partitionCtr(nums):
    """Compute sumCounter for partition"""
    sumCount = [0, 0]
    for num in nums:
        sumCount[0] += num
        sumCount[1] += 1
    return [sumCount]

def fastAvg(nums):
    """Compute the avg"""
    sumCount = nums.mapPartitions(partitionCtr).reduce(combineCtrs)
    return sumCount[0] / float(sumCount[1])

nums = fastAvg(sc.parallelize([1,2,3,4]))
print(nums)

2.5


In [20]:
rdd = sc.parallelize([1,2,3,3]).filter(lambda x: x<3)
rdd.collect()

[1, 2]