# Parallelized Collections

In [None]:
# Build a local sequence of the integers 0..999 on the driver.
data = range(1000)
# Hand the local sequence to Spark so it becomes a distributed RDD.
distData = sc.parallelize(data)
# Last expression of the cell: the notebook displays how many partitions
# Spark split the collection into.
distData.getNumPartitions()

# Load data from text files

In [None]:
# Load the text file as an RDD of strings, one record per line.
rdd = sc.textFile('data/sample.txt')

# Get number of records
# (print is written as a single-argument call so the cell runs unchanged on
# both Python 2 and Python 3.)
print('Number of records: %d' % rdd.count())

# Examine RDD partitions
print('Number of partitions %d' % rdd.getNumPartitions())

# RDD Transformations

## map

In [None]:
# Reload the sample file so this cell can be run on its own.
rdd = sc.textFile('data/sample.txt')

# map: transform every line into its length. The builtin `len` is passed
# directly; wrapping it in a lambda adds nothing.
lengths = rdd.map(len)
print(lengths.collect())

# reduce: add the per-line lengths together to get the total character count
# (line terminators are not part of the records).
total = lengths.reduce(lambda a, b: a + b)
print(total)

## flatMap

In [None]:
# flatMap: split every line on whitespace and flatten the pieces into a single
# RDD with one word per record.
rdd2 = rdd.flatMap(lambda x: x.split())

# Get number of words
# (count() is an action that launches a Spark job, so it is called only once
# here; a bare mid-cell `rdd2.count()` would run a job and discard the result.)
print('Number of words: %d' % rdd2.count())

# Examine RDD partitions
print('Number of partitions %d' % rdd2.getNumPartitions())

## filter

In [None]:
# filter: keep only the words equal to "the", ignoring case.
rdd3 = rdd2.filter(lambda x: x.lower() == "the")
# Single-argument print call works on both Python 2 and Python 3.
print('"the" appears %d times' % rdd3.count())

## union

In [None]:
# Words equal to "a", ignoring case.
rdd4 = rdd2.filter(lambda x: x.lower() == "a")
print('"a" appears %d times' % rdd4.count())

# union: concatenate the "the" RDD (rdd3) and the "a" RDD (rdd4); duplicates
# are kept, so the count is the sum of the two individual counts.
rdd5 = rdd3.union(rdd4)
print('"a" and "the" appear %d times in total' % rdd5.count())

## distinct

In [None]:
# distinct: drop duplicate words. The comparison is exact, so words that
# differ only in case are counted separately.
rdd6 = rdd2.distinct()
print('%d distinct words' % rdd6.count())

## sample

In [None]:
# sample: draw a random subset of the words without replacement; each word is
# kept with probability 0.1, so the resulting size varies from run to run.
rdd7 = rdd2.sample(withReplacement=False, fraction=0.1)
# Displayed by the notebook: the size of this particular sample.
rdd7.count()

## intersection

In [None]:
# Two small RDDs that overlap in the values 2 and 3.
rdd01 = sc.parallelize([1, 2, 3])
rdd02 = sc.parallelize([2, 3, 4])
# intersection: the elements present in both RDDs, collected to the driver.
rdd01.intersection(rdd02).collect()

## subtract

In [None]:
rdd01 = sc.parallelize([1, 2, 3])
rdd02 = sc.parallelize([2, 3, 4])
# subtract: elements of the left RDD that do not occur in the right one.
# The label and the result are joined with %-formatting rather than a
# two-argument print, because `print("label", value)` would print a tuple on
# Python 2; this form produces the same text on Python 2 and Python 3.
print("rdd01 - rdd02 %s" % rdd01.subtract(rdd02).collect())
print("rdd02 - rdd01 %s" % rdd02.subtract(rdd01).collect())

## cartesian

In [None]:
# Two three-element RDDs to pair up.
rdd01 = sc.parallelize([1, 2, 3])
rdd02 = sc.parallelize([2, 3, 4])
# cartesian: every (left, right) combination of elements, collected to the driver.
rdd01.cartesian(rdd02).collect()

# RDD Actions

## collect

In [41]:
# Distribute a small local list ...
rdd01 = sc.parallelize([1, 2, 3])
# ... and bring every element back to the driver as a Python list.
rdd01.collect()

[1, 2, 3]

## count

In [42]:
# A three-element RDD.
rdd01 = sc.parallelize([1, 2, 3])
# count: the number of elements in the RDD.
rdd01.count()

3

## countByValue

In [43]:
# An RDD in which the values 2 and 3 each occur twice.
rdd01 = sc.parallelize([1, 2, 3, 2, 3, 4, 5])
# countByValue: a dictionary mapping each distinct value to its number of
# occurrences, returned to the driver.
rdd01.countByValue()

defaultdict(int, {1: 1, 2: 2, 3: 2, 4: 1, 5: 1})

## take

In [44]:
# The integers 0..99 as an RDD.
rdd01 = sc.parallelize(range(100))
# take: the first 10 elements of the RDD.
rdd01.take(num=10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## top

In [45]:
# The integers 0..99 as an RDD.
rdd01 = sc.parallelize(range(100))
# top: the 10 largest elements, returned in descending order.
rdd01.top(num=10)

[99, 98, 97, 96, 95, 94, 93, 92, 91, 90]

## takeOrdered

In [49]:
# takeOrdered: the 6 smallest elements in ascending order.
# (print is written as a single-argument call so the cell runs on both
# Python 2 and Python 3.)
print(sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6))
# Same data split into 2 partitions; negating the sort key reverses the
# ordering, so this yields the 6 largest elements in descending order.
print(sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x))

[1, 2, 3, 4, 5, 6]
[10, 9, 7, 6, 5, 4]


## takeSample

In [51]:
# takeSample: 5 randomly chosen elements, drawn without replacement. No seed
# is given, so the result differs from run to run.
print(sc.parallelize(range(100)).takeSample(False, 5))

[67, 55, 4, 90, 36]


## reduce

In [52]:
# reduce: combine all elements pairwise with the given function; here the
# function is addition, so the result is the sum of 0..9.
print(sc.parallelize(range(10)).reduce(lambda x, y: x + y))

45


## fold

In [72]:
rdd01 = sc.parallelize(range(10))
# Show the partition count first: fold applies the zero value once per
# partition and once more when merging, so a non-neutral zero value would make
# the result depend on this number.
print(rdd01.getNumPartitions())
# fold: like reduce, but starting from an explicit zero value (0 is neutral
# for addition, so this is simply the sum of 0..9).
print(rdd01.fold(0, lambda x, y: x + y))

1
45


## aggregate

In [73]:
rdd01 = sc.parallelize(range(10))
# aggregate: compute the mean in one pass by carrying a
# (running sum, element count) pair as the accumulator.
r = rdd01.aggregate(
    # Zero value: nothing summed, nothing counted.
    (0, 0),
    # seqOp: fold one element into a partition-local accumulator.
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    # combOp: merge the accumulators of two partitions.
    lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1]),
)
# float() forces true division on Python 2 as well as Python 3.
print(r[0] / float(r[1]))


4.5


## foreach

In [81]:
def f(x):
    """Print a single value followed by a newline."""
    # Call form of print: valid on both Python 2 and Python 3.
    print(x)

# foreach: run f once per element for its side effect; nothing is returned.
# NOTE(review): f runs in the Spark worker processes, so its output may land
# in the worker/console logs rather than in the notebook - confirm for your
# deployment.
digits_rdd = sc.parallelize(range(10))
digits_rdd.foreach(f)