# Resilient Distributed Datasets (RDD)

## Acquiring an RDD

### Creating an RDD

In [1]:
# Distribute the integers 0-9 as an RDD; later cells reuse `num_rdd`.
num_rdd = sc.parallelize(list(range(10)))
type(num_rdd)

pyspark.rdd.RDD

### Create a pair RDD

In [2]:
# Build a pair RDD: every element is a (key, value) tuple of (i, i squared).
pair_rdd = sc.parallelize([(i, i * i) for i in range(10)])
# Inspect the pair RDD that was just created (not `num_rdd` from the
# previous cell).
type(pair_rdd)

pyspark.rdd.RDD

### Read an RDD from HDFS

In [3]:
# Load the CSV file from HDFS as an RDD of text lines; later cells treat each
# element as a string.
data_rdd = sc.textFile('hdfs://localhost/data.csv')
type(data_rdd)

pyspark.rdd.RDD

## Transformations

### Map

In [4]:
# map: apply the function to every element, one output per input.
num_rdd.map(lambda value: value * value).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

### Filter

In [5]:
# filter: keep only the elements for which the predicate is true (here, the
# even numbers). Using map() with the same predicate would instead return one
# boolean per element.
num_rdd.filter(lambda x: x % 2 == 0).collect()

[0, 2, 4, 6, 8]

### Flat map

In [6]:
# flatMap: each element produces a list, and the lists are flattened into a
# single RDD (every number appears twice).
num_rdd.flatMap(lambda value: [value] * 2).collect()

[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9]

### Sample

In [7]:
# sample: draw a random subset without replacement. `fraction` is the
# expected share of elements rather than an exact count; the fixed seed keeps
# the draw reproducible.
num_rdd.sample(
    withReplacement=False,
    fraction=0.2,
    seed=37,
).collect()

[0, 3, 7]

### Union

In [8]:
# union: concatenate two RDDs; duplicates are kept (see distinct() to drop
# them).
num_rdd.union(num_rdd).collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### Intersection

In [9]:
# intersection: keep only the elements present in both RDDs.
left = sc.parallelize([1, 2, 3])
right = sc.parallelize([3, 4, 5])

left.intersection(right).collect()

[3]

### Distinct

In [10]:
# distinct: remove duplicate elements.
with_dupes = sc.parallelize([1, 2, 2, 3, 4])
with_dupes.distinct().collect()

[1, 2, 3, 4]

### Group by key

In [11]:
pairs = sc.parallelize([(1, 1), (1, 2), (2, 4), (2, 3)])

# groupByKey: gather all values sharing a key. Each group comes back as an
# iterable, so it is materialised with list() before printing.
grouped = pairs.groupByKey().collect()
for key, values in grouped:
    print(key, list(values))

1 [1, 2]
2 [4, 3]


### Reduce by key

In [12]:
# reduceByKey: combine the values of each key with a binary function (here a
# per-key sum).
pairs = sc.parallelize([(1, 1), (1, 2), (2, 4), (2, 3)])
pairs.reduceByKey(lambda total, value: total + value).collect()

[(1, 3), (2, 7)]

### Aggregate by key

In [13]:
# aggregateByKey: seqFunc folds each value into an accumulator that starts at
# zeroValue; combFunc merges the accumulators built for the same key.
pairs = sc.parallelize([(1, 1), (1, 2), (2, 4), (2, 3)])
pairs.aggregateByKey(
    zeroValue='value',
    seqFunc=lambda s, d: f'{s} {d}',
    combFunc=lambda s1, s2: f'{s1}, {s2}',
).collect()

[(1, 'value 1, value 2'), (2, 'value 4, value 3')]

### Sort by key

In [14]:
# sortByKey: order the pairs by key (ascending by default).
unsorted_pairs = sc.parallelize([(1, 1), (3, 2), (5, 4), (4, 3)])
unsorted_pairs.sortByKey().collect()

[(1, 1), (3, 2), (4, 3), (5, 4)]

### Join

In [15]:
# join: inner join on key; the result pairs each key with a
# (left_value, right_value) tuple.
left = sc.parallelize([(1, 1), (2, 2), (3, 3)])
right = sc.parallelize([(1, 2), (2, 3), (3, 4)])
left.join(right).collect()

[(1, (1, 2)), (2, (2, 3)), (3, (3, 4))]

### Left outer join

In [16]:
# leftOuterJoin: keep every key of the left RDD; a key missing on the right
# gets None as its right-hand value.
left = sc.parallelize([(1, 1), (2, 2), (3, 3)])
right = sc.parallelize([(1, 2), (2, 3), (4, 5)])
left.leftOuterJoin(right).collect()

[(1, (1, 2)), (2, (2, 3)), (3, (3, None))]

### Right outer join

In [17]:
# rightOuterJoin: keep every key of the right RDD; a key missing on the left
# gets None as its left-hand value.
left = sc.parallelize([(1, 1), (2, 2), (3, 3)])
right = sc.parallelize([(1, 2), (2, 3), (4, 5)])
left.rightOuterJoin(right).collect()

[(1, (1, 2)), (2, (2, 3)), (4, (None, 5))]

### Full outer join

In [18]:
# fullOuterJoin: keep the keys of both RDDs, filling the missing side with
# None.
left = sc.parallelize([(1, 1), (2, 2), (3, 3)])
right = sc.parallelize([(1, 2), (2, 3), (4, 5)])
left.fullOuterJoin(right).collect()

[(1, (1, 2)), (2, (2, 3)), (3, (3, None)), (4, (None, 5))]

### Cogroup

In [19]:
numbers = sc.parallelize([(1, 1), (2, 2), (3, 3)])
letters = sc.parallelize([(1, 'a'), (2, 'b'), (3, 'c')])

# cogroup: for each key, yield one iterable of values per input RDD.
cogrouped = numbers.cogroup(letters).collect()
for key, (number_values, letter_values) in cogrouped:
    print(key, list(number_values), list(letter_values))

1 [1] ['a']
2 [2] ['b']
3 [3] ['c']


### Cartesian

In [20]:
# cartesian: every (x, y) combination of the two RDDs (4 x 4 = 16 pairs).
numbers = sc.parallelize([1, 2, 3, 4])
letters = sc.parallelize(['a', 'b', 'c', 'd'])
numbers.cartesian(letters).collect()

[(1, 'a'),
 (1, 'b'),
 (1, 'c'),
 (1, 'd'),
 (2, 'a'),
 (2, 'b'),
 (2, 'c'),
 (2, 'd'),
 (3, 'a'),
 (3, 'b'),
 (3, 'c'),
 (3, 'd'),
 (4, 'a'),
 (4, 'b'),
 (4, 'c'),
 (4, 'd')]

### Repartition

In [21]:
# Show the default partition count, then redistribute into exactly 2
# partitions with repartition().
words = sc.parallelize(['hello', 'world'])
print(words.getNumPartitions())

words = words.repartition(2)
print(words.getNumPartitions())

12
2


### Coalesce

In [22]:
# Show the default partition count, then shrink it to 2 with coalesce().
words = sc.parallelize(['hello', 'world'])
print(words.getNumPartitions())

words = words.coalesce(2)
print(words.getNumPartitions())

12
2


### Pipe

In [23]:
# pipe: send the elements of each partition through an external command and
# collect its output lines. The script is not shown here; judging by the
# output it prints a banner line per invocation and then echoes its input.
words = sc.parallelize(['hello', 'world'])
words.pipe('/root/ipynb/echo.sh').collect()

['One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'hello',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'One-Off Coder',
 'world']

In [24]:
# Same pipe, but over 2 partitions: the command runs once per partition.
words = sc.parallelize(['hello', 'world'])
words = words.repartition(2)
words.pipe('/root/ipynb/echo.sh').collect()

['One-Off Coder', 'One-Off Coder', 'hello', 'world']

In [25]:
# Same pipe over a single partition: the command runs exactly once.
words = sc.parallelize(['hello', 'world'])
words = words.repartition(1)
words.pipe('/root/ipynb/echo.sh').collect()

['One-Off Coder', 'hello', 'world']

### Repartition and sort within partitions

In [26]:
# Use the whole (group, value) tuple as the key so records are sorted by the
# full tuple inside each partition, while partitionFunc routes records by
# their group (tuple[0] modulo 2). The final map drops the helper value and
# returns the original tuples.
(
    sc.parallelize([(1, 5), (2, 15), (1, 4), (2, 14), (1, 3), (2, 13)])
    .map(lambda record: (record, record[1]))
    .repartitionAndSortWithinPartitions(
        numPartitions=2,
        partitionFunc=lambda key: key[0] % 2,
    )
    .map(lambda keyed: keyed[0])
    .collect()
)

[(2, 13), (2, 14), (2, 15), (1, 3), (1, 4), (1, 5)]

## Actions

### Reduce

In [27]:
# reduce: fold all elements into one value with a binary function (a sum).
numbers = sc.parallelize([1, 2, 3])
numbers.reduce(lambda total, value: total + value)

6

### Collect

In [28]:
# collect: bring every element back to the driver as a Python list.
numbers = sc.parallelize([1, 2, 3])
numbers.collect()

[1, 2, 3]

### Count

In [29]:
# count: number of elements in the RDD.
numbers = sc.parallelize([1, 2, 3])
numbers.count()

3

### First

In [30]:
# first: return the first element of the RDD.
numbers = sc.parallelize([1, 2, 3])
numbers.first()

1

### Take

In [31]:
# take: return the first n elements as a list.
numbers = sc.parallelize([1, 2, 3])
numbers.take(2)

[1, 2]

### Take sample

In [32]:
# takeSample: return exactly `num` randomly chosen elements; the fixed seed
# makes the draw reproducible.
numbers = sc.parallelize(list(range(100)))
numbers.takeSample(
    withReplacement=False,
    num=10,
    seed=37,
)

[52, 58, 63, 69, 23, 22, 75, 9, 0, 99]

### Take ordered

In [33]:
from random import randint

# takeOrdered: return the n smallest elements in ascending order. The input
# is random, so the values shown differ from run to run.
random_numbers = sc.parallelize([randint(1, 10000) for _ in range(1000)])
random_numbers.takeOrdered(10)

[3, 11, 13, 20, 22, 27, 33, 38, 39, 49]

### Count by key

In [34]:
# countByKey: count how many elements carry each key, returned to the driver
# as a dictionary. Keys are random, so the counts differ from run to run.
random_pairs = sc.parallelize([(randint(1, 10), 1) for _ in range(10000)])
random_pairs.countByKey()

defaultdict(int,
            {7: 1032,
             4: 999,
             1: 1072,
             5: 973,
             2: 933,
             6: 1050,
             3: 947,
             10: 1007,
             9: 973,
             8: 1014})

## Chaining transformations and actions

### Map, filter, reduce

In [35]:
num_rdd = sc.parallelize(list(range(10)))

# Square every number, keep the even squares, then sum them.
(
    num_rdd
    .map(lambda value: value * value)
    .filter(lambda square: square % 2 == 0)
    .reduce(lambda total, square: total + square)
)

120

### Filter, map, take

In [36]:
data_rdd = sc.textFile('hdfs://localhost/data.csv')

# Skip lines starting with 'x' (presumably the header row - confirm against
# the data file), split the remaining lines on commas, parse every field as
# an int, and fetch the first 10 rows.
(
    data_rdd
    .filter(lambda line: not line.startswith('x'))
    .map(lambda line: line.split(','))
    .map(lambda fields: [int(field) for field in fields])
    .take(10)
)

[[14, 22, 25, 63, 47, 52, 13, 14, 23, 27],
 [35, 80, 38, 28, 73, 69, 21, 16, 76, 53],
 [46, 37, 46, 55, 78, 68, 61, 62, 81, 82],
 [19, 12, 45, 50, 71, 63, 94, 7, 10, 77],
 [50, 94, 94, 87, 67, 89, 73, 17, 39, 7],
 [47, 97, 64, 7, 47, 40, 77, 63, 50, 21],
 [33, 0, 99, 46, 43, 32, 47, 20, 4, 67],
 [46, 100, 28, 8, 34, 49, 62, 77, 4, 51],
 [14, 12, 50, 96, 57, 59, 40, 87, 44, 48],
 [13, 48, 30, 62, 88, 99, 65, 94, 13, 34]]

### Merging dictionaries

In [37]:
# Count occurrences per key, wrap each (key, count) pair in a one-entry dict,
# then merge all the dicts into a single dictionary on the driver.
(
    sc.parallelize([(randint(1, 10), 1) for _ in range(10000)])
    .reduceByKey(lambda total, value: total + value)
    .map(lambda pair: {pair[0]: pair[1]})
    .reduce(lambda merged, single: {**merged, **single})
)

{1: 1008,
 2: 1036,
 3: 1037,
 4: 1026,
 5: 1038,
 6: 1014,
 7: 950,
 8: 924,
 9: 977,
 10: 990}

## Broadcasting variables

In [38]:
from random import randint

# Build a lookup table on the driver and broadcast it, so tasks read it
# through `.value` inside the map function.
lookup = {i: randint(1, 10) for i in range(101)}
lookup_bc = sc.broadcast(lookup)

# Translate each random number through the broadcast table, then count how
# often each translated value occurs.
(
    sc.parallelize([randint(1, 100) for _ in range(20000)])
    .map(lambda num: (lookup_bc.value[num], 1))
    .reduceByKey(lambda total, value: total + value)
    .collect()
)

[(1, 1774),
 (2, 2443),
 (3, 1873),
 (4, 2013),
 (5, 1559),
 (6, 2376),
 (7, 1166),
 (8, 2400),
 (9, 1610),
 (10, 2786)]

## Accumulator

In [39]:
# Shared counter that tasks can only add to; the driver reads the total.
accum = sc.accumulator(0)

# Update the accumulator from an action (foreach) rather than a
# transformation (map): Spark only guarantees that each task's update is
# applied exactly once for updates made inside actions. Updates made inside
# a transformation may be applied again if a task or stage is re-executed.
sc.parallelize(list(range(10000)))\
    .foreach(lambda num: accum.add(1))

accum.value

10000