In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

# Spark Review 

### Example 0

In [7]:
data = sc.textFile('./data/USF_Mission.txt')

In [8]:
# Split every line on whitespace and flatten the pieces into one RDD of words
# (a single flatMap replaces the former map + identity flatMap pair).
data = data.flatMap(lambda line: line.split())

In [11]:
# Pair every word with a count of 1, then sum the counts per word.
word_count = (
    data.map(lambda word: (word, 1))
        .reduceByKey(lambda left, right: left + right)
)

In [13]:
top5_word = word_count.sortBy(lambda x: x[1], ascending=False).take(5)

In [22]:
# Each element of top5_word is a (word, count) pair.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for word, count in top5_word:
    print("%s : %s" % (word, count))

and : 25
the : 22
of : 19
to : 18
in : 13


## Tuning Parallelism

### Partition: 

- For parallel collections, users can specify the number of partitions to cut the RDD into.

- Form: 
```
sc.parallelize(data).transformation(...,partitionSize)
```

- How to check number of partitions: 
    1. `getNumPartitions()`
    2. `glom()`
    
    
- Rule of thumb for the number of partitions: about twice the number of CPU cores.

### Problem
- Sending data back and forth between executors on parallelized distributed system causes network traffic.
- Better to place data that can minimize shuffling and improve performance. 
- Workload might not be evenly distributed in some partitions.
- May cause efficiency or memory issues.
- Example: 
```python
lines = sc.textFile("filtered_registered_business_sf.csv")
sf_business = lines.map(lambda x: (x.split(",")[0],x)).persist()
```
- Efficient if tuples with the same key are in the same partition
    + e.g. reduceByKey: have to shuffle the data -> costly 
    + Solution: putting the data with the same key in the same partition can reduce shuffling

### Example 1

Define “data” which is numbers between 1 and 9 and load them into random number of partitions (between 1 and 5).

In [27]:
import random as rand 
data = sc.parallelize(range(1,10), rand.randint(1,5))
data.getNumPartitions()

5

In [29]:
data.glom().collect()

[[1], [2, 3], [4, 5], [6, 7], [8, 9]]

### Example 2
- Parallelize data into 3 partitions.
- Perform reduceByKey(lambda x,y : x+y) and check the number of partitions.
- Perform reduceByKey(lambda x,y : x+y, NUM) and check the number of partitions.

In [78]:
data = [('a',3),('b',4),('a',5)]
data = sc.parallelize(data, 3)
data.glom().collect()

[[('a', 3)], [('b', 4)], [('a', 5)]]

In [79]:
data1 = data.reduceByKey(lambda x, y: x+y)
print(data1.getNumPartitions())
print(data1.glom().collect())

3
[[], [('b', 4)], [('a', 8)]]


In [80]:
data2 = data.reduceByKey(lambda x, y: x+y, 8)
print(data2.getNumPartitions())
print(data2.glom().collect())

8
[[('a', 8)], [], [], [('b', 4)], [], [], [], []]


### Solution: **Partitioner**
- Available on pair RDDs via `partitionBy()`.
- Defines how the elements in a key-value pair RDD are **partitioned by key**.
- Manage data commonly accessible together on the **same node**, i.e. same partition
- Organize data for minimizing network traffic/communication to improve
 performance.
- Maps each key to a partition ID, from 0 to numPartitions - 1. 


- What are the rules that determine where the data should go? 
- Types : HashPartitioner, RangePartitioner, Custom Partitioner. 
    1. **HashPartitioner**: partitioner using a hash value of a key.
          `.partitionBy(N)` uses HashPartitioner by default.
    2. **RangePartitioner**: partitioner partitioning sorted RDDs into roughly equal ranges. 
    3. **CustomPartitioner**: User-defined partitioner.


- Example
```python
def custom_partitioner(key): return hash(key + 10)
pair_rdd.partitionBy(N, custom_partitioner)
```

- partitionBy() : Spark knows that the resulting RDD is hash-partitioned.
- **persist() : Without persisting an RDD after partitioning, it will cause subsequent uses of the RDD to repeat the partitioning of the data.**
- Much Faster : Only shuffle supervisor data, sending supervisor data with each particular key to the machine that has the corresponding key of business.

### Example 3 
Try different types of operations to change the number of partitions.

In [46]:
data = sc.parallelize([('a',3),('b',4),('a',5), ('c',3), ('b',5)], 8)
data.getNumPartitions()

8

In [47]:
# HashPartitioner - using hash value of a key
data1 = data.partitionBy(8) # partitionBy uses HashPartitioner by default.
data1.glom().collect()

[[('a', 3), ('a', 5)], [], [('c', 3)], [('b', 4), ('b', 5)], [], [], [], []]

In [48]:
# RangePartitioner - sorting key
data2 = data.sortByKey()
data2.glom().collect()

[[('a', 3), ('a', 5)], [], [], [('b', 4), ('b', 5)], [], [], [('c', 3)], []]

In [51]:
# CustomPartitioner
def custom_partitioner(key):
    """Return the partitioning hash for ``key``: the key's hash shifted by 10.

    The offset is added to the hash value rather than to the key itself, so
    the function also accepts non-numeric keys such as the string keys of
    ``data`` (``'a' + 10`` raises ``TypeError``).
    """
    return hash(key) + 10

data3 = data.partitionBy(8, custom_partitioner) # custom_partitioner is passed as the partition function instead of relying on the default.
# data3.glom().collect()

### Example 4
Join “filtered_registered_business_sf.csv” and “supervisor_sf.csv” efficiently.

In [99]:
biz_raw = sc.textFile('./data/filtered_registered_business_sf.csv')
supervisor_raw = sc.textFile('./data/supervisor_sf.csv')

In [143]:
# textFile() already yields one record per line, so each line only needs to be
# split into its comma-separated fields (no extra flatMap on "\n" is required).
# NOTE(review): join() below appears to use only the first two fields of each
# record as (key, value) -- see the joined.first() output -- so the remaining
# business fields are dropped from the joined result.
biz = biz_raw.map(lambda line: line.split(","))
supervisor = supervisor_raw.map(lambda line: line.split(","))

In [144]:
biz.count(), supervisor.count()

(198566, 75)

In [145]:
biz.take(1)

[[u'94123',
  u'Tournahu George L',
  u'3301 Broderick St',
  u'San Francisco',
  u'CA']]

In [146]:
supervisor.take(3)

[[u'94102', u'8'], [u'94102', u'6'], [u'94102', u'3']]

In [147]:
%%time
# NOTE(review): join() is a lazy transformation and no action runs in this cell,
# so this timing presumably covers only building the lineage, not the shuffle.
# Time an action on `joined` (e.g. count()) to measure the join itself.
joined = biz.join(supervisor)

CPU times: user 12.3 ms, sys: 2.77 ms, total: 15.1 ms
Wall time: 25 ms


In [148]:
joined.first()

(u'94123', (u'Tournahu George L', u'2'))

In [149]:
# with partitionBy
biz2 = biz_raw.map(lambda x: (x.split(",")[0],x)).partitionBy(5).persist()

In [150]:
biz2.first()

(u'94109',
 u'94109,Stephens Institute Inc,1835-49 Van Ness Ave,San Francisco,CA')

In [151]:
%%time
# NOTE(review): as no action is called here, this timing presumably reflects only
# the construction of the join lineage on the pre-partitioned, persisted biz2.
joined = biz2.join(supervisor)

CPU times: user 14.1 ms, sys: 5.52 ms, total: 19.6 ms
Wall time: 29.2 ms


### Important Note

In [159]:
sc.parallelize([(1,2), (1,3), (2,3)]).partitionBy(5).glom().collect()

[[], [(1, 2), (1, 3)], [(2, 3)], [], []]

In [162]:
# partitionBy() only works on (key, value) pairs; tuples with more than two
# elements cannot be unpacked into a key and a value.
try:
    sc.parallelize([(1,2,3), (1,3,4), (2,3,5)]).partitionBy(5).collect()
except Exception:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
    print("ValueError: too many values to unpack")

ValueError: too many values to unpack


### Example 5
Create a custom partitioner using
1. hash value of the key.
2. hash value of the key + 10.

In [164]:
data = sc.parallelize([(1, 'a'), (2, 'b'), (3, 'c'), (1, 'd'), (1, 'e'), (2, 'f')])
data.glom().collect()

[[(1, 'a')], [(2, 'b'), (3, 'c')], [(1, 'd')], [(1, 'e'), (2, 'f')]]

In [167]:
# CustomPartitioner
def custom_partitioner1(key):
    """Partition by the plain hash value of ``key``."""
    return hash(key)


def custom_partitioner2(key):
    """Partition by the hash value of ``key`` shifted by 10.

    The offset is applied to the hash, not to the key, so non-numeric keys
    (e.g. strings) are accepted as well. For the small integer keys used in
    this example the result is the same as ``hash(key + 10)``.
    """
    return hash(key) + 10

In [170]:
data1 = data.partitionBy(4, custom_partitioner1) # custom_partitioner1 is supplied as the partition function instead of the default.
data1.glom().collect()

[[], [(1, 'a'), (1, 'd'), (1, 'e')], [(2, 'b'), (2, 'f')], [(3, 'c')]]

In [171]:
data2 = data.partitionBy(4, custom_partitioner2) # custom_partitioner2 is supplied as the partition function instead of the default.
data2.glom().collect()

[[(2, 'b'), (2, 'f')], [(3, 'c')], [], [(1, 'a'), (1, 'd'), (1, 'e')]]

### Operations and Partitioner
*Operations benefiting from partitioning* 

- Operations involving **shuffling data by key** across the network
    + join(), leftOuterJoin(), rightOuterJoin()
    + groupByKey(), reduceByKey(), combineByKey()
    + lookup(), etc.


- Operations that return RDDs with **known partitioning information**. 
    + sortByKey(): range-partition
    + groupByKey(): hash-partition
    + most of them use hash-partitioning


- Operations that forget the parent’s partitioning information.
    + map(): because it can theoretically modify the key of each record.
    + map() does not really benefit from reducing shuffling

### Repartitioning RDDs
Change the partitioning to distribute the workload more efficiently or avoid memory problems.

- **`repartition(numPartitions: Int)`**
    - basically move data from one partition to the other
    - does not guarantee a minimal amount of shuffling
    - shuffle data across the network to create a new set of partitions.


- **`coalesce(numPartitions: Int, shuffle = false)`**
    - guaranteed to minimize the amount of shuffling
    - **optimized** version of repartition() 
    - avoids data movement when reducing the number of RDD partitions.
    - match the locality as much as possible, but try to balance partitions across the machines.
    - as number of partition increases, the performance of `coalesce()` is not much different from `repartition()`
    - by default, **`shuffle=False`**: the number of partitions can only be reduced; asking for more partitions than the parent has leaves the number unchanged (i.e. no repartitioning)
    - with **`shuffle=True`**, the output is evenly distributed amongst the partitions (and you are also able to increase the # of partitions if you want)

#### repartition() vs. partitionBy()
`repartition` already exists in RDDs, and does **not handle partitioning by key** (or by any other criterion except Ordering). Now PairRDDs add the notion of keys and subsequently add another method that allows to partition by that key.

### Example 6 

Compare .coalesce() and .repartition().
- Which one shuffles data less?
- Can the number of partitions be smaller than its parent’s number of partitions? ==> Yes

In [172]:
data = sc.parallelize([(1, 'a'), (2, 'b'), (3, 'c'), (1, 'd'), (1, 'e'), (2, 'f')])
data.glom().collect()

[[(1, 'a')], [(2, 'b'), (3, 'c')], [(1, 'd')], [(1, 'e'), (2, 'f')]]

In [174]:
%%time
data1 = data.repartition(5)
data1.glom().collect()

CPU times: user 7.33 ms, sys: 2.83 ms, total: 10.2 ms
Wall time: 107 ms


In [175]:
%%time
data2 = data.coalesce(5, shuffle=True)
data2.glom().collect()

CPU times: user 10.4 ms, sys: 3.66 ms, total: 14 ms
Wall time: 120 ms


In [176]:
data3 = data.repartition(2)
data3.glom().collect()

[[(1, 'a'), (2, 'b'), (3, 'c'), (1, 'd'), (1, 'e'), (2, 'f')], []]

### Example 7 
- Determine a measure of importance (a “rank”) to each document based on how many documents have links to it.
- Applications : Rank web pages, influential users in a social network, etc.
- (Also could be implemented using GraphX.)

Write a PageRank algorithm where data = [(1,[2,3,4]), (2,[1,3]), (3,[4])] and the format is (URL, [LIST OF URLS])

### Partition-specific Methods 
Operations specifically designed to interact with partitions as atomic units.
- `foreachPartition()` : Apply a function to each partition of an RDD.
- `glom()` : Return all elements within each partition.
- `lookup(key)`: Return values for the key using the partitioner to narrow its search to only the partitions where the key would be present.
- `mapPartitions()`: Return a new RDD by applying a function to each partition of the RDD.

## RDD Dependencies / Lineage

A dependency between an old and a new RDD is created, every time a transformation is performed on an RDD.
- Spark’s execution model is based on directed acyclic graphs (DAGs), where nodes are RDDs and edges are dependencies.
- The new RDD depends on the old RDD.
- **RDD Resilience** - As Spark records the lineage of each RDD, any RDD can be reconstructed to the state it was in at the time of the failure.

### RDD Dependency Types
- **Narrow** – When no data shuffle between partitions is required.
- **Wide** - When it requires shuffle when joining RDDs (more expensive)

### Example 8

In [177]:
# 500 random integers in [0, 10), spread over 5 partitions.
# The intermediate list is no longer named `list`, which shadowed the builtin.
random_values = [rand.randrange(10) for _ in range(500)]
listrdd = sc.parallelize(random_values, 5)

In [178]:
pairs = listrdd.map(lambda x: (x, x*x)) # narrow

In [179]:
reduced = pairs.reduceByKey(lambda v1, v2: v1+v2) # wide

In [182]:
finalrdd = reduced.mapPartitions(lambda itr: 
                                ["K="+str(k) + ", V="+str(v) for (k,v) in itr]) # narrow

In [183]:
finalrdd.collect()

['K=0, V=0',
 'K=5, V=1375',
 'K=1, V=50',
 'K=6, V=1512',
 'K=2, V=244',
 'K=7, V=1666',
 'K=8, V=3136',
 'K=3, V=468',
 'K=9, V=4455',
 'K=4, V=896']

### Spark stages and tasks.
- Every job is divided into **stages** based on the points **where shuffles occur**.
- For each stage, tasks are created and sent to the executors.
- After all tasks of a particular stage complete, the **driver creates tasks** for the next stage and sends them to the executors.
- The results of each stage are saved on disk as **intermediate files** on executor machines
- During the next stage, each partition receives data from these intermediate files belonging to it, and the execution is continued

`toDebugString()`
- Shows a textual representation of RDD dependencies.
- The RDDs in the output appear in reverse order.
- Useful in trying to minimize the number of shuffles.
- The numbers in parentheses show the number of partitions of the corresponding RDD.
- Every time you see a `ShuffledRDD` in the lineage chain, you can be sure that a shuffle will be performed at that point.

In [186]:
# indent "+-" means shuffling
# (5) is the number of partitions
# 2 stages for 1 shuffle, 3 stages for 2 shuffles
# print(...) with one argument works on both Python 2 and Python 3.
print(finalrdd.toDebugString())

(5) PythonRDD[535] at collect at <ipython-input-183-036dee88a049>:1 []
 |  MapPartitionsRDD[533] at mapPartitions at PythonRDD.scala:427 []
 |  ShuffledRDD[532] at partitionBy at <unknown>:0 []
 +-(5) PairwiseRDD[531] at reduceByKey at <ipython-input-179-f90113ad700e>:1 []
    |  PythonRDD[530] at reduceByKey at <ipython-input-179-f90113ad700e>:1 []
    |  ParallelCollectionRDD[529] at parallelize at PythonRDD.scala:480 []


In [188]:
# reduceByKey followed by sortByKey: the lineage printed below shows one
# ShuffledRDD for each of the two operations.
order_finalRdd = pairs.reduceByKey(lambda v1, v2: v1+v2).sortByKey()
print(order_finalRdd.toDebugString())

(5) PythonRDD[556] at RDD at PythonRDD.scala:48 []
 |  MapPartitionsRDD[555] at mapPartitions at PythonRDD.scala:427 []
 |  ShuffledRDD[554] at partitionBy at <unknown>:0 []
 +-(5) PairwiseRDD[553] at sortByKey at <ipython-input-188-d06ec506fdd0>:1 []
    |  PythonRDD[552] at sortByKey at <ipython-input-188-d06ec506fdd0>:1 []
    |  MapPartitionsRDD[549] at mapPartitions at PythonRDD.scala:427 []
    |  ShuffledRDD[548] at partitionBy at <unknown>:0 []
    +-(5) PairwiseRDD[547] at reduceByKey at <ipython-input-188-d06ec506fdd0>:1 []
       |  PythonRDD[546] at reduceByKey at <ipython-input-188-d06ec506fdd0>:1 []
       |  ParallelCollectionRDD[529] at parallelize at PythonRDD.scala:480 []


In [190]:
# repartition(5) adds one more shuffle on top of the existing lineage.
print(order_finalRdd.repartition(5).toDebugString())

(5) MapPartitionsRDD[566] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  CoalescedRDD[565] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  ShuffledRDD[564] at coalesce at NativeMethodAccessorImpl.java:0 []
 +-(5) MapPartitionsRDD[563] at coalesce at NativeMethodAccessorImpl.java:0 []
    |  PythonRDD[562] at RDD at PythonRDD.scala:48 []
    |  MapPartitionsRDD[555] at mapPartitions at PythonRDD.scala:427 []
    |  ShuffledRDD[554] at partitionBy at <unknown>:0 []
    +-(5) PairwiseRDD[553] at sortByKey at <ipython-input-188-d06ec506fdd0>:1 []
       |  PythonRDD[552] at sortByKey at <ipython-input-188-d06ec506fdd0>:1 []
       |  MapPartitionsRDD[549] at mapPartitions at PythonRDD.scala:427 []
       |  ShuffledRDD[548] at partitionBy at <unknown>:0 []
       +-(5) PairwiseRDD[547] at reduceByKey at <ipython-input-188-d06ec506fdd0>:1 []
          |  PythonRDD[546] at reduceByKey at <ipython-input-188-d06ec506fdd0>:1 []
          |  ParallelCollectionRDD[529] at parallelize

### Checkpoint 

- When caching the RDDs, it will remember all the history in the Lineage Graph / Dependencies. Keeping all of it will lead to overhead.
- Use **checkpoint** - saving the data itself on the disk
- Persist RDDs to disk.
- After checkpointing, the RDD ’s dependencies are erased, as well as the information about its parent(s), because they won’t be needed for its recomputation any more.
- cf. persist(), cache() : Keeps RDD’s dependencies.


- Example: 

```python
sc.setCheckpointDir("dir") # sets the directory where RDDs will be checkpointed.
RDD.checkpoint() # will be triggered once an action is called.
RDD.action() # After checkpointing, the RDD lineage including its parents’ information will be removed.
```

### Example 8.1

In [197]:
finalRdd = order_finalRdd.repartition(5)

In [198]:
# Full lineage before checkpointing.
print(finalRdd.toDebugString())

(5) MapPartitionsRDD[578] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  CoalescedRDD[577] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  ShuffledRDD[576] at coalesce at NativeMethodAccessorImpl.java:0 []
 +-(5) MapPartitionsRDD[575] at coalesce at NativeMethodAccessorImpl.java:0 []
    |  PythonRDD[574] at RDD at PythonRDD.scala:48 []
    |  MapPartitionsRDD[555] at mapPartitions at PythonRDD.scala:427 []
    |  ShuffledRDD[554] at partitionBy at <unknown>:0 []
    +-(5) PairwiseRDD[553] at sortByKey at <ipython-input-188-d06ec506fdd0>:1 []
       |  PythonRDD[552] at sortByKey at <ipython-input-188-d06ec506fdd0>:1 []
       |  MapPartitionsRDD[549] at mapPartitions at PythonRDD.scala:427 []
       |  ShuffledRDD[548] at partitionBy at <unknown>:0 []
       +-(5) PairwiseRDD[547] at reduceByKey at <ipython-input-188-d06ec506fdd0>:1 []
          |  PythonRDD[546] at reduceByKey at <ipython-input-188-d06ec506fdd0>:1 []
          |  ParallelCollectionRDD[529] at parallelize

In [199]:
sc.setCheckpointDir("checkpoint")
finalRdd.checkpoint()
# checkpoint() only marks the RDD; it is not checkpointed until an action runs,
# so this prints False.
print(finalRdd.isCheckpointed())

False


In [200]:
finalRdd.count() # An action has to run before isCheckpointed() reports True
print(finalRdd.isCheckpointed())
print(finalRdd.getCheckpointFile())

True
file:/Users/ThyKhueLy/msan697/inclass/checkpoint/51ba1be1-c290-4e25-9d19-5f34017eac01/rdd-578


In [201]:
# After checkpointing, the lineage is truncated to the checkpoint RDD.
print(finalRdd.toDebugString())

(5) MapPartitionsRDD[578] at coalesce at NativeMethodAccessorImpl.java:0 []
 |  ReliableCheckpointRDD[580] at count at <ipython-input-200-87fc17c4d4eb>:1 []


## Shared Variables

Using shared variables to communicate with Spark executors.


- Normally, when a function passed to a Spark operation is executed, it works on separate copies of all the variables used in the function. 
- These variables are copied to each partition, and updates to the variables are not propagated back to the driver program.

- Solution : Shared variables, help maintain a global state or share data across tasks and partitions.

**1. Accumulator : Aggregate information from executor nodes to the driver node - Write-only**
- Is shared across executors that you can only add to.
- Useful to implement global sums and counters.
- Can be accessed by **driver node**, not from an executor node.
- Create using

```python
sc.accumulator(initial_value)
```

- The executor can add to the accumulator with .add(val) method or +. (it is **write-only.**) ==> Cannot read!
- The driver can call it using `.value` - NOT the executor. i.e. you cannot pass it within a transformation or action
- Not optimal for production. Mostly for debugging.
    
**2. Broadcast variable : Efficiently distribute large read-only values to executor nodes - Read-only**.
- Efficiently distribute large read-only values such as lookup tables to executor nodes.
- The value is sent to each node only once (not per task). 
- Create a broadcast variable using 

```
sc.broadcast(value)
```

- If pass it via a map/reduce function ==> Error!
- Access the value with `.value`.
- Should `.unpersist()` for removing a broadcast variable from memory on all workers.
- example: `rdd.filter(lambda x: x[1] == prod.value)` is good but you can't have `prod.value+1`

- Advantages
    + Use an efficient and scalable peer-to-peer distribution mechanism. 
    + Eliminate the need for a shuffle operation.
    + Replicate data once per worker (not once per task). 
    + Are serialized objects (can be read efficiently.).

### Example 9
- Define an accumulator variable and initialize the value to be 0.
- Generate values between 1 and 10,000,000 using .parallelize().
- For each value, increment accumulator variable.

In [202]:
acc = sc.accumulator(0)

In [204]:
nums = sc.parallelize(range(1, 10000001))

In [205]:
nums.foreach(lambda x: acc.add(1)) # for each value, increment accumulator variable.

In [206]:
acc.value # similar as list.count()

10000000

An exception occurs when you try to access accumulator values in executor nodes:

In [208]:
# Reading acc.value inside a task is expected to fail; report it instead of crashing.
try:
    nums.foreach(lambda x: acc.value)
except Exception:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
    print("Accumulator.value cannot be accessed inside tasks")

Accumulator.value cannot be accessed inside tasks


### Example 10

- Count how many status data are collected where station_id is ’10’.
- Compare an output of 1) .count(), 2) an accumulator value incremented within a transformation and 3) an accumulator value incremented within an action.

In [39]:
raw = sc.textFile('../hw1/input_1/status.csv')

In [40]:
# bike = raw.map(lambda x: ((x.split(","))[0], x))
bike = raw.map(lambda x: x.split(",")).map(lambda x: (x[0], x[1:]))

In [41]:
bike = bike.partitionBy(10)

In [42]:
bike.take(1)

[(u'13', [u'7', u'8', u'"2014-09-26 08:12:02"'])]

In [23]:
bike_filtered1 = bike.filter(lambda x: x[0]=='10')

In [24]:
%%time
bike_filtered1.count()

CPU times: user 8.7 ms, sys: 4.67 ms, total: 13.4 ms
Wall time: 34.6 s


523623

In [25]:
bike_filtered1.take(1)

[(u'10', [u'7', u'8', u'"2014-12-30 15:37:02"'])]

In [44]:
acc1 = sc.accumulator(0) # within transformation
acc2 = sc.accumulator(0) # within action

In [45]:
# Accumulator incremented inside a transformation (this function is used with map()).
# NOTE(review): the increment presumably runs again each time the mapped RDD is
# re-evaluated, so acc1 can over-count -- confirm against the printed values.
def filter_st(x):
    """Count rows of station '10' in acc1 and pass them through.

    Returns the row unchanged when its first field is '10'; every other row
    yields None.
    """
    if x[0] != '10':
        return None
    acc1.add(1)
    return x

In [46]:
bike_filtered2 = bike.map(filter_st)

In [49]:
%%time
# an accumulator value incremented within an action.
bike.foreach(lambda x: acc2.add(1))

CPU times: user 10.1 ms, sys: 10.7 ms, total: 20.9 ms
Wall time: 50 s


In [18]:
# Compare the row count with the two accumulators
# (acc1: updated inside map(); acc2: updated inside foreach()).
print(bike_filtered2.count())
print(acc1.value)
print(acc2.value)

36647622
1047246
36647622


In [48]:
bike_filtered2.take(2)

[None, None]

### Example 11
Generate a broadcast variable and access, modify and unpersist it.

In [50]:
nums = sc.parallelize(range(10, 100))
bc = sc.broadcast([1,2,3])
type(bc)

pyspark.broadcast.Broadcast

In [51]:
bc.value

[1, 2, 3]

In [52]:
add_bc = nums.map(lambda x: x + bc.value[0])

In [53]:
add_bc.collect()[:5]

[11, 12, 13, 14, 15]

In [54]:
bc = bc + 1
bc.value

TypeError: unsupported operand type(s) for +: 'Broadcast' and 'int'

In [59]:
bc.value + 1

TypeError: can only concatenate list (not "int") to list

In [38]:
bc.unpersist() # unpersist method removes from executor node but still stays on driver node
bc.value

[1, 2, 3]

### Example 12 

Data
- bike_share/status.csv includes station_id, num_bikes_available, num_docks_available and timestamp.
- bike_share/stations.csv includes station_id, name, lat, lon, total_num_dock, city, station_installed_date.


Write code that returns the timestamp, name, num_bikes_available and num_docks_available of a given station name.

In [61]:
%%time
input_file1 = "../hw1/input_1/status_small.csv"
input_file2 = "../hw1/input_1/stations.csv"
station_name = "San Jose City Hall"

# # Stations
# raw2 = sc.textFile(input_file2, 10)
# stations = raw2.map(lambda x: x.split(",")).map(lambda x: (x[1], x[0]))
# stationId_input = stations.lookup(station_name)

# if len(stationId_input) == 1:
#     stationId_input = stationId_input[0]
# else:
#     stationId_input = ""

# # Status
# raw1 = sc.textFile(input_file1, 10)
# status = raw1.map(lambda x: x.split(",")).map(lambda x: (x[0], ([x[3], x[1], x[2]])))
# status = status.partitionBy(70).persist()
# station_details = status.lookup(stationId_input)

# for e in sorted(station_details):
#     print ",".join(e)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 16.2 µs


## Extra Examples

### Example 11

In [62]:
acc = sc.accumulator(0)
list = sc.parallelize(range(1,5))
list.foreach(lambda x: acc.add(1))
acc.value

4

In [64]:
# Reading acc.value inside a transformation is expected to fail; report it instead of crashing.
try:
    list.map(lambda x: x + acc.value).collect()
except Exception:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
    print('Exception: Accumulator.value cannot be accessed inside tasks')

Exception: Accumulator.value cannot be accessed inside tasks


### Example 12

In [67]:
# The RDD is no longer bound to the name `list`, which shadowed the builtin.
small_nums = sc.parallelize(range(1, 5))
print(small_nums.collect())
# Broadcast a small read-only list and add its first element to every number.
bc = sc.broadcast([1, 2, 3])
print(small_nums.map(lambda x: x + bc.value[0]).collect())

[1, 2, 3, 4]
[2, 3, 4, 5]


### Example 13 

In [76]:
file_name = './data/USF_Mission.txt'
lines = sc.textFile(file_name)
word = lines.flatMap(lambda l: l.split())
word_map = word.map(lambda w: (w,1))
word_map.collect()
word_count = word_map.reduceByKey(lambda a,b: a+b) # shuffle
finalRdd = word_count.collect()

In [77]:
# Lineage of the word count; the ShuffledRDD entry marks the reduceByKey shuffle.
print(word_count.toDebugString())

(2) PythonRDD[91] at collect at <ipython-input-76-155699745f6c>:7 []
 |  MapPartitionsRDD[90] at mapPartitions at PythonRDD.scala:427 []
 |  ShuffledRDD[89] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(2) PairwiseRDD[88] at reduceByKey at <ipython-input-76-155699745f6c>:6 []
    |  PythonRDD[87] at reduceByKey at <ipython-input-76-155699745f6c>:6 []
    |  ./data/USF_Mission.txt MapPartitionsRDD[85] at textFile at NativeMethodAccessorImpl.java:0 []
    |  ./data/USF_Mission.txt HadoopRDD[84] at textFile at NativeMethodAccessorImpl.java:0 []
