# Spark Stream, DStreams

## Transformations

### Map

In [48]:
from pyspark.streaming import StreamingContext
from random import choice

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

alphabets = list('abcdefghijklmnopqrstuvwxyz')
input_data = [[choice(alphabets) for _ in range(100)] for _ in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).map(lambda word: (word, 1))
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:34:34
-------------------------------------------
('v', 1)
('l', 1)
('s', 1)
('l', 1)
('p', 1)
('v', 1)
('f', 1)
('a', 1)
('h', 1)
('x', 1)
...

-------------------------------------------
Time: 2019-10-31 09:34:35
-------------------------------------------
('s', 1)
('z', 1)
('h', 1)
('c', 1)
('e', 1)
('u', 1)
('v', 1)
('c', 1)
('h', 1)
('b', 1)
...



### Flat map

In [47]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[i for i in range(100)] for _ in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).flatMap(lambda num: [(num, randint(1, 10)) for _ in range(num)])
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:34:28
-------------------------------------------
(1, 6)
(2, 10)
(2, 6)
(3, 8)
(3, 3)
(3, 1)
(4, 4)
(4, 8)
(4, 7)
(4, 9)
...

-------------------------------------------
Time: 2019-10-31 09:34:29
-------------------------------------------
(1, 6)
(2, 10)
(2, 6)
(3, 8)
(3, 3)
(3, 1)
(4, 4)
(4, 8)
(4, 7)
(4, 9)
...



### Filter

In [46]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[randint(1, 100) for i in range(100)] for _ in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).filter(lambda num: num % 2 == 0)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:34:19
-------------------------------------------
54
22
88
66
78
90
22
40
2
52
...

-------------------------------------------
Time: 2019-10-31 09:34:20
-------------------------------------------
52
82
16
16
38
46
12
6
74
46
...



### Repartition

In [45]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[randint(1, 100) for i in range(100)] for _ in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).repartition(1)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:34:10
-------------------------------------------
44
91
86
98
51
47
60
74
3
96
...

-------------------------------------------
Time: 2019-10-31 09:34:11
-------------------------------------------
16
86
82
93
18
88
27
43
89
82
...



### Union

In [44]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data1 = [[randint(1, 100) for i in range(100)] for _ in range(100)]
input_data2 = [[randint(1, 100) for i in range(100)] for _ in range(100)]

rdd_queue1 = [ssc.sparkContext.parallelize(item) for item in input_data1]
rdd_queue2 = [ssc.sparkContext.parallelize(item) for item in input_data2]

stream1 = ssc.queueStream(rdd_queue1).filter(lambda num: num % 2 == 0)
stream2 = ssc.queueStream(rdd_queue2).filter(lambda num: num % 2 == 0)

stream = stream1.union(stream2)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:33:58
-------------------------------------------
34
60
70
18
56
72
82
62
2
44
...

-------------------------------------------
Time: 2019-10-31 09:33:59
-------------------------------------------
68
78
98
50
32
48
86
20
64
50
...



### Count

In [24]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[randint(1, 100) for _ in range(randint(1, 20))] for i in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).count()
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:16:04
-------------------------------------------
11

-------------------------------------------
Time: 2019-10-31 09:16:05
-------------------------------------------
13



### Reduce

In [25]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[randint(1, 100) for _ in range(randint(1, 20))] for i in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).reduce(lambda a, b: a + b)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:16:51
-------------------------------------------
671

-------------------------------------------
Time: 2019-10-31 09:16:52
-------------------------------------------
421



### Count by value

In [26]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[randint(1, 100) for _ in range(randint(1, 20))] for i in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).countByValue()
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:17:48
-------------------------------------------
(16, 1)
(100, 1)
(91, 1)
(34, 1)
(11, 1)

-------------------------------------------
Time: 2019-10-31 09:17:49
-------------------------------------------
(51, 1)
(16, 2)
(52, 1)
(17, 1)
(54, 1)
(30, 1)
(79, 1)
(44, 1)
(59, 1)
(95, 1)



### Reduce by key

In [28]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[(randint(1, 2), randint(1, 100)) for _ in range(randint(1, 20))] for i in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).reduceByKey(lambda a, b: a + b)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:19:58
-------------------------------------------
(1, 301)
(2, 260)

-------------------------------------------
Time: 2019-10-31 09:19:59
-------------------------------------------
(1, 147)
(2, 379)



### Join

In [43]:
from pyspark.streaming import StreamingContext
from random import randint, choice

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data1 = [[(choice([0, 1]), randint(1, 2)) for _ in range(5)] for _ in range(100)]
input_data2 = [[(choice([0, 1]), choice(['a', 'b'])) for _ in range(5)] for _ in range(100)]

rdd_queue1 = [ssc.sparkContext.parallelize(item) for item in input_data1]
rdd_queue2 = [ssc.sparkContext.parallelize(item) for item in input_data2]

counts1 = ssc.queueStream(rdd_queue1)
counts2 = ssc.queueStream(rdd_queue2)

stream = counts1.join(counts2)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:32:47
-------------------------------------------
(0, (1, 'b'))
(0, (1, 'a'))
(0, (1, 'b'))
(0, (2, 'b'))
(0, (2, 'a'))
(0, (2, 'b'))
(1, (1, 'b'))
(1, (1, 'a'))
(1, (2, 'b'))
(1, (2, 'a'))
...

-------------------------------------------
Time: 2019-10-31 09:32:48
-------------------------------------------
(0, (1, 'a'))
(0, (1, 'b'))
(0, (1, 'a'))
(0, (2, 'a'))
(0, (2, 'b'))
(0, (2, 'a'))
(0, (2, 'a'))
(0, (2, 'b'))
(0, (2, 'a'))
(1, (2, 'a'))
...



### Cogroup

In [42]:
from pyspark.streaming import StreamingContext
from random import randint, choice

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data1 = [[(choice([0, 1]), randint(1, 2)) for _ in range(5)] for _ in range(100)]
input_data2 = [[(choice([0, 1]), choice(['a', 'b'])) for _ in range(5)] for _ in range(100)]

rdd_queue1 = [ssc.sparkContext.parallelize(item) for item in input_data1]
rdd_queue2 = [ssc.sparkContext.parallelize(item) for item in input_data2]

counts1 = ssc.queueStream(rdd_queue1)
counts2 = ssc.queueStream(rdd_queue2)

stream = counts1.cogroup(counts2)
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:32:32
-------------------------------------------
(0, (<pyspark.resultiterable.ResultIterable object at 0x7f1a30410ed0>, <pyspark.resultiterable.ResultIterable object at 0x7f1a30412050>))
(1, (<pyspark.resultiterable.ResultIterable object at 0x7f1a30410d90>, <pyspark.resultiterable.ResultIterable object at 0x7f1a30410250>))

-------------------------------------------
Time: 2019-10-31 09:32:33
-------------------------------------------
(0, (<pyspark.resultiterable.ResultIterable object at 0x7f1a3040f650>, <pyspark.resultiterable.ResultIterable object at 0x7f1a3040f6d0>))
(1, (<pyspark.resultiterable.ResultIterable object at 0x7f1a3040fd10>, <pyspark.resultiterable.ResultIterable object at 0x7f1a303fe8d0>))



In [41]:
from pyspark.streaming import StreamingContext
from random import randint

ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp')

input_data = [[i for i in range(100)] for _ in range(100)]
rdd_queue = [ssc.sparkContext.parallelize(item) for item in input_data]

stream = ssc.queueStream(rdd_queue).transform(lambda rdd: rdd.filter(lambda x: x % 2 == 0))
stream.pprint()

ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2019-10-31 09:32:22
-------------------------------------------
0
2
4
6
8
10
12
14
16
18
...

-------------------------------------------
Time: 2019-10-31 09:32:23
-------------------------------------------
0
2
4
6
8
10
12
14
16
18
...

