/*
* Copyright 2014 Pascal Voitot (@mandubian)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** I use the spark package to be able to hack all those private Spark classes. */
package org.apache.spark.streaming

import java.nio.ByteBuffer
import java.util.concurrent.{ArrayBlockingQueue, TimeUnit}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.concurrent.duration._
import scala.reflect.ClassTag

import akka.actor.{Actor, Props}

import org.apache.spark.{Logging, SparkEnv}
import org.apache.spark.rdd.{BlockRDD, RDD}
import org.apache.spark.storage.{BlockId, StorageLevel, StreamBlockId}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.util.{RecurringTimer, SystemClock}

/**
 * ... Just a hack of NetworkInputDStream ...
 * Abstract class for defining any [[org.apache.spark.streaming.dstream.InputDStream]]
 * that has to start a receiver on the local node to receive external data.
 * Specific implementations of LocalInputDStream must define the `receiver`
 * method, which returns the receiver object of type
 * [[org.apache.spark.streaming.LocalReceiver]] that will be used to receive data.
 * @param ssc_ Streaming context that will execute this input stream
 * @tparam T Class type of the objects of this stream
 */
abstract class LocalInputDStream[T: ClassTag](@transient ssc_ : StreamingContext)
  extends InputDStream[T](ssc_) with Logging {

  // This is a unique identifier that is used to match the local receiver with the
  // corresponding local input stream.
  val id = ssc_.getNewNetworkStreamId()

  /**
   * Gets the receiver object that will be run on the local node to receive data.
   * This method needs to be defined by any specific implementation of a
   * LocalInputDStream. Note that it is evaluated several times (by start(), stop()
   * and compute()), so it must always return the same instance, e.g. a lazy val.
   */
  def receiver: LocalReceiver[T]

  // Unlike the network version, there is no tracker here: starting and stopping
  // simply delegate to the local receiver.
  override def start() {
    receiver.setStreamId(id)
    receiver.start()
  }

  override def stop() {
    receiver.stop()
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    // If this is called for any time before the start time of the context,
    // then this returns an empty RDD. This may happen when recovering from a
    // master failure.
    if (validTime >= ssc_.graph.startTime) {
      val blockIds = receiver.getBlockIds(validTime)
      logInfo("Getting blocks for time " + validTime + " (graph start time " +
        ssc_.graph.startTime + "), blockIds: " + blockIds.toSeq)
      Some(new BlockRDD[T](ssc_.sc, blockIds))
    } else {
      logInfo("Time " + validTime + " is before graph start time " +
        ssc_.graph.startTime + ", returning empty RDD")
      Some(new BlockRDD[T](ssc_.sc, Array[BlockId]()))
    }
  }
}
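
/**
 * A minimal sketch, not part of the original file, of what a concrete
 * implementation could look like: the QueueLocalInputDStream / QueueLocalReceiver
 * names and the shared in-memory queue are illustrative assumptions, not an API
 * this project defines. The receiver drains the queue into a BlockGenerator,
 * which batches elements into blocks for the block manager. Since the base
 * class evaluates `receiver` several times (in start(), stop() and compute()),
 * it is overridden here as a lazy val so the same instance is always returned.
 */
class QueueLocalInputDStream[T: ClassTag](
    @transient ssc_ : StreamingContext,
    queue: java.util.concurrent.BlockingQueue[T],
    storageLevel: StorageLevel)
  extends LocalInputDStream[T](ssc_) {

  override lazy val receiver: LocalReceiver[T] = new QueueLocalReceiver[T](queue, storageLevel)
}

class QueueLocalReceiver[T: ClassTag](
    queue: java.util.concurrent.BlockingQueue[T],
    storageLevel: StorageLevel)
  extends LocalReceiver[T] {

  lazy val blockGenerator = new BlockGenerator(storageLevel)
  @volatile private var running = true

  // Start the block generator and a thread that drains the queue into it.
  override protected def onStart() {
    blockGenerator.start()
    new Thread("QueueLocalReceiver") {
      override def run() {
        try {
          while (running) {
            blockGenerator += queue.take()
          }
        } catch {
          case ie: InterruptedException => logInfo("Queue draining thread interrupted")
        }
      }
    }.start()
  }

  override protected def onStop() {
    running = false
    blockGenerator.stop()
  }
}
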
sealed trait LocalReceiverMessage
case class ReportBlock(blockId: BlockId, metadata: Any) extends LocalReceiverMessage

/**
 * Abstract class of a receiver that runs on the local node to receive external data. See
 * [[org.apache.spark.streaming.LocalInputDStream]] for an explanation.
 */
abstract class LocalReceiver[T: ClassTag]() extends Logging {

  protected lazy val env = SparkEnv.get

  protected lazy val actor = env.actorSystem.actorOf(
    Props(new LocalReceiverActor()), "LocalReceiver-" + streamId)

  protected var streamId: Int = -1

  protected implicit val timeout = new akka.util.Timeout(5 seconds)

  protected val receivedBlockIds = Queue[BlockId]()

  /**
   * This method will be called to start receiving data. All your receiver
   * starting code should be implemented by defining this function.
   */
  protected def onStart()

  /** This method will be called to stop receiving data. */
  protected def onStop()

  /**
   * Starts the receiver. First it accesses all the lazy members to
   * materialize them. Then it calls the user-defined onStart() method to start
   * other threads, etc. required to receive the data.
   */
  def start() {
    try {
      // Access the lazy vals to materialize them
      logInfo("Starting local receiver")
      env
      actor
      logInfo("Started receiver actor")
      // Call user-defined onStart()
      onStart()
    } catch {
      case e: Exception =>
        stopOnError(e)
    }
  }

  /**
   * Stops the receiver. Since everything runs on the local node, no thread or
   * actor is killed; it just calls the user-defined onStop() method to stop
   * other threads and/or do cleanup.
   */
  def stop() {
    // DO NOT KILL THREAD/ACTOR, WE ARE LOCAL
    logInfo("Stopping local receiver")
    onStop()
  }

  /**
   * Stops the receiver and logs the exception.
   * This should be called whenever an exception is to be handled on any thread
   * of the receiver.
   */
  def stopOnError(e: Exception) {
    logError("Error receiving data", e)
    stop()
  }

  /** Returns and dequeues all the block ids received so far. */
  def getBlockIds(time: Time): Array[BlockId] = {
    logInfo("Getting blocks")
    val result = receivedBlockIds.synchronized {
      receivedBlockIds.dequeueAll(x => true)
    }
    logInfo("Stream " + LocalReceiver.this.streamId + " received " + result.size + " blocks")
    result.toArray
  }

  /**
   * Pushes a block (as an ArrayBuffer filled with data) into the block manager.
   */
  def pushBlock(blockId: BlockId, arrayBuffer: ArrayBuffer[T], metadata: Any, level: StorageLevel) {
    env.blockManager.put(blockId, arrayBuffer.asInstanceOf[ArrayBuffer[Any]], level)
    actor ! ReportBlock(blockId, metadata)
  }

  /**
   * Pushes a block (as bytes) into the block manager.
   */
  def pushBlock(blockId: BlockId, bytes: ByteBuffer, metadata: Any, level: StorageLevel) {
    env.blockManager.putBytes(blockId, bytes, level)
    actor ! ReportBlock(blockId, metadata)
  }

  /** A helper actor that records the ids of the blocks pushed by this receiver */
  private class LocalReceiverActor extends Actor {
    logInfo("LocalReceiver actor started")

    override def receive = {
      case ReportBlock(blockId, metadata) =>
        receivedBlockIds.synchronized {
          receivedBlockIds += blockId
        }
    }
  }

  protected[streaming] def setStreamId(id: Int) {
    streamId = id
  }

  /**
   * Batches objects fed to a [[org.apache.spark.streaming.LocalReceiver]] and puts them into
   * appropriately named blocks at regular intervals. This class starts two threads,
   * one to periodically start a new batch and turn the previous batch into a block,
   * the other to push the blocks into the block manager.
   */
  class BlockGenerator(storageLevel: StorageLevel)
    extends Serializable with Logging {

    case class Block(id: BlockId, buffer: ArrayBuffer[T], metadata: Any = null)

    val clock = new SystemClock()
    val blockInterval = env.conf.getLong("spark.streaming.blockInterval", 200)
    val blockIntervalTimer = new RecurringTimer(clock, blockInterval, updateCurrentBuffer)
    val blockStorageLevel = storageLevel
    val blocksForPushing = new ArrayBlockingQueue[Block](1000)
    val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } }

    var currentBuffer = new ArrayBuffer[T]
    // Written by stop() and read by the pushing thread, so it must be volatile.
    @volatile var goon = true

    def start() {
      blockIntervalTimer.start()
      blockPushingThread.start()
      logInfo("Data handler started")
    }

    def stop() {
      blockIntervalTimer.stop()
      // Flush whatever is left in the current buffer as a final block, then let
      // the pushing thread drain the queue and exit.
      updateCurrentBuffer(System.currentTimeMillis())
      goon = false
      logInfo("Data handler stopped")
    }

    def += (obj: T): Unit = synchronized {
      currentBuffer += obj
    }

    private def updateCurrentBuffer(time: Long): Unit = synchronized {
      try {
        // Swap in a fresh buffer and turn the old one into a block named after
        // the interval it covers.
        val newBlockBuffer = currentBuffer
        currentBuffer = new ArrayBuffer[T]
        if (newBlockBuffer.size > 0) {
          val blockId = StreamBlockId(LocalReceiver.this.streamId, time - blockInterval)
          val newBlock = new Block(blockId, newBlockBuffer)
          blocksForPushing.add(newBlock)
        }
      } catch {
        case ie: InterruptedException =>
          logInfo("Block interval timer thread interrupted")
        case e: Exception =>
          logError("Error in block interval timer thread", e)
          LocalReceiver.this.stop()
      }
    }

    private def keepPushingBlocks() {
      logInfo("Block pushing thread started")
      try {
        // Keep pushing while running, then drain whatever the final buffer flush
        // in stop() left behind. Polling with a timeout (instead of a blocking
        // take()) lets this thread observe goon == false and exit.
        while (goon || !blocksForPushing.isEmpty) {
          val block = blocksForPushing.poll(100, TimeUnit.MILLISECONDS)
          if (block != null) {
            logInfo("Pushing block: " + block)
            LocalReceiver.this.pushBlock(block.id, block.buffer, block.metadata, storageLevel)
          }
        }
        logInfo("Block pushing thread finished")
      } catch {
        case ie: InterruptedException =>
          logInfo("Block pushing thread interrupted")
        case e: Exception =>
          logError("Error in block pushing thread", e)
          LocalReceiver.this.stop()
      }
    }
  }
}
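
/**
 * A hedged usage sketch, also not part of the original file, wiring the
 * hypothetical QueueLocalInputDStream from above into a StreamingContext. It
 * assumes the Spark 0.9-era API this code appears to target, in particular
 * StreamingContext.registerInputStream for custom input streams. Elements put
 * into the queue are batched by the BlockGenerator's block interval and show
 * up in the per-batch BlockRDDs produced by compute().
 */
object LocalInputDStreamDemo {
  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "local-input-demo", Seconds(1))
    val queue = new java.util.concurrent.LinkedBlockingQueue[String]()

    // Create the input stream and register it with the streaming graph.
    val stream = new QueueLocalInputDStream[String](ssc, queue, StorageLevel.MEMORY_ONLY)
    ssc.registerInputStream(stream)

    stream.foreachRDD { rdd => println("batch of " + rdd.count() + " element(s)") }

    ssc.start()
    // Feed a few elements; each lands in the block whose interval covers it.
    (1 to 10).foreach { i => queue.put("element-" + i); Thread.sleep(200) }
    ssc.stop()
  }
}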