Scheduler.scala
package com.mesosphere.usi.core
import akka.{Done, NotUsed}
import akka.stream.scaladsl.{BidiFlow, Broadcast, Flow, GraphDSL, Sink, SinkQueueWithCancel, Source}
import akka.stream.{BidiShape, FlowShape, KillSwitches, Materializer, OverflowStrategy, QueueOfferResult}
import com.mesosphere.mesos.client.{MesosCalls, MesosClient}
import com.mesosphere.usi.core.conf.SchedulerSettings
import com.mesosphere.usi.core.models.{
PodId,
PodRecord,
PodRecordUpdated,
SpecEvent,
SpecUpdated,
SpecsSnapshot,
StateEvent,
StateSnapshot
}
import com.mesosphere.usi.repository.PodRecordRepository
import com.typesafe.config.ConfigFactory
import org.apache.mesos.v1.Protos.FrameworkInfo
import org.apache.mesos.v1.scheduler.Protos.{Call => MesosCall, Event => MesosEvent}
import scala.collection.JavaConverters._
import scala.concurrent.Await
import scala.concurrent.duration._
import scala.concurrent.{Future, Promise}
import scala.util.{Failure, Success}
/**
* Provides the scheduler graph component. The component has two inputs and two outputs:
*
* Input:
* 1) SpecInput - Used to replicate the specification state from the framework implementation to the USI SchedulerLogic;
* First, a spec snapshot, followed by spec updates.
* 2) MesosEvents - Events from Mesos; offers, task status updates, etc.
*
* Output:
* 1) StateEvents - Used to replicate the state of pods, agents, and reservations to the framework;
* First, a scheduler state snapshot, followed by state updates.
* 2) MesosCalls - Actions, such as revive, kill, accept offer, etc., used to realize the specification.
*
* Fully wired, the graph looks like this at a high-level view:
* {{{
*
* *** SCHEDULER ***
* +------------------------------------------------------------------------+
* | |
* | +------------------+ +-------------+ StateOutput |
* SpecInput | | | | | /------------------>----> (framework)
* (framework) >------>--> | Scheduler | | / |
* | | | Events | | / |
* | | SchedulerLogic o-----------> Persistence o--+ |
* | | | | | \ |
* MesosEvents | | | | | \ MesosCalls |
* /------------>--> | | | \------------------>
* / | | | | | |\
* / | +------------------+ +-------------+ | \
* / | | \
* | +------------------------------------------------------------------------+ |
* \ |
* \ +----------------------+ /
* \ | | /
* \-----------------------------< Mesos <---------------------------------
* | |
* +----------------------+
* }}}
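*
* A hedged wiring sketch (the values `client`, `repository`, `specsSnapshot` and `specUpdates` are
* placeholders assumed to be provided by the framework implementation):
*
* {{{
* // Build the scheduler flow from a connected, MULTI_ROLE-capable MesosClient.
* val schedulerFlow: Flow[Scheduler.SpecInput, Scheduler.StateOutput, NotUsed] =
*   Scheduler.fromClient(client, repository)
*
* Source.single(specsSnapshot -> specUpdates) // SpecInput: the snapshot plus a Source[SpecUpdated, Any]
*   .via(schedulerFlow)
*   .flatMapConcat { case (_, stateEvents) => stateEvents }
*   .runWith(Sink.foreach(println)) // consume StateEvents; requires an implicit Materializer
* }}}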
*/
object Scheduler {
type SpecInput = (SpecsSnapshot, Source[SpecUpdated, Any])
type StateOutput = (StateSnapshot, Source[StateEvent, Any])
private val schedulerSettings = SchedulerSettings.fromConfig(ConfigFactory.load().getConfig("scheduler"))
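/**
* Represents the scheduler as a single Flow from SpecUpdated to StateEvent.
*
* The initial specs snapshot is passed explicitly. The returned Future completes once the scheduler has
* materialized, yielding the initial state snapshot together with a Flow that couples the spec update
* Sink and the state event Source (terminating one side terminates the other).
*/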
def asFlow(specsSnapshot: SpecsSnapshot, client: MesosClient, podRecordRepository: PodRecordRepository)(
implicit materializer: Materializer): Future[(StateSnapshot, Flow[SpecUpdated, StateEvent, NotUsed])] = {
implicit val ec = scala.concurrent.ExecutionContext.Implicits.global //only for ultra-fast non-blocking map
val (snap, source, sink) = asSourceAndSink(specsSnapshot, client, podRecordRepository)
snap.map { snapshot =>
(snapshot, Flow.fromSinkAndSourceCoupled(sink, source))
}
}
/**
* Represents the scheduler as a Sink and Source.
*
* This method materializes the scheduler first; the returned Sink and Source can then be materialized independently, but each only once.
*
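* A usage sketch (`specsSnapshot`, `client`, `podRecordRepository`, `frameworkSpecUpdates` and
* `handleStateEvent` are placeholders assumed to be provided by the framework implementation):
*
* {{{
* val (stateSnapshot, stateEvents, specUpdatesSink) =
*   Scheduler.asSourceAndSink(specsSnapshot, client, podRecordRepository)
*
* frameworkSpecUpdates.runWith(specUpdatesSink)        // materialize the Sink exactly once
* stateEvents.runWith(Sink.foreach(handleStateEvent))  // materialize the Source exactly once
* }}}
*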
* @param specsSnapshot Snapshot of the current specs
* @return A Future of the initial state snapshot, a Source which produces StateEvents, and a Sink which accepts SpecUpdated events
*/
def asSourceAndSink(specsSnapshot: SpecsSnapshot, client: MesosClient, podRecordRepository: PodRecordRepository)(
implicit mat: Materializer): (Future[StateSnapshot], Source[StateEvent, NotUsed], Sink[SpecUpdated, NotUsed]) = {
val flow = fromClient(client, podRecordRepository)
asSourceAndSink(specsSnapshot, flow)(mat)
}
def asSourceAndSink(specsSnapshot: SpecsSnapshot, schedulerFlow: Flow[SpecInput, StateOutput, NotUsed])(
implicit mat: Materializer): (Future[StateSnapshot], Source[StateEvent, NotUsed], Sink[SpecUpdated, NotUsed]) = {
implicit val ec = scala.concurrent.ExecutionContext.Implicits.global //only for ultra-fast non-blocking onComplete
val (stateQueue, stateSource) = Source.queue[StateEvent](1, OverflowStrategy.backpressure).preMaterialize()
val (specQueue, specSink) = Sink.queue[SpecUpdated]().preMaterialize()
val stateSnapshotPromise = Promise[StateSnapshot]()
val killSwitch = KillSwitches.shared("SchedulerAdapter.asSourceAndSink")
// We need to handle the case where the state event source is cancelled or fails
stateQueue.watchCompletion().onComplete {
case Success(_) =>
killSwitch.shutdown()
case Failure(cause) =>
killSwitch.abort(cause)
}
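// Adapts a SinkQueueWithCancel into a Source; closing the resulting source shuts down the kill switch
// and cancels the underlying queue.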
def sourceFromSinkQueue[T](queue: SinkQueueWithCancel[T]): Source[T, NotUsed] = {
Source
.unfoldResourceAsync[T, SinkQueueWithCancel[T]](
create = () => Future.successful(queue),
read = queue => queue.pull(),
close = queue =>
Future.successful {
killSwitch.shutdown()
queue.cancel()
Done
})
}
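// Run the scheduler graph: emit the specs snapshot (plus the flattened spec updates) as the single
// SpecInput element, keep the stream open via Source.maybe, and push all resulting state events into
// the state queue. Any completion or failure along the way triggers the shared kill switch.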
Source.maybe.prepend {
val events = sourceFromSinkQueue(specQueue)
.watchTermination() {
case (_, completionSignal) =>
completionSignal.onComplete {
case Success(_) =>
killSwitch.shutdown()
case Failure(cause) =>
killSwitch.abort(cause)
}
}
Source.single(specsSnapshot -> events)
}.via(schedulerFlow)
.flatMapConcat {
case (snapshot, updates) =>
stateSnapshotPromise.trySuccess(snapshot)
updates.watchTermination() {
case (_, cancellationSignal) =>
cancellationSignal.onComplete {
case Success(_) =>
killSwitch.shutdown()
case Failure(cause) =>
killSwitch.abort(cause)
}
}
}
.mapAsync(1)(stateQueue.offer)
.map {
case QueueOfferResult.Enqueued =>
case QueueOfferResult.QueueClosed =>
killSwitch.shutdown()
case QueueOfferResult.Failure(cause) =>
killSwitch.abort(cause)
case QueueOfferResult.Dropped => // should never happen because the queue uses OverflowStrategy.backpressure
throw new RuntimeException("Unexpected QueueOfferResult.Dropped element")
}
.runWith(Sink.ignore)
val sourceWithKillSwitch = stateSource
.watchTermination() {
case (materializedValue, cancellationSignal) =>
cancellationSignal.onComplete {
case Success(_) =>
killSwitch.shutdown()
case Failure(cause) =>
killSwitch.abort(cause)
}
materializedValue
}
.via(killSwitch.flow)
val sinkWithKillSwitch = Flow[SpecUpdated]
.watchTermination() {
case (materializedValue, cancellationSignal) =>
cancellationSignal.onComplete {
case Success(_) =>
killSwitch.shutdown()
case Failure(cause) =>
killSwitch.abort(cause)
}
materializedValue
}
.via(killSwitch.flow)
.to(specSink)
(stateSnapshotPromise.future, sourceWithKillSwitch, sinkWithKillSwitch)
}
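/**
* Builds the scheduler Flow from a connected MesosClient.
*
* The client's FrameworkInfo must declare the MULTI_ROLE capability; otherwise an IllegalArgumentException
* is thrown.
*/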
def fromClient(
client: MesosClient,
podRecordRepository: PodRecordRepository): Flow[SpecInput, StateOutput, NotUsed] = {
if (!isMultiRoleFramework(client.frameworkInfo)) {
throw new IllegalArgumentException(
"USI scheduler provides support for MULTI_ROLE frameworks only. " +
"Please provide a MesosClient with FrameworkInfo that has capability MULTI_ROLE")
}
fromFlow(client.calls, podRecordRepository, Flow.fromSinkAndSource(client.mesosSink, client.mesosSource))
}
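/**
* Builds the scheduler Flow by joining the unconnected scheduler graph with the given Mesos flow
* (Mesos calls flowing out, Mesos events flowing in).
*/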
def fromFlow(
mesosCallFactory: MesosCalls,
podRecordRepository: PodRecordRepository,
mesosFlow: Flow[MesosCall, MesosEvent, Any]): Flow[SpecInput, StateOutput, NotUsed] = {
Flow.fromGraph {
GraphDSL.create(unconnectedGraph(mesosCallFactory, podRecordRepository), mesosFlow)((_, _) => NotUsed) {
implicit builder =>
{ (graph, mesos) =>
import GraphDSL.Implicits._
mesos ~> graph.in2
graph.out2 ~> mesos
FlowShape(graph.in1, graph.out1)
}
}
}
}
/**
* We express the interface as a snapshot followed by a series of updates in order to guide consumers of USI down a
* proper implementation path: a SpecsSnapshot should be the very first thing that the USI Scheduler receives from the
* Framework implementation.
*
* However, for convenience in working with streams, internally we deal with a single stream of SpecEvents.
*/
private val specInputFlatteningFlow: Flow[SpecInput, SpecEvent, NotUsed] = Flow[SpecInput].flatMapConcat {
case (snapshot, rest) =>
rest.prepend(Source.single(snapshot))
}
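/**
* Splits the internal stream of StateEvents back into the (StateSnapshot, updates) shape expected by the
* framework: the first event must be the snapshot, and no further snapshots may follow.
*/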
private val stateOutputBreakoutFlow: Flow[StateEvent, StateOutput, NotUsed] = Flow[StateEvent].prefixAndTail(1).map {
case (Seq(snapshot), stateEvents) =>
val stateSnapshot = snapshot match {
case x: StateSnapshot => x
case _ => throw new IllegalStateException("The first event must be a StateSnapshot")
}
val stateUpdates = stateEvents.map {
case _: StateSnapshot =>
throw new IllegalStateException("Only the first event is allowed to be a state snapshot")
case event => event
}
(stateSnapshot, stateUpdates)
}
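/**
* Builds the scheduler BidiFlow: the spec input is flattened into a single SpecEvent stream and fed into
* the SchedulerLogic graph; the resulting SchedulerEvents are persisted and then broadcast into Mesos
* calls and framework-facing state events.
*/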
private[core] def unconnectedGraph(
mesosCallFactory: MesosCalls,
podRecordRepository: PodRecordRepository): BidiFlow[SpecInput, StateOutput, MesosEvent, MesosCall, NotUsed] = {
val schedulerLogicGraph = new SchedulerLogicGraph(mesosCallFactory, loadPodRecords(podRecordRepository))
BidiFlow.fromGraph {
GraphDSL.create(schedulerLogicGraph) { implicit builder => (schedulerLogic) =>
{
import GraphDSL.Implicits._
val broadcast = builder.add(Broadcast[SchedulerEvents](2, eagerCancel = true))
val specInputFlattening = builder.add(specInputFlatteningFlow)
val stateOutputBreakout = builder.add(stateOutputBreakoutFlow)
val persistenceStorageFlow = builder.add(persistenceFlow(podRecordRepository))
specInputFlattening ~> schedulerLogic.in0
schedulerLogic.out ~> persistenceStorageFlow ~> broadcast.in
val mesosCalls = broadcast.out(0).mapConcat { frameResult =>
frameResult.mesosCalls
}
val stateEvents = broadcast.out(1).mapConcat { frameResult =>
frameResult.stateEvents
}
stateEvents ~> stateOutputBreakout
BidiShape.apply(specInputFlattening.in, stateOutputBreakout.outlet, schedulerLogic.in1, mesosCalls.outlet)
}
}
}
}
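/**
* Persists the pod record changes (stores and deletions) contained in each SchedulerEvents frame before
* the frame itself is emitted downstream. Persistence operations run with parallelism bounded by
* schedulerSettings.persistencePipelineLimit.
*/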
private[core] def persistenceFlow(
podRecordRepository: PodRecordRepository): Flow[SchedulerEvents, SchedulerEvents, NotUsed] = {
Flow[SchedulerEvents]
.mapConcat(persistEvents(_, podRecordRepository))
.mapAsync(schedulerSettings.persistencePipelineLimit)(call => call())
.collect { case Some(events) => events }
}
private def persistEvents(
events: SchedulerEvents,
podRecordRepository: PodRecordRepository): List[() => Future[Option[SchedulerEvents]]] = {
val ops: List[() => Future[Option[SchedulerEvents]]] = events.stateEvents.collect {
case PodRecordUpdated(_, Some(podRecord)) =>
() =>
podRecordRepository.store(podRecord).map(_ => None)(CallerThreadExecutionContext.context)
case PodRecordUpdated(podId, None) =>
() =>
podRecordRepository.delete(podId).map(_ => None)(CallerThreadExecutionContext.context)
}
ops :+ (() => Future.successful(Some(events)))
}
/*
* We don't start processing any commands until we've finished loading the entire set of podRecords.
* This code delays building the scheduler stage until the podRecord snapshot is available.
*
* We block for IO deliberately: if the IO call fails or times out, the scheduler should not make any progress.
*/
private def loadPodRecords(podRecordRepository: PodRecordRepository): Map[PodId, PodRecord] = {
// TODO: Add error handling (and maybe a retry mechanism).
Await.result(podRecordRepository.readAll(), schedulerSettings.persistenceLoadTimeout.seconds)
}
private def isMultiRoleFramework(frameworkInfo: FrameworkInfo): Boolean =
frameworkInfo.getCapabilitiesList.asScala.exists(_.getType == FrameworkInfo.Capability.Type.MULTI_ROLE)
}