This repository has been archived by the owner on Sep 23, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
em_core_status.py
491 lines (395 loc) · 16.7 KB
/
em_core_status.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
import uuid
from datetime import datetime
from cloudyvents.cyvents import CYvent
import em_core_findworkers
import epumgmt.defaults.epustates as epustates
from epumgmt.api import RunVM
from epumgmt.main import em_args
from epumgmt.api.exceptions import IncompatibleEnvironment, ProgrammingError
def find_latest_status(p, c, m, run_name, cloudinitd, findworkersfirst=True):
    """Finds new workers, EPU controllers, and gathers all the information possible for status

    The information is all stored to the RunVM objects in persistence.

    p -- parameters object
    c -- common object (provides logging)
    m -- modules object (provides persistence, remote_svc_adapter)
    run_name -- name of the run to query
    cloudinitd -- cloudinit.d handle for this run
    findworkersfirst -- when True, poll for newly launched workers before
                        gathering status

    Return "allvms" for convenience, a list of RunVM instances

    Raises IncompatibleEnvironment when no VMs are recorded for the run.
    """
    if findworkersfirst:
        # Best effort: failing to discover new workers should not prevent
        # reporting status on the VMs we already know about.
        # NOTE: "except X, e" is Python-2-only syntax; "as" works on 2.6+ and 3.
        try:
            em_core_findworkers.find_once(p, c, m, run_name)
        except IncompatibleEnvironment as e:
            c.log.error("Problem finding new workers: %s" % str(e))
        except Exception as e:
            c.log.error("Problem finding new workers: %s" % str(e))
    allvms = m.persistence.get_run_vms_or_none(run_name)
    if not allvms or len(allvms) == 0:
        raise IncompatibleEnvironment("Cannot find any VMs associated with run '%s'" % run_name)
    #_find_latest_turtle_status(c, m, run_name, cloudinitd, allvms)
    _find_latest_worker_status(c, m, run_name, cloudinitd, allvms)
    return allvms
def pretty_status(p, c, m, run_name, cloudinitd):
    """Log a human-readable report of the VM instances in the run.

    Whether the status is refreshed first is controlled by the
    STATUS_NOUPDATE program argument.  For an API-oriented entry point,
    use find_latest_status() instead.
    """
    c.log.debug("Obtaining status")
    if p.get_arg_or_none(em_args.STATUS_NOUPDATE):
        # Caller asked for cached data only; skip the refresh.
        c.log.debug("The %s flag is suppressing status update" % em_args.STATUS_NOUPDATE.long_syntax)
        allvms = m.persistence.get_run_vms_or_none(run_name)
    else:
        c.log.info("Getting the latest status information")
        allvms = find_latest_status(p, c, m, run_name, cloudinitd)
    c.log.info("\n%s" % _report(allvms))
# ----------------------------------------------------------------------------------------------------
# IMPL
# ----------------------------------------------------------------------------------------------------
def _find_latest_worker_status(c, m, run_name, cloudinitd, allvms):
    """Update what can be found for any nodes launched via EPU controllers

    c -- common object (provides logging)
    m -- modules object (provides remote_svc_adapter and persistence)
    run_name -- name of the run being queried
    cloudinitd -- cloudinit.d handle for this run
    allvms -- list of every known RunVM for the run; updated in place

    Returns early (with a warning) if no channel to the controllers is open
    or no controllers are configured.

    Raises ProgrammingError if no provisioner VM is present even though the
    controller channel is open.
    """
    c.log.info("Updating worker status")
    m.remote_svc_adapter.initialize(m, run_name, cloudinitd)
    if not m.remote_svc_adapter.is_channel_open():
        c.log.warn("Cannot get worker status: there is no channel open to the EPU controllers")
        return
    controller_map = m.remote_svc_adapter.controller_map(allvms)
    if not len(controller_map):
        c.log.warn("Cannot get worker status: there is a channel to the EPU controllers but no controllers are configured")
        return
    # Flatten the per-instance controller lists into one list of controllers
    controllers = []
    for instanceid in controller_map.keys():
        controllers.extend(controller_map[instanceid])
    provisioner_vm = _get_provisioner_vm(allvms)
    if not provisioner_vm:
        # This is an exception because it should be there especially if is_channel_open() passed above
        raise ProgrammingError("Cannot update worker status without provisioner channel into the system")
    try:
        controller_state_map = m.remote_svc_adapter.worker_state(controllers, provisioner_vm)
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate; any query failure is logged and status is skipped.
        c.log.exception("Unable to get worker state for controllers: %s" % controllers)
        return
    _update_worker_parents(c, m, run_name, controllers, controller_state_map, allvms)
    _update_worker_states(c, m, run_name, controllers, controller_state_map, allvms)
    _update_controller_states(c, m, run_name, controller_map, controller_state_map, allvms)
def _get_running_terminate_timestamps(run_vms):
    """Return a dictionary of tuples of the timestamps at which each worker
    was seen RUNNING and TERMINATED.

    If these values aren't available yet, they will be set to None.

    ex:
    {"i-fsdfdse" : (running_timestamp, terminated_timestamp), ... }
    """
    # Renamed the accumulator from "map" so the builtin is not shadowed.
    timestamps = {}
    for vm in run_vms:
        running, terminated = None, None
        for event in vm.events:
            if event.name == "iaas_state":
                state = event.extra["state"]
                if state == epustates.RUNNING:
                    running = event.timestamp
                elif state == epustates.TERMINATED:
                    # Later events overwrite earlier ones; the last seen wins.
                    terminated = event.timestamp
        timestamps[vm.instanceid] = (running, terminated)
    return timestamps
def _get_provisioner_vm(allvms):
"""returns a vm with a "provisioner" service_type
"""
provisioner_vm = None
for vm in allvms:
if vm.service_type == "provisioner":
provisioner_vm = vm
break
return provisioner_vm
def _update_controller_states(c, m, run_name, controller_map, controller_state_map, allvms):
    """Generate "de_state" and "de_conf_report" cloudyvents.
    """
    trace = False
    for instanceid in controller_map.keys():
        vm = _get_vm_with_instanceid(instanceid, allvms)
        if vm is None:
            raise ProgrammingError("instanceid '%s' is in your controller_map, but not your list of VMs?" % instanceid)
        saw_new_event = False
        for controller in controller_map[instanceid]:
            if controller not in controller_state_map:
                # State query may have failed for this controller; warn and move on.
                warning = "'%s' in list of controllers, but no state available. " % controller
                warning += "Maybe a query failed?"
                c.log.warn(warning)
                continue
            state = controller_state_map[controller]
            if _get_events_from_controller_state(state, vm, controller, trace, c):
                saw_new_event = True
        # Persist once per VM, and only when something actually changed.
        if saw_new_event:
            m.persistence.store_run_vms(run_name, [vm])
def _update_worker_parents(c, m, run_name, controllers, controller_state_map, allvms):
    """Record which EPU controller is the parent of each worker VM.
    """
    for controller in controllers:
        if controller not in controller_state_map:
            # State query may have failed for this controller; warn and move on.
            warning = "'%s' in list of controllers, but no state available. " % controller
            warning += "Maybe a query failed?"
            c.log.warn(warning)
            continue
        state = controller_state_map[controller]
        for wis in state.instances:
            vm = _get_vm_with_nodeid(wis.nodeid, allvms)
            if vm is None:
                # Can't make a RunVM yet for this, unfortunately
                c.log.warn("Controller '%s' knows about worker we have no IaaS id for yet: %s" % (controller, wis.nodeid))
                continue
            if not vm.parent:
                # First sighting: adopt this controller and persist immediately.
                vm.parent = controller
                m.persistence.store_run_vms(run_name, [vm])
            elif vm.parent != controller:
                raise ProgrammingError("Previous RunVM had a different parent "
                    "'%s', new status query indicates parent is '%s'" % (vm.parent, controller))
def _update_worker_states(c, m, run_name, controllers, controller_state_map, allvms):
    """Generate "iaas_state" and "heartbeat_state" cloudyvents.
    """
    trace = False
    for controller in controllers:
        if controller not in controller_state_map:
            # State query may have failed for this controller; warn and move on.
            warning = "'%s' in list of controllers, but no state available. " % controller
            warning += "Maybe a query failed?"
            c.log.warn(warning)
            continue
        state = controller_state_map[controller]
        for wis in state.instances:
            vm = _get_vm_with_nodeid(wis.nodeid, allvms)
            if vm is None:
                # Can't make a RunVM yet for this, unfortunately
                c.log.warn("Controller '%s' knows about worker we have no IaaS id for yet: %s" % (controller, wis.nodeid))
                continue
            # Persist only the VMs that actually picked up new events.
            if _get_events_from_wis(wis, vm, controller, trace, c):
                m.persistence.store_run_vms(run_name, [vm])
def _get_events_from_wis(wis, vm, controller, trace, c):
    """Translate a WorkerInstanceState into cloudyvents on the worker VM.

    Appends an "iaas_state" and/or "heartbeat_state" CYvent for each state
    present on wis.  Returns True when at least one event was added.
    """
    added = False
    candidates = (("iaas_state", wis.iaas_state, wis.iaas_state_time),
                  ("heartbeat_state", wis.heartbeat_state, wis.heartbeat_time))
    for event_name, state_value, state_time in candidates:
        if not state_value:
            continue
        vm.events.append(CYvent(controller,
                                event_name,
                                str(uuid.uuid4()),
                                datetime.fromtimestamp(state_time),
                                extra={"nodeid": wis.nodeid,
                                       "state": state_value,
                                       "instanceid": vm.instanceid}))
        added = True
        if trace:
            c.log.debug("%s for %s: %s (from controller '%s')" % (event_name, vm.instanceid, state_value, controller))
    return added
def _get_events_from_controller_state(state, vm, controller, trace, c):
    """Translate an EPUControllerState into cloudyvents on the controller VM.

    Appends a "de_state" and/or "de_conf_report" CYvent for each value
    present on state.  Returns True when at least one event was added.
    """
    added = False
    candidates = (("de_state", state.de_state),
                  ("de_conf_report", state.de_conf_report))
    for event_name, value in candidates:
        if not value:
            continue
        vm.events.append(CYvent(controller,
                                event_name,
                                str(uuid.uuid4()),
                                datetime.fromtimestamp(state.capture_time),
                                extra={event_name: value}))
        added = True
        if trace:
            c.log.debug("%s for controller %s: %s" % (event_name, controller, value))
    return added
def _get_vm_with_nodeid(nodeid, allvms):
"""Return RunVM object for nodeid or None if not found
"""
for vm in allvms:
if vm.nodeid == nodeid:
return vm
return None
def _get_vm_with_instanceid(instanceid, allvms):
"""Return RunVM object for instanceid or None if not found
"""
for vm in allvms:
if vm.instanceid == instanceid:
return vm
return None
def _filter_out_workers(allvms):
    """Return only the non-worker (base system) VMs from allvms."""
    # Transition state: the WORKER_SUFFIX convention is being obsoleted by a
    # direct "parent_epu_controller" instance variable in RunVM.  A VM counts
    # as a worker if it has a parent controller OR (legacy) the suffix.
    def _is_worker(vm):
        return bool(vm.parent) or vm.service_type.endswith(RunVM.WORKER_SUFFIX)
    return [vm for vm in allvms if not _is_worker(vm)]
def _filter_out_services(allvms):
    """Return only the worker VMs from allvms."""
    # Transition state: the WORKER_SUFFIX convention is being obsoleted by a
    # direct "parent_epu_controller" instance variable in RunVM.  A VM counts
    # as a worker if it has a parent controller OR (legacy) the suffix.
    def _is_worker(vm):
        return bool(vm.parent) or vm.service_type.endswith(RunVM.WORKER_SUFFIX)
    return [vm for vm in allvms if _is_worker(vm)]
def _find_state_from_events(vm):
if not vm:
return None
if not vm.events:
return None
latest = None
for ev in vm.events:
if ev.name == "iaas_state":
if latest:
if latest.timestamp < ev.timestamp:
latest = ev
else:
latest = ev
if not latest:
return None
return latest.extra["state"]
def _get_vm_with_controller(controller, vm_list):
for vm in vm_list:
for ev in vm.events:
if ev.source == controller:
return vm
return None
def _latest_controller_state(vm):
if not vm:
return None
if not vm.events:
return None
latest_destate = None
for ev in vm.events:
if ev.name == "de_state":
if latest_destate:
if latest_destate.timestamp < ev.timestamp:
latest_destate = ev
else:
latest_destate = ev
ret_state = latest_destate
if latest_destate:
if latest_destate.extra.has_key("de_state"):
ret_state = latest_destate.extra["de_state"]
if latest_destate.extra.has_key("state"):
ret_state = latest_destate.extra["state"]
return ret_state
# ----------------------------------------------------------------------------------------------------
# REPORT
# ----------------------------------------------------------------------------------------------------
def _report(allvms):
    """Build the multi-line status report for every VM in the run.

    allvms -- list of RunVM instances (services and workers mixed)

    Returns a string with a "Base System" section listing service VMs
    (type | instanceid | hostname) followed by a "Workers" section grouped
    by parent EPU controller (status | instanceid | hostname | running |
    terminated timestamps).
    """
    txt = "\n------------\nBase System:\n------------\n\n"
    default_typetxt = "(unknown)"
    default_hostname = "(unknown)"
    services = _filter_out_workers(allvms)
    # Pad the service-type column so rows line up.
    widest_service = _widest_type(services)
    if len(default_typetxt) > widest_service:
        widest_service = len(default_typetxt)
    default_typetxt = _pad_txt(default_typetxt, widest_service)
    for vm in services:
        typetxt = default_typetxt
        hostname = default_hostname
        if vm.service_type:
            typetxt = _pad_txt(vm.service_type, widest_service)
        if vm.hostname:
            hostname = vm.hostname
        vm_info = (typetxt, vm.instanceid, hostname)
        txt += "%s | %s | %s\n" % vm_info
    txt += "\n--------\nWorkers:\n--------\n\n"
    workers = _filter_out_services(allvms)
    default_status = "(unknown)"
    default_controller = "(unknown controller)"
    by_controller = {} # key: controller, value: list of vm_info tuples for it
    timestamps = _get_running_terminate_timestamps(workers)
    for vm in workers:
        hostname = default_hostname
        if vm.hostname:
            hostname = vm.hostname
        status = _find_state_from_events(vm)
        if not status:
            status = default_status
        controller = default_controller
        if vm.parent:
            controller = vm.parent
        running_timestamp, terminated_timestamp = timestamps[vm.instanceid]
        # Blank placeholders keep the column layout when a state is unseen.
        if not running_timestamp:
            running_timestamp = " "
        if not terminated_timestamp:
            terminated_timestamp = " "
        vm_info = (status, vm.instanceid, hostname, running_timestamp, terminated_timestamp)
        # setdefault replaces dict.has_key(), which was removed in Python 3.
        by_controller.setdefault(controller, []).append(vm_info)
    # Compute column widths across every worker row before rendering.
    widest_status = len(default_status)
    widest_hostname = 0
    widest_running_timestamp = 0
    for vm_info_list in by_controller.values():
        for vm_info in vm_info_list:
            if len(vm_info[0]) > widest_status:
                widest_status = len(vm_info[0])
            if len(vm_info[2]) > widest_hostname:
                widest_hostname = len(vm_info[2])
            if len(str(vm_info[3])) > widest_running_timestamp:
                widest_running_timestamp = len(str(vm_info[3]))
    for controller in by_controller.keys():
        txt += "%s:\n" % controller
        vm = _get_vm_with_controller(controller, services)
        if vm:
            latest_destate = _latest_controller_state(vm)
            if latest_destate:
                txt += " EPU state: %s" % latest_destate
            else:
                txt += " EPU state: unknown"
        txt += "\n Workers:\n"
        for vm_info in by_controller[controller]:
            status = _pad_txt(vm_info[0], widest_status)
            hostname = _pad_txt(vm_info[2], widest_hostname)
            running = _pad_txt(str(vm_info[3]), widest_running_timestamp)
            txt += " %s | %s | %s | %s | %s\n" % (
                status, vm_info[1], hostname, running, vm_info[4])
        txt += "\n"
    return txt
def _widest_type(run_vms):
widest = 0
for vm in run_vms:
if vm.service_type:
if len(vm.service_type) > widest:
widest = len(vm.service_type)
return widest
def _pad_txt(txt, widest):
if len(txt) >= widest:
return txt
difference = widest - len(txt)
while difference:
txt += " "
difference -= 1
return txt