/
handle_host.py
436 lines (354 loc) · 15.8 KB
/
handle_host.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import socket
import eventlet
from oslo_log import log as oslo_logging
from oslo_utils import timeutils
import masakarimonitors.conf
from masakarimonitors.ha import masakari
import masakarimonitors.hostmonitor.host_handler.driver as driver
from masakarimonitors.hostmonitor.host_handler import hold_host_status
from masakarimonitors.hostmonitor.host_handler import parse_cib_xml
from masakarimonitors.hostmonitor.host_handler import parse_crmmon_xml
from masakarimonitors.objects import event_constants as ec
from masakarimonitors import utils
LOG = oslo_logging.getLogger(__name__)
CONF = masakarimonitors.conf.CONF
class CibSchemaCompliantTag(dict):
    """Create a dict which has the same attributes as a cib node tag.

    Given a crm node tag convert it to a dict with corresponding cib tag
    attributes ('uname' and 'crmd'), so crm_mon output can be consumed by
    code written against cib node_state tags.
    """

    def __init__(self, crmon_entry):
        # crm_mon reports 'online' as the string 'true'/'false'; the cib
        # schema instead uses a 'crmd' attribute of 'online'/'offline'.
        is_online = crmon_entry.get('online') == 'true'
        super(CibSchemaCompliantTag, self).__init__(
            uname=crmon_entry.get('name'),
            crmd='online' if is_online else 'offline')
class HandleHost(driver.DriverBase):
    """Handle hosts.

    This class handles the host status: it polls the pacemaker/corosync
    cluster state and sends a masakari notification whenever a monitored
    host transitions between 'online' and 'offline'.
    """

    def __init__(self):
        super(HandleHost, self).__init__()
        # Own hostname; this monitor never reports on itself
        # (see _check_if_status_changed).
        self.my_hostname = socket.gethostname()
        # Parser for 'cibadmin --query' XML output.
        self.xml_parser = parse_cib_xml.ParseCibXml()
        # Parser for 'crm_mon -X' XML output (remote-node mode).
        self.crmmon_xml_parser = parse_crmmon_xml.ParseCrmMonXml()
        # Keeps the last known status per host to detect transitions.
        self.status_holder = hold_host_status.HostHoldStatus()
        # Client used to deliver notifications to the masakari API.
        self.notifier = masakari.SendNotification()

    def _check_pacemaker_services(self, target_service):
        """Check whether a systemd service is running.

        :param target_service: systemd unit name (e.g. 'corosync',
            'pacemaker', 'pacemaker_remote').
        :returns: True if 'systemctl status' succeeds with empty stderr,
            False on any error (non-zero exit or stderr output).
        """
        try:
            cmd_str = 'systemctl status ' + target_service
            command = cmd_str.split(' ')
            # Execute command.
            out, err = utils.execute(*command, run_as_root=True)
            if err:
                raise Exception
            return True
        except Exception:
            return False

    def _check_hb_line(self):
        """Check whether the corosync communication is normal.

        :returns: 0 if normal, 1 if abnormal, 2 if configuration file is
            wrong or neither pacemaker nor pacemaker-remote is running.
        """
        # Check whether the pacemaker services is normal.
        corosync_status = self._check_pacemaker_services('corosync')
        pacemaker_status = self._check_pacemaker_services('pacemaker')
        pacemaker_remote_status = self._check_pacemaker_services(
            'pacemaker_remote')
        if corosync_status is False or pacemaker_status is False:
            if pacemaker_remote_status is False:
                LOG.error(
                    "Neither pacemaker nor pacemaker-remote is running.")
                return 2
            else:
                # A pacemaker-remote node has no corosync layer, so the
                # multicast heartbeat check below does not apply.
                LOG.info("Works on pacemaker-remote.")
                return 0
        # Check whether the neccesary parameters are set.
        if CONF.host.corosync_multicast_interfaces is None or \
                CONF.host.corosync_multicast_ports is None:
            msg = ("corosync_multicast_interfaces or "
                   "corosync_multicast_ports is not set.")
            LOG.error("%s", msg)
            return 2
        # Check whether the corosync communication is normal.
        # Both options are comma-separated lists and must be pairwise
        # aligned (interface[i] <-> port[i]).
        corosync_multicast_interfaces = \
            CONF.host.corosync_multicast_interfaces.split(',')
        corosync_multicast_ports = \
            CONF.host.corosync_multicast_ports.split(',')
        if len(corosync_multicast_interfaces) != len(corosync_multicast_ports):
            msg = ("Incorrect parameters corosync_multicast_interfaces or "
                   "corosync_multicast_ports.")
            LOG.error("%s", msg)
            return 2
        is_nic_normal = False
        for num in range(0, len(corosync_multicast_interfaces)):
            # Capture a single packet on the interface/port pair; seeing
            # any traffic within tcpdump_timeout means the heartbeat
            # line is alive on that NIC.
            cmd_str = ("timeout %s tcpdump -n -c 1 -p -i %s port %s") \
                % (CONF.host.tcpdump_timeout,
                   corosync_multicast_interfaces[num],
                   corosync_multicast_ports[num])
            command = cmd_str.split(' ')
            try:
                # Execute tcpdump command.
                out, err = utils.execute(*command, run_as_root=True)
                # If command doesn't raise exception, nic is normal.
                msg = ("Corosync communication using '%s' is normal.") \
                    % corosync_multicast_interfaces[num]
                LOG.info("%s", msg)
                is_nic_normal = True
                # One healthy NIC is sufficient; stop probing.
                break
            except Exception:
                msg = ("Corosync communication using '%s' is failed.") \
                    % corosync_multicast_interfaces[num]
                LOG.warning("%s", msg)
        if is_nic_normal is False:
            LOG.error("Corosync communication is failed.")
            return 1
        return 0

    def _check_host_status_by_crmadmin(self):
        """Check that this host is in a stable cluster state.

        Runs 'crmadmin -S <own host>'; only meaningful on a full
        corosync/pacemaker stack (not pacemaker-remote).

        :returns: 0 if the host reports S_IDLE or S_NOT_DC (stable),
            1 otherwise.
        """
        try:
            # Execute crmadmin command.
            out, err = utils.execute('crmadmin', '-S', self.my_hostname,
                                     run_as_root=True)
            if err:
                msg = ("crmadmin command output stderr: %s") % err
                raise Exception(msg)
            # If own host is stable status, crmadmin outputs
            # 'S_IDLE' or 'S_NOT_DC'
            if 'S_IDLE' in out or 'S_NOT_DC' in out:
                return 0
            else:
                raise Exception(
                    "crmadmin command output unexpected host status.")
        except Exception as e:
            LOG.warning("Exception caught: %s", e)
            LOG.warning("'%s' is unstable state on cluster.",
                        self.my_hostname)
            return 1

    def _get_cib_xml(self):
        """Get the cluster information base (cib) in XML format.

        :returns: stdout of 'cibadmin --query', or None on failure.
        """
        try:
            # Execute cibadmin command.
            out, err = utils.execute('cibadmin', '--query', run_as_root=True)
            if err:
                msg = ("cibadmin command output stderr: %s") % err
                raise Exception(msg)
        except Exception as e:
            LOG.warning("Exception caught: %s", e)
            # Implicit None signals command failure to the caller.
            return
        return out

    def _get_crmmon_xml(self):
        """Get summary of cluster's current state in XML format.

        :returns: stdout of 'crm_mon -X', or None on failure.
        """
        try:
            # Execute crm_mon command.
            out, err = utils.execute('crm_mon', '-X', run_as_root=True)
            if err:
                msg = ("crmmon command output stderr: %s") % err
                raise Exception(msg)
        except Exception as e:
            LOG.warning("Exception caught: %s", e)
            # Implicit None signals command failure to the caller.
            return
        return out

    def _is_poweroff(self, hostname):
        """Confirm via IPMI that a host is actually powered off.

        The IPMI credentials/address are taken from the stonith ipmi
        resource-agent parameters stored in the cib.

        :param hostname: host to query.
        :returns: True if 'power status' reports 'Power is off',
            False if the parameters are missing or all retries fail.
        """
        ipmi_values = self.xml_parser.get_stonith_ipmi_params(hostname)
        if ipmi_values is None:
            LOG.error("Failed to get params of ipmi RA.")
            return False
        cmd_str = ("timeout %s ipmitool -U %s -P %s -I %s -H %s "
                   "power status") \
            % (str(CONF.host.ipmi_timeout), ipmi_values['userid'],
               ipmi_values['passwd'], ipmi_values['interface'],
               ipmi_values['ipaddr'])
        command = cmd_str.split(' ')
        retry_count = 0
        while True:
            try:
                # Execute ipmitool command.
                out, err = utils.execute(*command, run_as_root=False)
                if err:
                    msg = ("ipmitool command output stderr: %s") % err
                    raise Exception(msg)
                msg = ("ipmitool command output stdout: %s") % out
                if 'Power is off' in out:
                    LOG.info("%s", msg)
                    return True
                else:
                    # Host still has power (or output was unexpected);
                    # treat as a retryable failure.
                    raise Exception(msg)
            except Exception as e:
                if retry_count < CONF.host.ipmi_retry_max:
                    LOG.warning("Retry executing ipmitool command. (%s)", e)
                    retry_count = retry_count + 1
                    eventlet.greenthread.sleep(CONF.host.ipmi_retry_interval)
                else:
                    LOG.error("Exception caught: %s", e)
                    return False

    def _make_event(self, hostname, current_status):
        """Build a masakari notification for a host status transition.

        :param hostname: host whose status changed.
        :param current_status: new status; 'online' produces a STARTED
            event, anything else a STOPPED event.
        :returns: notification payload dict ready for send_notification.
        """
        if current_status == 'online':
            # Set values that host has started.
            event_type = ec.EventConstants.EVENT_STARTED
            cluster_status = current_status.upper()
            host_status = ec.EventConstants.HOST_STATUS_NORMAL
        else:
            # Set values that host has stopped.
            event_type = ec.EventConstants.EVENT_STOPPED
            cluster_status = current_status.upper()
            if not CONF.host.disable_ipmi_check:
                if self._is_poweroff(hostname):
                    # Set value that host status is normal.
                    host_status = ec.EventConstants.HOST_STATUS_NORMAL
                else:
                    # Set value that host status is unknown.
                    # (Could not confirm power-off, e.g. split brain.)
                    host_status = ec.EventConstants.HOST_STATUS_UNKNOWN
            else:
                # Set value that host status is normal.
                host_status = ec.EventConstants.HOST_STATUS_NORMAL
        current_time = timeutils.utcnow()
        event = {
            'notification': {
                'type': ec.EventConstants.TYPE_COMPUTE_HOST,
                'hostname': hostname,
                'generated_time': current_time,
                'payload': {
                    'event': event_type,
                    'cluster_status': cluster_status,
                    'host_status': host_status
                }
            }
        }
        return event

    def _check_if_status_changed(self, node_state_tag_list):
        """Compare current host statuses with the stored ones and notify.

        :param node_state_tag_list: cib-style node_state tags (dict-like
            objects providing at least 'uname' and 'crmd').
        """
        # Check if host status changed.
        for node_state_tag in node_state_tag_list:
            # hostmonitor doesn't monitor itself.
            if node_state_tag.get('uname') == self.my_hostname:
                continue
            # Get current status and old status.
            current_status = node_state_tag.get('crmd')
            old_status = self.status_holder.get_host_status(
                node_state_tag.get('uname'))
            # If old_status is None, This is first get of host status.
            if old_status is None:
                msg = ("Recognized '%s' as a new member of cluster."
                       " Host status is '%s'.") \
                    % (node_state_tag.get('uname'), current_status)
                LOG.info("%s", msg)
                # Record the baseline status; no notification on first
                # sighting of a host.
                self.status_holder.set_host_status(node_state_tag)
                continue
            # Output host status.
            msg = ("'%s' is '%s'.") % (node_state_tag.get('uname'),
                                       current_status)
            LOG.info("%s", msg)
            # If host status changed, send a notification.
            if current_status != old_status:
                if current_status != 'online' and current_status != 'offline':
                    # If current_status is not 'online' or 'offline',
                    # hostmonitor doesn't send a notification.
                    msg = ("Since host status is '%s',"
                           " hostmonitor doesn't send a notification.") \
                        % current_status
                    LOG.info("%s", msg)
                else:
                    event = self._make_event(node_state_tag.get('uname'),
                                             current_status)
                    # Send a notification.
                    self.notifier.send_notification(
                        CONF.host.api_retry_max,
                        CONF.host.api_retry_interval,
                        event)
            # Update host status.
            # NOTE: this runs even for intermediate (non online/offline)
            # statuses, so the stored status always mirrors the cluster.
            self.status_holder.set_host_status(node_state_tag)

    def _check_host_status_by_crm_mon(self):
        """Check remote-node statuses via 'crm_mon -X'.

        Used when restrict_to_remotes is set; only nodes of type
        'remote' are considered.

        :returns: 0 on success, 1 if the crm_mon command failed.
        :raises: Exception if the XML contains no node entries.
        """
        crmmon_xml = self._get_crmmon_xml()
        if crmmon_xml is None:
            # crm_mon command failure.
            return 1
        # Set to the ParseCrmMonXml object.
        self.crmmon_xml_parser.set_crmmon_xml(crmmon_xml)
        # Get node_state tag list.
        node_state_tag_list = self.crmmon_xml_parser.get_node_state_tag_list()
        if len(node_state_tag_list) == 0:
            # If crmmon xml doesn't have node_state tag,
            # it is an unexpected result.
            raise Exception(
                "Failed to get nodes tag from crm_mon xml.")
        # Convert crm_mon node tags to cib-schema-compliant dicts so the
        # shared _check_if_status_changed logic can process them.
        node_state_tag_list = [CibSchemaCompliantTag(n)
                               for n in node_state_tag_list
                               if n.get('type') == 'remote']
        # Check if status changed.
        self._check_if_status_changed(node_state_tag_list)
        return 0

    def _check_host_status_by_cibadmin(self):
        """Check cluster-node statuses via 'cibadmin --query'.

        :returns: 0 on success, 1 if the cibadmin command failed.
        :raises: Exception if the cib XML contains no node_state tags.
        """
        # Get xml of cib info.
        cib_xml = self._get_cib_xml()
        if cib_xml is None:
            # cibadmin command failure.
            return 1
        # Set to the ParseCibXml object.
        self.xml_parser.set_cib_xml(cib_xml)
        # Check if pacemaker cluster have quorum.
        if self.xml_parser.have_quorum() == 0:
            # Quorum loss is logged but does not stop monitoring.
            msg = "Pacemaker cluster doesn't have quorum."
            LOG.warning("%s", msg)
        # Get node_state tag list.
        node_state_tag_list = self.xml_parser.get_node_state_tag_list()
        if len(node_state_tag_list) == 0:
            # If cib xml doesn't have node_state tag,
            # it is an unexpected result.
            raise Exception(
                "Failed to get node_state tag from cib xml.")
        # Check if status changed.
        self._check_if_status_changed(node_state_tag_list)
        return 0

    def stop(self):
        """Ask the monitoring loop in monitor_hosts to exit."""
        self.running = False

    def monitor_hosts(self):
        """Host monitoring main method.

        This method monitors hosts in a loop until stop() is called:
        each cycle verifies the heartbeat line and own-host stability,
        then compares every other host's status against the stored one
        and notifies masakari of online/offline transitions.
        """
        try:
            self.running = True
            while self.running:
                # Check whether corosync communication between hosts
                # is normal.
                ret = self._check_hb_line()
                if ret == 1:
                    # Because my host may be fenced by stonith due to split
                    # brain condition, sleep for a certain time.
                    eventlet.greenthread.sleep(CONF.host.stonith_wait)
                elif ret == 2:
                    LOG.warning("hostmonitor skips monitoring hosts.")
                    eventlet.greenthread.sleep(CONF.host.monitoring_interval)
                    continue
                # Check the host status is stable or unstable by crmadmin.
                # It only checks when this process runs on the full cluster
                # stack of corosync.
                pacemaker_remote_status = self._check_pacemaker_services(
                    'pacemaker_remote')
                if pacemaker_remote_status is False:
                    if self._check_host_status_by_crmadmin() != 0:
                        LOG.warning("hostmonitor skips monitoring hosts.")
                        eventlet.greenthread.sleep(
                            CONF.host.monitoring_interval)
                        continue
                # Check the host status is online or offline.
                if CONF.host.restrict_to_remotes:
                    status_func = self._check_host_status_by_crm_mon
                else:
                    status_func = self._check_host_status_by_cibadmin
                if status_func() != 0:
                    LOG.warning("hostmonitor skips monitoring hosts.")
                    eventlet.greenthread.sleep(CONF.host.monitoring_interval)
                    continue
                eventlet.greenthread.sleep(CONF.host.monitoring_interval)
        except Exception as e:
            # Any unexpected error ends monitoring; the exception is
            # logged with a traceback and the method returns.
            LOG.exception("Exception caught: %s", e)
            return