# Copyright 2014 Red Hat, Inc.
# Copyright 2013 Hewlett-Packard Development Company, L.P.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
A driver wrapping the Ironic API, such that Nova may provision
bare metal resources.
"""
import base64
from distutils import version
import gzip
import shutil
import tempfile
import time
from urllib import parse as urlparse
from openstack import exceptions as sdk_exc
from oslo_log import log as logging
from oslo_serialization import jsonutils
from oslo_service import loopingcall
from oslo_utils import excutils
from oslo_utils import importutils
from tooz import hashring as hash_ring
from nova.api.metadata import base as instance_metadata
from nova import block_device
from nova.compute import power_state
from nova.compute import task_states
from nova.compute import vm_states
import nova.conf
from nova.console import type as console_type
from nova import context as nova_context
from nova import exception
from nova.i18n import _
from nova import objects
from nova.objects import external_event as external_event_obj
from nova.objects import fields as obj_fields
from nova import servicegroup
from nova import utils
from nova.virt import configdrive
from nova.virt import driver as virt_driver
from nova.virt import hardware
from nova.virt.ironic import client_wrapper
from nova.virt.ironic import ironic_states
from nova.virt.ironic import patcher
from nova.virt import netutils
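# NOTE: this module-level reference stays None until IronicDriver.__init__
# lazily imports python-ironicclient via importutils, avoiding a hard import
# dependency at module load time.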
ironic = None
LOG = logging.getLogger(__name__)
CONF = nova.conf.CONF
_POWER_STATE_MAP = {
ironic_states.POWER_ON: power_state.RUNNING,
ironic_states.NOSTATE: power_state.NOSTATE,
ironic_states.POWER_OFF: power_state.SHUTDOWN,
}
_UNPROVISION_STATES = (ironic_states.ACTIVE, ironic_states.DEPLOYFAIL,
ironic_states.ERROR, ironic_states.DEPLOYWAIT,
ironic_states.DEPLOYING, ironic_states.RESCUE,
ironic_states.RESCUING, ironic_states.RESCUEWAIT,
ironic_states.RESCUEFAIL, ironic_states.UNRESCUING,
ironic_states.UNRESCUEFAIL)
_NODE_FIELDS = ('uuid', 'power_state', 'target_power_state', 'provision_state',
'target_provision_state', 'last_error', 'maintenance',
'properties', 'instance_uuid', 'traits', 'resource_class')
# Console state checking interval in seconds
_CONSOLE_STATE_CHECKING_INTERVAL = 1
# Number of hash ring partitions per service
# 5 should be fine for most deployments, as an experimental feature.
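# (2 ** 5 = 32 partitions per compute service in the tooz hash ring.)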
_HASH_RING_PARTITIONS = 2 ** 5
def map_power_state(state):
try:
return _POWER_STATE_MAP[state]
except KeyError:
LOG.warning("Power state %s not found.", state)
return power_state.NOSTATE
def _get_nodes_supported_instances(cpu_arch=None):
"""Return supported instances for a node."""
if not cpu_arch:
return []
return [(cpu_arch,
obj_fields.HVType.BAREMETAL,
obj_fields.VMMode.HVM)]
def _log_ironic_polling(what, node, instance):
power_state = (None if node.power_state is None else
'"%s"' % node.power_state)
tgt_power_state = (None if node.target_power_state is None else
'"%s"' % node.target_power_state)
prov_state = (None if node.provision_state is None else
'"%s"' % node.provision_state)
tgt_prov_state = (None if node.target_provision_state is None else
'"%s"' % node.target_provision_state)
LOG.debug('Still waiting for ironic node %(node)s to %(what)s: '
'power_state=%(power_state)s, '
'target_power_state=%(tgt_power_state)s, '
'provision_state=%(prov_state)s, '
'target_provision_state=%(tgt_prov_state)s',
dict(what=what,
node=node.uuid,
power_state=power_state,
tgt_power_state=tgt_power_state,
prov_state=prov_state,
tgt_prov_state=tgt_prov_state),
instance=instance)
def _check_peer_list():
# these configs are mutable; need to check at runtime and init
if CONF.ironic.partition_key is not None:
peer_list = set(CONF.ironic.peer_list)
if not peer_list:
LOG.error('FATAL: Peer list is not configured in the '
'[ironic]/peer_list option; cannot map '
'ironic nodes to compute services.')
raise exception.InvalidPeerList(host=CONF.host)
if CONF.host not in peer_list:
LOG.error('FATAL: Peer list does not contain this '
'compute service hostname (%s); add it to '
'the [ironic]/peer_list option.', CONF.host)
raise exception.InvalidPeerList(host=CONF.host)
if set([CONF.host]) == peer_list:
LOG.warning('This compute service (%s) is the only service '
'present in the [ironic]/peer_list option. '
'Are you sure this should not include more '
'hosts?', CONF.host)
class IronicDriver(virt_driver.ComputeDriver):
"""Hypervisor driver for Ironic - bare metal provisioning."""
capabilities = {
"has_imagecache": False,
"supports_evacuate": False,
"supports_migrate_to_same_host": False,
"supports_attach_interface": True,
"supports_multiattach": False,
"supports_trusted_certs": False,
"supports_pcpus": False,
"supports_accelerators": False,
# Image type support flags
"supports_image_type_aki": False,
"supports_image_type_ami": True,
"supports_image_type_ari": False,
"supports_image_type_iso": False,
"supports_image_type_qcow2": True,
"supports_image_type_raw": True,
"supports_image_type_vdi": False,
"supports_image_type_vhd": False,
"supports_image_type_vhdx": False,
"supports_image_type_vmdk": False,
"supports_image_type_ploop": False,
}
# This driver is capable of rebalancing nodes between computes.
rebalances_nodes = True
def __init__(self, virtapi, read_only=False):
super(IronicDriver, self).__init__(virtapi)
global ironic
if ironic is None:
ironic = importutils.import_module('ironicclient')
# NOTE(deva): work around a lack of symbols in the current version.
if not hasattr(ironic, 'exc'):
ironic.exc = importutils.import_module('ironicclient.exc')
if not hasattr(ironic, 'client'):
ironic.client = importutils.import_module(
'ironicclient.client')
self.node_cache = {}
self.node_cache_time = 0
self.servicegroup_api = servicegroup.API()
self.ironicclient = client_wrapper.IronicClientWrapper()
self._ironic_connection = None
@property
def ironic_connection(self):
if self._ironic_connection is None:
# Ask get_sdk_adapter to raise ServiceUnavailable if the baremetal
# service isn't ready yet. Consumers of ironic_connection are set
# up to handle this and raise VirtDriverNotReady as appropriate.
self._ironic_connection = utils.get_sdk_adapter(
'baremetal', check_service=True)
return self._ironic_connection
def _get_node(self, node_id):
"""Get a node by its UUID.
Some methods pass in variables named nodename, but they are
actually UUIDs.
"""
node = self.ironic_connection.get_node(node_id, fields=_NODE_FIELDS)
# TODO(dustinc): Make consumers use the right fields and remove this
node.uuid = node.id
node.instance_uuid = node.instance_id
node.maintenance = node.is_maintenance
return node
def _validate_instance_and_node(self, instance):
"""Get the node associated with the instance.
Check with the Ironic service that this instance is associated with a
node, and return the node.
"""
nodes = list(self.ironic_connection.nodes(
instance_id=instance.uuid, fields=_NODE_FIELDS))
if not nodes:
raise exception.InstanceNotFound(instance_id=instance.uuid)
if len(nodes) > 1:
# This indicates a programming error so fail.
raise exception.NovaException(
_('Ironic returned more than one node for a query '
'that can only return zero or one: %s') % nodes)
node = nodes[0]
# TODO(dustinc): Make consumers use the right fields and remove
if hasattr(node, "id"):
node.uuid = node.id
if hasattr(node, "instance_id"):
node.instance_uuid = node.instance_id
if hasattr(node, "is_maintenance"):
node.maintenance = node.is_maintenance
return node
def _node_resources_unavailable(self, node_obj):
"""Determine whether the node's resources are in an acceptable state.
Determines whether the node's resources should be presented
to Nova for use based on the current power, provision and maintenance
state. This is called after _node_resources_used, so any node that
is not used and not in AVAILABLE should be considered in a 'bad' state,
and unavailable for scheduling. Returns True if unacceptable.
"""
bad_power_states = [
ironic_states.ERROR, ironic_states.NOSTATE]
# keep NOSTATE around for compatibility
good_provision_states = [
ironic_states.AVAILABLE, ironic_states.NOSTATE]
return (node_obj.maintenance or
node_obj.power_state in bad_power_states or
node_obj.provision_state not in good_provision_states)
def _node_resources_used(self, node_obj):
"""Determine whether the node's resources are currently used.
Determines whether the node's resources should be considered used
or not. A node is used when it is either in the process of putting
a new instance on the node, has an instance on the node, or is in
the process of cleaning up from a deleted instance. Returns True if
used.
If we report resources as consumed for a node that does not have an
instance on it, the resource tracker will notice there's no instances
consuming resources and try to correct us. So only nodes with an
instance attached should report as consumed here.
"""
return node_obj.instance_uuid is not None
def _parse_node_properties(self, node):
"""Helper method to parse the node's properties."""
properties = {}
for prop in ('cpus', 'memory_mb', 'local_gb'):
try:
properties[prop] = int(node.properties.get(prop, 0))
except (TypeError, ValueError):
LOG.warning('Node %(uuid)s has a malformed "%(prop)s". '
'It should be an integer.',
{'uuid': node.uuid, 'prop': prop})
properties[prop] = 0
raw_cpu_arch = node.properties.get('cpu_arch', None)
try:
cpu_arch = obj_fields.Architecture.canonicalize(raw_cpu_arch)
except exception.InvalidArchitectureName:
cpu_arch = None
if not cpu_arch:
LOG.warning("cpu_arch not defined for node '%s'", node.uuid)
properties['cpu_arch'] = cpu_arch
properties['raw_cpu_arch'] = raw_cpu_arch
properties['capabilities'] = node.properties.get('capabilities')
return properties
def _node_resource(self, node):
"""Helper method to create resource dict from node stats."""
properties = self._parse_node_properties(node)
raw_cpu_arch = properties['raw_cpu_arch']
cpu_arch = properties['cpu_arch']
nodes_extra_specs = {}
# NOTE(deva): In Havana and Icehouse, the flavor was required to link
# to an arch-specific deploy kernel and ramdisk pair, and so the flavor
# also had to have extra_specs['cpu_arch'], which was matched against
# the ironic node.properties['cpu_arch'].
# With Juno, the deploy image(s) may be referenced directly by the
# node.driver_info, and a flavor no longer needs to contain any of
# these three extra specs, though the cpu_arch may still be used
# in a heterogeneous environment, if so desired.
# NOTE(dprince): we use the raw cpu_arch here because extra_specs
# filters aren't canonicalized
nodes_extra_specs['cpu_arch'] = raw_cpu_arch
# NOTE(gilliard): To assist with more precise scheduling, if the
# node.properties contains a key 'capabilities', we expect the value
# to be of the form "k1:v1,k2:v2,etc.." which we add directly as
# key/value pairs into the node_extra_specs to be used by the
# ComputeCapabilitiesFilter
capabilities = properties['capabilities']
if capabilities:
for capability in str(capabilities).split(','):
parts = capability.split(':')
if len(parts) == 2 and parts[0] and parts[1]:
nodes_extra_specs[parts[0].strip()] = parts[1]
else:
LOG.warning("Ignoring malformed capability '%s'. "
"Format should be 'key:val'.", capability)
vcpus = vcpus_used = 0
memory_mb = memory_mb_used = 0
local_gb = local_gb_used = 0
dic = {
'uuid': str(node.uuid),
'hypervisor_hostname': str(node.uuid),
'hypervisor_type': self._get_hypervisor_type(),
'hypervisor_version': self._get_hypervisor_version(),
'resource_class': node.resource_class,
# The Ironic driver manages multiple hosts, so there are
# likely many different CPU models in use. As such it is
# impossible to provide any meaningful info on the CPU
# model of the "host"
'cpu_info': None,
'vcpus': vcpus,
'vcpus_used': vcpus_used,
'local_gb': local_gb,
'local_gb_used': local_gb_used,
'disk_available_least': local_gb - local_gb_used,
'memory_mb': memory_mb,
'memory_mb_used': memory_mb_used,
'supported_instances': _get_nodes_supported_instances(cpu_arch),
'stats': nodes_extra_specs,
'numa_topology': None,
}
return dic
def _set_instance_id(self, node, instance):
try:
# NOTE(TheJulia): Assert an instance ID to lock the node
# from other deployment attempts while configuration is
# being set.
self.ironic_connection.update_node(node, retry_on_conflict=False,
instance_id=instance.uuid)
except sdk_exc.SDKException:
msg = (_("Failed to reserve node %(node)s "
"when provisioning the instance %(instance)s")
% {'node': node.id, 'instance': instance.uuid})
LOG.error(msg)
raise exception.InstanceDeployFailure(msg)
def prepare_for_spawn(self, instance):
LOG.debug('Preparing to spawn instance %s.', instance.uuid)
node_uuid = instance.get('node')
if not node_uuid:
raise ironic.exc.BadRequest(
_("Ironic node uuid not supplied to "
"driver for instance %s.") % instance.uuid)
node = self._get_node(node_uuid)
self._set_instance_id(node, instance)
def failed_spawn_cleanup(self, instance):
LOG.debug('Failed spawn cleanup called for instance',
instance=instance)
try:
node = self._validate_instance_and_node(instance)
except exception.InstanceNotFound:
LOG.warning('Attempt to clean-up from failed spawn of '
'instance %s failed due to no instance_uuid '
'present on the node.', instance.uuid)
return
self._cleanup_deploy(node, instance)
def _add_instance_info_to_node(self, node, instance, image_meta, flavor,
preserve_ephemeral=None,
block_device_info=None):
root_bdm = block_device.get_root_bdm(
virt_driver.block_device_info_get_mapping(block_device_info))
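# A root volume BDM present in the mapping means the instance boots from
# a volume rather than from an image.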
boot_from_volume = root_bdm is not None
patch = patcher.create(node).get_deploy_patch(instance,
image_meta,
flavor,
preserve_ephemeral,
boot_from_volume)
try:
# FIXME(lucasagomes): The "retry_on_conflict" parameter was added
# to basically cause the deployment to fail faster in case the
# node picked by the scheduler is already associated with another
# instance due to bug #1341420.
self.ironicclient.call('node.update', node.uuid, patch,
retry_on_conflict=False)
except ironic.exc.BadRequest:
msg = (_("Failed to add deploy parameters on node %(node)s "
"when provisioning the instance %(instance)s")
% {'node': node.uuid, 'instance': instance.uuid})
LOG.error(msg)
raise exception.InstanceDeployFailure(msg)
def _remove_instance_info_from_node(self, node, instance):
patch = [{'path': '/instance_info', 'op': 'remove'},
{'path': '/instance_uuid', 'op': 'remove'}]
try:
self.ironicclient.call('node.update', node.uuid, patch)
except ironic.exc.BadRequest as e:
LOG.warning("Failed to remove deploy parameters from node "
"%(node)s when unprovisioning the instance "
"%(instance)s: %(reason)s",
{'node': node.uuid, 'instance': instance.uuid,
'reason': str(e)})
def _add_volume_target_info(self, context, instance, block_device_info):
bdms = virt_driver.block_device_info_get_mapping(block_device_info)
for bdm in bdms:
if not bdm.is_volume:
continue
connection_info = jsonutils.loads(bdm._bdm_obj.connection_info)
target_properties = connection_info['data']
driver_volume_type = connection_info['driver_volume_type']
try:
self.ironicclient.call('volume_target.create',
node_uuid=instance.node,
volume_type=driver_volume_type,
properties=target_properties,
boot_index=bdm._bdm_obj.boot_index,
volume_id=bdm._bdm_obj.volume_id)
except (ironic.exc.BadRequest, ironic.exc.Conflict):
msg = (_("Failed to add volume target information of "
"volume %(volume)s on node %(node)s when "
"provisioning the instance")
% {'volume': bdm._bdm_obj.volume_id,
'node': instance.node})
LOG.error(msg, instance=instance)
raise exception.InstanceDeployFailure(msg)
def _cleanup_volume_target_info(self, instance):
targets = self.ironicclient.call('node.list_volume_targets',
instance.node, detail=True)
for target in targets:
volume_target_id = target.uuid
try:
self.ironicclient.call('volume_target.delete',
volume_target_id)
except ironic.exc.NotFound:
LOG.debug("Volume target information %(target)s of volume "
"%(volume)s is already removed from node %(node)s",
{'target': volume_target_id,
'volume': target.volume_id,
'node': instance.node},
instance=instance)
except ironic.exc.ClientException as e:
LOG.warning("Failed to remove volume target information "
"%(target)s of volume %(volume)s from node "
"%(node)s when unprovisioning the instance: "
"%(reason)s",
{'target': volume_target_id,
'volume': target.volume_id,
'node': instance.node,
'reason': e},
instance=instance)
def _cleanup_deploy(self, node, instance, network_info=None,
remove_instance_info=True):
self._cleanup_volume_target_info(instance)
self._unplug_vifs(node, instance, network_info)
if remove_instance_info:
self._remove_instance_info_from_node(node, instance)
def _wait_for_active(self, instance):
"""Wait for the node to be marked as ACTIVE in Ironic."""
instance.refresh()
# Ignore REBUILD_SPAWNING when rebuilding from ERROR state.
if (instance.task_state != task_states.REBUILD_SPAWNING and
(instance.task_state == task_states.DELETING or
instance.vm_state in (vm_states.ERROR, vm_states.DELETED))):
raise exception.InstanceDeployFailure(
_("Instance %s provisioning was aborted") % instance.uuid)
node = self._validate_instance_and_node(instance)
if node.provision_state == ironic_states.ACTIVE:
# job is done
LOG.debug("Ironic node %(node)s is now ACTIVE",
dict(node=node.uuid), instance=instance)
raise loopingcall.LoopingCallDone()
if node.target_provision_state in (ironic_states.DELETED,
ironic_states.AVAILABLE):
# ironic is trying to delete it now
raise exception.InstanceNotFound(instance_id=instance.uuid)
if node.provision_state in (ironic_states.NOSTATE,
ironic_states.AVAILABLE):
# ironic already deleted it
raise exception.InstanceNotFound(instance_id=instance.uuid)
if node.provision_state == ironic_states.DEPLOYFAIL:
# ironic failed to deploy
msg = (_("Failed to provision instance %(inst)s: %(reason)s")
% {'inst': instance.uuid, 'reason': node.last_error})
raise exception.InstanceDeployFailure(msg)
_log_ironic_polling('become ACTIVE', node, instance)
def _wait_for_power_state(self, instance, message):
"""Wait for the node to complete a power state change."""
node = self._validate_instance_and_node(instance)
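# A cleared (NOSTATE) target power state means ironic has finished the
# requested power transition, so stop polling.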
if node.target_power_state == ironic_states.NOSTATE:
raise loopingcall.LoopingCallDone()
_log_ironic_polling(message, node, instance)
def init_host(self, host):
"""Initialize anything that is necessary for the driver to function.
:param host: the hostname of the compute host.
"""
self._refresh_hash_ring(nova_context.get_admin_context())
def _get_hypervisor_type(self):
"""Get hypervisor type."""
return 'ironic'
def _get_hypervisor_version(self):
"""Returns the version of the Ironic API service endpoint."""
return client_wrapper.IRONIC_API_VERSION[0]
def instance_exists(self, instance):
"""Checks the existence of an instance.
This is an override of the base method for efficiency.
:param instance: The instance object.
:returns: True if the instance exists. False if not.
"""
try:
self._validate_instance_and_node(instance)
return True
except exception.InstanceNotFound:
return False
def _get_node_list(self, return_generator=False, **kwargs):
"""Helper function to return a list or generator of nodes.
:param return_generator: If True, returns a generator of nodes. This
generator will only have SDK attribute names.
:returns: a list or generator of raw nodes from ironic
:raises: VirtDriverNotReady
"""
try:
# NOTE(dustinc): The generator returned by the SDK can only be
# iterated once. Since there are cases where it needs to be
# iterated more than once, we should return it as a list. In the
# future it may be worth refactoring these other usages so it can
# be returned as a generator.
node_generator = self.ironic_connection.nodes(**kwargs)
except sdk_exc.InvalidResourceQuery as e:
LOG.error("Invalid parameters in the provided search query."
"Error: %s", str(e))
raise exception.VirtDriverNotReady()
except Exception as e:
LOG.error("An unknown error has occurred when trying to get the "
"list of nodes from the Ironic inventory. Error: %s",
str(e))
raise exception.VirtDriverNotReady()
if return_generator:
return node_generator
else:
node_list = []
# TODO(dustinc): Update all usages to use SDK attributes then stop
# copying values to PythonClient attributes.
for node in node_generator:
# NOTE(dustinc): There are usages that filter out these fields
# which forces us to check for the attributes.
if hasattr(node, "id"):
node.uuid = node.id
if hasattr(node, "instance_id"):
node.instance_uuid = node.instance_id
if hasattr(node, "is_maintenance"):
node.maintenance = node.is_maintenance
node_list.append(node)
return node_list
def list_instances(self):
"""Return the names of all the instances provisioned.
:returns: a list of instance names.
:raises: VirtDriverNotReady
"""
# NOTE(dustinc): The SDK returns an object with instance_id,
# but the Ironic API expects instance_uuid in query.
context = nova_context.get_admin_context()
return [objects.Instance.get_by_uuid(context, i.instance_id).name
for i in self._get_node_list(return_generator=True,
associated=True,
fields=['instance_uuid'])]
def list_instance_uuids(self):
"""Return the IDs of all the instances provisioned.
:returns: a list of instance IDs.
:raises: VirtDriverNotReady
"""
# NOTE(dustinc): The SDK returns an object with instance_id,
# but the Ironic API expects instance_uuid in query.
return [node.instance_id for node in self._get_node_list(
return_generator=True, associated=True, fields=['instance_uuid'])]
def node_is_available(self, nodename):
"""Confirms a Nova hypervisor node exists in the Ironic inventory.
:param nodename: The UUID of the node. Parameter is called nodename
even though it is a UUID to keep method signature
the same as inherited class.
:returns: True if the node exists, False if not.
"""
# NOTE(comstud): We can cheat and use caching here. This method
# just needs to return True for nodes that exist. It doesn't
# matter if the data is stale. Sure, it's possible that removing
# a node from Ironic will cause this method to return True until
# the next call to 'get_available_nodes', but there shouldn't
# be much harm. There's already somewhat of a race.
if not self.node_cache:
# Empty cache, try to populate it.
self._refresh_cache()
# nodename is the ironic node's UUID.
if nodename in self.node_cache:
return True
# NOTE(comstud): Fallback and check Ironic. This case should be
# rare.
try:
# nodename is the ironic node's UUID.
self._get_node(nodename)
return True
except sdk_exc.ResourceNotFound:
return False
def _refresh_hash_ring(self, ctxt):
peer_list = None
# NOTE(jroll) if this is set, we need to limit the set of other
# compute services in the hash ring to hosts that are currently up
# and specified in the peer_list config option, as there's no way
# to check which partition_key other compute services are using.
if CONF.ironic.partition_key is not None:
try:
# NOTE(jroll) first we need to make sure the Ironic API can
# filter by conductor_group. If it cannot, limiting to
# peer_list could end up with a node being managed by multiple
# compute services.
self._can_send_version(min_version='1.46')
peer_list = set(CONF.ironic.peer_list)
# these configs are mutable; need to check at runtime and init.
# luckily, we run this method from init_host.
_check_peer_list()
LOG.debug('Limiting peer list to %s', peer_list)
except exception.IronicAPIVersionNotAvailable:
pass
# TODO(jroll) optimize this to limit to the peer_list
service_list = objects.ServiceList.get_all_computes_by_hv_type(
ctxt, self._get_hypervisor_type())
services = set()
for svc in service_list:
# NOTE(jroll) if peer_list is None, we aren't partitioning by
# conductor group, so we check all compute services for liveness.
# if we have a peer_list, don't check liveness for compute
# services that aren't in the list.
if peer_list is None or svc.host in peer_list:
is_up = self.servicegroup_api.service_is_up(svc)
if is_up:
services.add(svc.host.lower())
# NOTE(jroll): always make sure this service is in the list, because
# only services that have something registered in the compute_nodes
# table will be here so far, and we might be brand new.
services.add(CONF.host.lower())
self.hash_ring = hash_ring.HashRing(services,
partitions=_HASH_RING_PARTITIONS)
LOG.debug('Hash ring members are %s', services)
def _refresh_cache(self):
ctxt = nova_context.get_admin_context()
self._refresh_hash_ring(ctxt)
node_cache = {}
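# Local helper (it intentionally shadows the driver-level _get_node_list)
# that pre-binds the standard node fields for the calls below.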
def _get_node_list(**kwargs):
# NOTE(TheJulia): This call can take a substantial amount
# of time as it may be attempting to retrieve thousands of
# baremetal nodes. Depending on the version of Ironic,
# this can be as long as 2-10 seconds for every thousand
# nodes, and this call may retrieve all nodes in a deployment,
# depending on whether any filter parameters are applied.
return self._get_node_list(fields=_NODE_FIELDS, **kwargs)
# NOTE(jroll) if partition_key is set, we need to limit nodes that
# can be managed to nodes that have a matching conductor_group
# attribute. If the API isn't new enough to support conductor groups,
# we fall back to managing all nodes. If it is new enough, we can
# filter it in the API.
partition_key = CONF.ironic.partition_key
if partition_key is not None:
try:
self._can_send_version(min_version='1.46')
nodes = _get_node_list(conductor_group=partition_key)
LOG.debug('Limiting manageable ironic nodes to conductor '
'group %s', partition_key)
except exception.IronicAPIVersionNotAvailable:
LOG.error('Required Ironic API version 1.46 is not '
'available to filter nodes by conductor group. '
'All nodes will be eligible to be managed by '
'this compute service.')
nodes = _get_node_list()
else:
nodes = _get_node_list()
# NOTE(saga): As _get_node_list() will take a long
# time to return in large clusters we need to call it before the
# get_uuids_by_host() method. Otherwise the instance list we get
# from get_uuids_by_host() will become stale.
# A stale instances list can cause a node that is managed by this
# compute host to be excluded in error and cause the compute node
# to be orphaned and associated resource provider to be deleted.
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
for node in nodes:
# NOTE(jroll): we always manage the nodes for instances we manage
if node.instance_uuid in instances:
node_cache[node.uuid] = node
# NOTE(jroll): check if the node matches us in the hash ring, and
# does not have an instance_uuid (which would imply the node has
# an instance managed by another compute service).
# Note that this means nodes with an instance that was deleted in
# nova while the service was down, and not yet reaped, will not be
# reported until the periodic task cleans it up.
elif (node.instance_uuid is None and
CONF.host.lower() in
self.hash_ring.get_nodes(node.uuid.encode('utf-8'))):
node_cache[node.uuid] = node
self.node_cache = node_cache
self.node_cache_time = time.time()
def get_available_nodes(self, refresh=False):
"""Returns the UUIDs of Ironic nodes managed by this compute service.
We use consistent hashing to distribute Ironic nodes between all
available compute services. The subset of nodes managed by a given
compute service is determined by the following rules:
* any node with an instance managed by the compute service
* any node that is mapped to the compute service on the hash ring
* no nodes with instances managed by another compute service
The ring is rebalanced as nova-compute services are brought up and
down. Note that this rebalance does not happen at the same time for
all compute services, so a node may be managed by multiple compute
services for a small amount of time.
:param refresh: Boolean value; If True run update first. Ignored by
this driver.
:returns: a list of UUIDs
"""
# NOTE(jroll) we refresh the cache every time this is called
# because it needs to happen in the resource tracker
# periodic task. This task doesn't pass refresh=True,
# unfortunately.
self._refresh_cache()
node_uuids = list(self.node_cache.keys())
LOG.debug("Returning %(num_nodes)s available node(s)",
dict(num_nodes=len(node_uuids)))
return node_uuids
def update_provider_tree(self, provider_tree, nodename, allocations=None):
"""Update a ProviderTree object with current resource provider and
inventory information.
:param nova.compute.provider_tree.ProviderTree provider_tree:
A nova.compute.provider_tree.ProviderTree object representing all
the providers in the tree associated with the compute node, and any
sharing providers (those with the ``MISC_SHARES_VIA_AGGREGATE``
trait) associated via aggregate with any of those providers (but
not *their* tree- or aggregate-associated providers), as currently
known by placement.
:param nodename:
String name of the compute node (i.e.
ComputeNode.hypervisor_hostname) for which the caller is requesting
updated provider information.
:param allocations:
Dict of allocation data of the form:
{ $CONSUMER_UUID: {
# The shape of each "allocations" dict below is identical
# to the return from GET /allocations/{consumer_uuid}
"allocations": {
$RP_UUID: {
"generation": $RP_GEN,
"resources": {
$RESOURCE_CLASS: $AMOUNT,
...
},
},
...
},
"project_id": $PROJ_ID,
"user_id": $USER_ID,
"consumer_generation": $CONSUMER_GEN,
},
...
}
If None, and the method determines that any inventory needs to be
moved (from one provider to another and/or to a different resource
class), the ReshapeNeeded exception must be raised. Otherwise, this
dict must be edited in place to indicate the desired final state of
allocations.
:raises ReshapeNeeded: If allocations is None and any inventory needs
to be moved from one provider to another and/or to a different
resource class.
"""
# nodename is the ironic node's UUID.
node = self._node_from_cache(nodename)
reserved = False
if (not self._node_resources_used(node) and
self._node_resources_unavailable(node)):
LOG.debug('Node %(node)s is not ready for a deployment, '
'reporting resources as reserved for it. Node\'s '
'provision state is %(prov)s, power state is '
'%(power)s and maintenance is %(maint)s.',
{'node': node.uuid, 'prov': node.provision_state,
'power': node.power_state, 'maint': node.maintenance})
reserved = True
info = self._node_resource(node)
result = {}
rc_name = info.get('resource_class')
if rc_name is None:
raise exception.NoResourceClass(node=nodename)
norm_name = utils.normalize_rc_name(rc_name)
if norm_name is not None:
result[norm_name] = {
'total': 1,
'reserved': int(reserved),
'min_unit': 1,
'max_unit': 1,
'step_size': 1,
'allocation_ratio': 1.0,
}
provider_tree.update_inventory(nodename, result)
# TODO(efried): *Unset* (remove_traits) if "owned" by ironic virt but
# not set on the node object, and *set* (add_traits) only those both
# owned by ironic virt and set on the node object.
provider_tree.update_traits(nodename, node.traits)
def get_available_resource(self, nodename):
"""Retrieve resource information.
This method is called when nova-compute launches, and
as part of a periodic task that records the results in the DB.
:param nodename: the UUID of the node.
:returns: a dictionary describing resources.
"""
# NOTE(comstud): We can cheat and use caching here. This method is
# only called from a periodic task and right after the above
# get_available_nodes() call is called.
if not self.node_cache:
# Well, it's also called from init_host(), so if we have empty
# cache, let's try to populate it.
self._refresh_cache()
# nodename is the ironic node's UUID.
node = self._node_from_cache(nodename)
return self._node_resource(node)
def _node_from_cache(self, node_uuid):
"""Returns a node from the cache, retrieving the node from Ironic API
if the node doesn't yet exist in the cache.
"""
# NOTE(vdrok): node_cache might also be modified during instance
# _unprovision call, hence this function is synchronized
@utils.synchronized('ironic-node-%s' % node_uuid)
def _sync_node_from_cache():
cache_age = time.time() - self.node_cache_time
if node_uuid in self.node_cache:
LOG.debug("Using cache for node %(node)s, age: %(age)s",
{'node': node_uuid, 'age': cache_age})
return self.node_cache[node_uuid]
else:
LOG.debug("Node %(node)s not found in cache, age: %(age)s",
{'node': node_uuid, 'age': cache_age})
node = self._get_node(node_uuid)
self.node_cache[node_uuid] = node
return node
return _sync_node_from_cache()
def get_info(self, instance, use_cache=True):
"""Get the current state and resource usage for this instance.
If the instance is not found this method returns an InstanceInfo
with NOSTATE and all resources == 0.
:param instance: the instance object.
:param use_cache: boolean to indicate if the driver should be allowed
to use cached data to return instance status.
If false, pull fresh data from ironic.
:returns: an InstanceInfo object
"""
def _fetch_from_ironic(self, instance):
try:
node = self._validate_instance_and_node(instance)
return hardware.InstanceInfo(
state=map_power_state(node.power_state))
except exception.InstanceNotFound:
return hardware.InstanceInfo(
state=map_power_state(ironic_states.NOSTATE))
if not use_cache:
return _fetch_from_ironic(self, instance)
# we should already have a cache for our nodes, refreshed on every
# RT loop. but if we don't have a cache, generate it.
if not self.node_cache:
self._refresh_cache()
for node in self.node_cache.values():
if instance.uuid == node.instance_uuid:
break
else:
# if we can't find the instance, fall back to ironic
return _fetch_from_ironic(self, instance)
return hardware.InstanceInfo(state=map_power_state(node.power_state))
def _get_network_metadata(self, node, network_info):
"""Gets a more complete representation of the instance network info.
This data is exposed as network_data.json in the metadata service and
the config drive.
:param node: The node object.
:param network_info: Instance network information.
"""