/
migrate.py
335 lines (296 loc) · 16 KB
/
migrate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
from oslo_serialization import jsonutils
from nova import availability_zones
from nova.compute import utils as compute_utils
from nova.conductor.tasks import base
from nova import exception
from nova.i18n import _
from nova import objects
from nova.scheduler.client import report
from nova.scheduler import utils as scheduler_utils
LOG = logging.getLogger(__name__)
def replace_allocation_with_migration(context, instance, migration):
    """Move the instance's placement allocation onto the migration record.

    The allocations currently held by the ``instance.uuid`` consumer are
    read from placement. If the instance holds resources on its source
    compute node, the allocations are moved to the ``migration.uuid``
    consumer.

    :param context: request context used for the database and placement
                    calls
    :param instance: instance whose ``host``, ``node`` and ``uuid`` identify
                     the source compute node and the current consumer
    :param migration: migration whose ``uuid`` becomes the new consumer
    :raises: keystoneauth1.exceptions.base.ClientException on failure to
             communicate with the placement API
    :raises: ConsumerAllocationRetrievalFailed if reading the current
             allocation from placement fails
    :raises: ComputeHostNotFound if the host of the instance is not found in
             the database
    :raises: AllocationMoveFailed if moving the allocation from the
             instance.uuid to the migration.uuid fails due to parallel
             placement operation on the instance consumer
    :raises: NoValidHost if placement rejects the update for other reasons
             (e.g. not enough resources)
    :returns: (source_compute_node, original_allocations), or (None, None)
              when the instance has no resources allocated against the
              source compute node
    """
    try:
        source_node = objects.ComputeNode.get_by_host_and_nodename(
            context, instance.host, instance.node)
    except exception.ComputeHostNotFound:
        LOG.error('Unable to find record for source '
                  'node %(node)s on %(host)s',
                  {'host': instance.host, 'node': instance.node},
                  instance=instance)
        # Re-raise unchanged: a generic error like this just errors out
        # the migration and triggers whatever rollback is required.
        raise

    placement = report.SchedulerReportClient()
    consumer_view = placement.get_allocs_for_consumer(context, instance.uuid)
    allocations = consumer_view['allocations']

    # Only the resources held against the source compute node itself decide
    # whether there is anything to move.
    on_source_node = allocations.get(source_node.uuid, {})
    if not on_source_node.get('resources', {}):
        LOG.debug('Unable to find existing allocations for instance on '
                  'source compute node: %s. This is normal if you are not '
                  'using the FilterScheduler.', source_node.uuid,
                  instance=instance)
        return None, None

    # FIXME(danms): This method is flawed in that it assumes allocations
    # against only one provider. So, this may overwrite allocations against
    # a shared provider, if we had one.
    moved = placement.move_allocations(
        context, instance.uuid, migration.uuid)
    if moved:
        LOG.debug('Created allocations for migration %(mig)s on %(rp)s',
                  {'mig': migration.uuid, 'rp': source_node.uuid})
        return source_node, allocations

    LOG.error('Unable to replace resource claim on source '
              'host %(host)s node %(node)s for instance',
              {'host': instance.host,
               'node': instance.node},
              instance=instance)
    # Mimic the "no space" error that could have come from the scheduler.
    # Once we have an atomic replace operation, this would be a severe
    # error.
    raise exception.NoValidHost(
        reason=_('Unable to replace instance claim on source'))
def revert_allocation_for_migration(context, source_cn, instance, migration):
    """Hand the allocation held by the migration back to the instance.

    Moves the allocations from the ``migration.uuid`` consumer to the
    ``instance.uuid`` consumer. A falsy result from the move is only
    logged as an error; this function does not raise for it and always
    returns None.

    :param context: request context used for the placement call
    :param source_cn: source compute node; only its ``uuid`` is used, for
                      the debug log message
    :param instance: instance that receives the allocations back
    :param migration: migration that currently holds the allocations
    """
    placement = report.SchedulerReportClient()

    # FIXME(danms): This method is flawed in that it assumes allocations
    # against only one provider. So, this may overwrite allocations against
    # a shared provider, if we had one.
    restored = placement.move_allocations(
        context, migration.uuid, instance.uuid)
    if restored:
        LOG.debug('Created allocations for instance %(inst)s on %(rp)s',
                  {'inst': instance.uuid, 'rp': source_cn.uuid})
        return

    LOG.error('Unable to replace resource claim on source '
              'host %(host)s node %(node)s for instance',
              {'host': instance.host,
               'node': instance.node},
              instance=instance)
class MigrationTask(base.TaskBase):
    """Task that selects a destination host and starts a resize/migration.

    ``_execute`` prepares the request spec, moves the instance's source
    allocation onto a migration record, picks a destination (from the
    scheduler on the first attempt, from ``host_list`` on a reschedule) and
    casts ``prep_resize`` to it. ``rollback`` undoes the migration record
    and allocation changes made along the way.
    """

    def __init__(self, context, instance, flavor,
                 request_spec, clean_shutdown, compute_rpcapi,
                 query_client, report_client, host_list, network_api):
        super(MigrationTask, self).__init__(context, instance)
        # What is being migrated and how.
        self.flavor = flavor
        self.request_spec = request_spec
        self.clean_shutdown = clean_shutdown
        # None on the first attempt; the remaining alternates on a
        # reschedule (see _execute).
        self.host_list = host_list

        # Clients used to talk to other services.
        self.compute_rpcapi = compute_rpcapi
        self.query_client = query_client
        self.reportclient = report_client
        self.network_api = network_api

        # Persist things from the happy path so we don't have to look
        # them up if we need to roll back.
        self._migration = None
        self._source_cn = None
        self._held_allocations = None

    def _preallocate_migration(self):
        """Find or create the migration record and swap the allocations.

        An existing 'pre-migrating' record of the same type for this
        instance is reused; otherwise a new one is created. The instance's
        source allocation is then moved to the migration consumer via
        replace_allocation_with_migration(), and the results are stored on
        the task for use by rollback().

        :returns: the Migration object in use for this operation
        """
        if self.instance.flavor.id != self.flavor.id:
            migration_type = "resize"
        else:
            migration_type = "migration"

        # If this is a rescheduled migration, don't create a new record.
        pending = objects.MigrationList.get_by_filters(
            self.context,
            {"instance_uuid": self.instance.uuid,
             "migration_type": migration_type,
             "status": "pre-migrating"}).objects
        if not pending:
            migration = objects.Migration(context=self.context.elevated())
            migration.old_instance_type_id = self.instance.flavor.id
            migration.new_instance_type_id = self.flavor.id
            migration.status = 'pre-migrating'
            migration.instance_uuid = self.instance.uuid
            migration.source_compute = self.instance.host
            migration.source_node = self.instance.node
            migration.migration_type = migration_type
            migration.create()
        else:
            migration = pending[0]

        # Record the migration before touching placement so rollback() can
        # flag it as errored even if the allocation swap raises.
        self._migration = migration

        swapped = replace_allocation_with_migration(
            self.context, self.instance, migration)
        self._source_cn, self._held_allocations = swapped

        return migration

    def _restrict_request_spec_to_cell(self, legacy_props):
        """Pin the request spec's destination to the instance's cell.

        :param legacy_props: legacy filter properties dict; its 'retry' key
                             is removed when a target host was requested
        """
        # NOTE(danms): Right now we only support migrate to the same
        # cell as the current instance, so request that the scheduler
        # limit thusly.
        mapping = objects.InstanceMapping.get_by_instance_uuid(
            self.context, self.instance.uuid)
        LOG.debug('Requesting cell %(cell)s while migrating',
                  {'cell': mapping.cell_mapping.identity},
                  instance=self.instance)

        spec = self.request_spec
        if not ('requested_destination' in spec and
                spec.requested_destination):
            # No destination requested yet: create one carrying only the
            # cell restriction.
            spec.requested_destination = objects.Destination(
                cell=mapping.cell_mapping)
            return

        spec.requested_destination.cell = mapping.cell_mapping
        # NOTE(takashin): In the case that the target host is specified,
        # if the migration is failed, it is not necessary to retry
        # the cold migration to the same host. So make sure that
        # reschedule will not occur.
        if 'host' in spec.requested_destination:
            legacy_props.pop('retry', None)
            spec.retry = None

    def _execute(self):
        """Pick a destination host and cast prep_resize to it.

        :raises: MaxRetriesExceeded if this is a reschedule and none of the
                 alternates in ``host_list`` could be used
        """
        # TODO(sbauza): Remove once all the scheduler.utils methods accept a
        # RequestSpec object in the signature.
        legacy_props = self.request_spec.to_legacy_filter_properties_dict()
        scheduler_utils.setup_instance_group(self.context, self.request_spec)

        # If a target host is set in a requested destination,
        # 'populate_retry' need not be executed.
        target_host_requested = (
            'requested_destination' in self.request_spec and
            self.request_spec.requested_destination and
            'host' in self.request_spec.requested_destination)
        if not target_host_requested:
            scheduler_utils.populate_retry(legacy_props,
                                           self.instance.uuid)

        # NOTE(sbauza): Force_hosts/nodes needs to be reset
        # if we want to make sure that the next destination
        # is not forced to be the original host
        self.request_spec.reset_forced_destinations()

        # NOTE(gibi): When cyborg or other module wants to handle similar
        # non-nova resources then here we have to collect all the external
        # resource requests in a single list and add them to the RequestSpec.
        self.request_spec.requested_resources = (
            self.network_api.get_requested_resource_for_instance(
                self.context, self.instance.uuid))

        self._restrict_request_spec_to_cell(legacy_props)

        # Once _preallocate_migration() is done, the source node allocation is
        # moved from the instance consumer to the migration record consumer,
        # and the instance consumer doesn't have any allocations. If this is
        # the first time through here (not a reschedule), select_destinations
        # below will allocate resources on the selected destination node for
        # the instance consumer. If we're rescheduling, host_list is not None
        # and we'll call claim_resources for the instance and the selected
        # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
        # the rollback() method should revert the allocation swap and move
        # the source node allocation from the migration record back to the
        # instance record.
        migration = self._preallocate_migration()

        self.request_spec.ensure_project_and_user_id(self.instance)
        self.request_spec.ensure_network_metadata(self.instance)
        compute_utils.heal_reqspec_is_bfv(
            self.context, self.request_spec, self.instance)

        # On an initial call to migrate, 'self.host_list' will be None, so we
        # have to call the scheduler to get a list of acceptable hosts to
        # migrate to. That list will consist of a selected host, along with
        # zero or more alternates. On a reschedule, though, the alternates will
        # be passed to this object and stored in 'self.host_list', so we can
        # pop the first alternate from the list to use for the destination, and
        # pass the remaining alternates to the compute.
        if self.host_list is not None:
            # This is a reschedule that will use the supplied alternate hosts
            # in the host_list as destinations. Since the resources on these
            # alternates may have been consumed and might not be able to
            # support the migrated instance, we need to first claim the
            # resources to verify the host still has sufficient available
            # resources.
            admin_context = self.context.elevated()
            host_available = False
            while self.host_list and not host_available:
                selection = self.host_list.pop(0)
                alloc_req = (
                    jsonutils.loads(selection.allocation_request)
                    if selection.allocation_request else None)
                if not alloc_req:
                    # Some deployments use different schedulers that do not
                    # use Placement, so they will not have an
                    # allocation_request to claim with. For those cases,
                    # there is no concept of claiming, so just assume that
                    # the host is valid.
                    host_available = True
                    continue
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    admin_context, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
                if host_available:
                    scheduler_utils.fill_provider_mapping(
                        self.context, self.reportclient, self.request_spec,
                        selection)
            # There are no more available hosts. Raise a MaxRetriesExceeded
            # exception in that case.
            if not host_available:
                reason = ("Exhausted all hosts available for retrying build "
                          "failures for instance %(instance_uuid)s." %
                          {"instance_uuid": self.instance.uuid})
                raise exception.MaxRetriesExceeded(reason=reason)
        else:
            selection_lists = self.query_client.select_destinations(
                self.context, self.request_spec, [self.instance.uuid],
                return_objects=True, return_alternates=True)
            # Since there is only ever one instance to migrate per call, we
            # just need the first returned element.
            candidates = selection_lists[0]
            # The selected host is the first item in the list, with the
            # alternates being the remainder of the list.
            selection = candidates[0]
            self.host_list = candidates[1:]
            scheduler_utils.fill_provider_mapping(
                self.context, self.reportclient, self.request_spec, selection)

        scheduler_utils.populate_filter_properties(legacy_props, selection)
        # context is not serializable
        legacy_props.pop('context', None)

        host = selection.service_host
        node = selection.nodename

        self.instance.availability_zone = (
            availability_zones.get_host_availability_zone(
                self.context, host))

        LOG.debug("Calling prep_resize with selected host: %s; "
                  "Selected node: %s; Alternates: %s", host, node,
                  self.host_list, instance=self.instance)
        # RPC cast to the destination host to start the migration process.
        self.compute_rpcapi.prep_resize(
            # NOTE(mriedem): Using request_spec.image here is potentially
            # dangerous if it is not kept up to date (i.e. rebuild/unshelve);
            # seems like the sane thing to do would be to pass the current
            # instance.image_meta since that is what MoveClaim will use for
            # any NUMA topology claims on the destination host...
            self.context, self.instance, self.request_spec.image,
            self.flavor, host, migration,
            request_spec=self.request_spec, filter_properties=legacy_props,
            node=node, clean_shutdown=self.clean_shutdown,
            host_list=self.host_list)

    def rollback(self):
        """Mark the migration as errored and give the allocation back.

        Does nothing for a step that was never reached: the migration is
        only updated if one was recorded, and the allocation is only
        reverted if one is being held for the migration.
        """
        if self._migration:
            self._migration.status = 'error'
            self._migration.save()

        if self._held_allocations:
            # NOTE(danms): We created new-style migration-based
            # allocations for the instance, but failed before we kicked
            # off the migration in the compute. Normally the latter would
            # do that cleanup but we never got that far, so do it here and
            # now.
            revert_allocation_for_migration(
                self.context, self._source_cn,
                self.instance, self._migration)