/
report.py
2371 lines (2133 loc) · 112 KB
/
report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (c) 2014 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import contextlib
import copy
import functools
import random
import time
from keystoneauth1 import exceptions as ks_exc
import os_resource_classes as orc
import os_traits
from oslo_log import log as logging
from oslo_middleware import request_id
from oslo_utils import excutils
from oslo_utils import versionutils
import retrying
import six
from nova.compute import provider_tree
import nova.conf
from nova import exception
from nova.i18n import _
from nova import objects
from nova import utils
# Handle to nova's global configuration object; options are read from it at
# call time (e.g. CONF.compute.resource_provider_association_refresh).
CONF = nova.conf.CONF
# Module-level logger used by every function and method in this file.
LOG = logging.getLogger(__name__)
# After warn_limit() emits a warning it swallows this many subsequent
# warnings before emitting the next one.
WARN_EVERY = 10
# Placement API microversion strings. The ones referenced in this part of the
# file are passed as the ``version`` argument of the HTTP helper methods.
# NOTE(review): each name presumably identifies the feature introduced by
# that microversion - confirm against the placement microversion history.
NEGATIVE_MEMBER_OF_VERSION = '1.32'
RESHAPER_VERSION = '1.30'
CONSUMER_GENERATION_VERSION = '1.28'
ALLOW_RESERVED_EQUAL_TOTAL_INVENTORY_VERSION = '1.26'
POST_RPS_RETURNS_PAYLOAD_API_VERSION = '1.20'
AGGREGATE_GENERATION_VERSION = '1.19'
NESTED_PROVIDER_API_VERSION = '1.14'
POST_ALLOCATIONS_API_VERSION = '1.13'
GET_USAGES_VERSION = '1.9'
# Lightweight result records. AggInfo is what _get_provider_aggregates()
# returns and TraitInfo is what get_provider_traits() returns; both pair the
# retrieved set with the provider generation reported by placement.
AggInfo = collections.namedtuple('AggInfo', ['aggregates', 'generation'])
TraitInfo = collections.namedtuple('TraitInfo', ['traits', 'generation'])
# Single-field record wrapping a provider's allocations (not used in the
# portion of the file visible here).
ProviderAllocInfo = collections.namedtuple(
'ProviderAllocInfo', ['allocations'])
def warn_limit(self, msg):
    """Log ``msg`` as a warning, but only once every WARN_EVERY + 1 calls.

    The countdown lives on the caller's object so each client throttles its
    own warnings independently.

    :param self: Object carrying a mutable integer ``_warn_count`` attribute.
    :param msg: The warning text to emit when the countdown has run out.
    """
    if not self._warn_count:
        # Countdown exhausted: re-arm it and let this warning through.
        self._warn_count = WARN_EVERY
        LOG.warning(msg)
        return
    # Still inside the quiet period; just burn one tick.
    self._warn_count -= 1
def safe_connect(f):
    """Decorator that absorbs common placement connectivity failures.

    The wrapped method is invoked normally and its result is returned. If it
    raises one of the keystoneauth1 exceptions handled below, a warning is
    logged and the wrapper returns None instead of propagating the error, so
    callers of decorated methods must be prepared for a None result.

    :param f: The method to wrap. Its first positional argument must be the
              object providing ``_warn_count``, ``_client`` and
              ``_create_client()``, which the handlers use.
    :return: The wrapping function (metadata copied from ``f``).
    """
    @functools.wraps(f)
    def guarded(self, *args, **kwargs):
        try:
            return f(self, *args, **kwargs)
        except ks_exc.EndpointNotFound:
            warn_limit(
                self, 'The placement API endpoint was not found.')
            # Build a brand new client session: the service catalog is
            # cached the first time keystone is reached successfully, so a
            # fresh session is needed to pick up a new catalog.
            self._client = self._create_client()
        except ks_exc.MissingAuthPlugin:
            warn_limit(
                self, 'No authentication information found for placement API.')
        except ks_exc.Unauthorized:
            warn_limit(
                self, 'Placement service credentials do not work.')
        except ks_exc.DiscoveryFailure:
            # TODO(_gryf): Looks like DiscoveryFailure is not the only missing
            # exception here. In Pike we should take care about keystoneauth1
            # failures handling globally.
            warn_limit(self,
                       'Discovering suitable URL for placement API failed.')
        except ks_exc.ConnectFailure:
            # Unlike the cases above, this warning is not rate limited.
            LOG.warning('Placement API service is not responding.')
        # Reached only after one of the handlers above ran.
        return None
    return guarded
class Retry(Exception):
    """Raised to ask the ``retries`` decorator to attempt the call again.

    :param operation: Short description of what was being attempted.
    :param reason: Why the attempt could not be completed.
    """

    def __init__(self, operation, reason):
        self.operation, self.reason = operation, reason
def retries(f):
    """Decorator that re-invokes a call each time it raises Retry.

    The wrapped callable is attempted up to four times in total. Before each
    attempt the wrapper sleeps a random interval between 0 and twice the
    zero-based attempt number in seconds (so the first attempt is
    immediate).

    Note that this returns the actual value of the inner call on success,
    or False once every attempt has raised Retry.
    """
    @functools.wraps(f)
    def attempt_loop(self, *args, **kwargs):
        for attempt in range(4):
            try:
                # Randomised, growing back-off; zero on the first pass.
                time.sleep(random.uniform(0, attempt * 2))
                return f(self, *args, **kwargs)
            except Retry as exc:
                LOG.debug(
                    'Unable to %(op)s because %(reason)s; retrying...',
                    {'op': exc.operation, 'reason': exc.reason})
        LOG.error('Failed scheduler client operation %s: out of retries',
                  f.__name__)
        return False
    return attempt_loop
def _move_operation_alloc_request(source_allocs, dest_alloc_req):
"""Given existing allocations for a source host and a new allocation
request for a destination host, return a new allocation_request that
contains resources claimed against both source and destination, accounting
for shared providers.
This is expected to only be used during an evacuate operation.
:param source_allocs: Dict, keyed by resource provider UUID, of resources
allocated on the source host
:param dest_alloc_req: The allocation_request for resources against the
destination host
"""
LOG.debug("Doubling-up allocation_request for move operation. Current "
"allocations: %s", source_allocs)
# Remove any allocations against resource providers that are
# already allocated against on the source host (like shared storage
# providers)
cur_rp_uuids = set(source_allocs.keys())
new_rp_uuids = set(dest_alloc_req['allocations']) - cur_rp_uuids
current_allocs = {
cur_rp_uuid: {'resources': alloc['resources']}
for cur_rp_uuid, alloc in source_allocs.items()
}
new_alloc_req = {'allocations': current_allocs}
for rp_uuid in dest_alloc_req['allocations']:
if rp_uuid in new_rp_uuids:
new_alloc_req['allocations'][rp_uuid] = dest_alloc_req[
'allocations'][rp_uuid]
LOG.debug("New allocation_request containing both source and "
"destination hosts in move operation: %s", new_alloc_req)
return new_alloc_req
def get_placement_request_id(response):
    """Extract the request ID header from a placement response.

    :param response: The HTTP response object, or None.
    :return: The value of the request-id response header, or None if
             ``response`` is None or the header is absent.
    """
    if response is None:
        return None
    header_name = request_id.HTTP_RESP_HEADER_REQUEST_ID
    return response.headers.get(header_name)
# TODO(mriedem): Consider making SchedulerReportClient a global singleton so
# that things like the compute API do not have to lazy-load it. That would
# likely require inspecting methods that use a ProviderTree cache to see if
# they need locks.
class SchedulerReportClient(object):
"""Client class for updating the scheduler."""
def __init__(self, adapter=None):
"""Initialize the report client.
:param adapter: A prepared keystoneauth1 Adapter for API communication.
If unspecified, one is created based on config options in the
[placement] section.
"""
self._adapter = adapter
# An object that contains a nova-compute-side cache of resource
# provider and inventory information
self._provider_tree = None
# Track the last time we updated providers' aggregates and traits
self._association_refresh_time = None
self._client = self._create_client()
# NOTE(danms): Keep track of how naggy we've been
self._warn_count = 0
def clear_provider_cache(self, init=False):
    """Discard all locally cached provider data.

    Replaces the provider tree with a new, empty ProviderTree and forgets
    every recorded association refresh time.

    :param init: If true, the reset is part of client (re)creation and the
                 informational log message is skipped.
    """
    if init:
        pass
    else:
        LOG.info("Clearing the report client's provider cache.")
    self._provider_tree = provider_tree.ProviderTree()
    self._association_refresh_time = {}
def _clear_provider_cache_for_tree(self, rp_uuid):
"""Clear the provider cache for only the tree containing rp_uuid.
This exists for situations where we encounter an error updating
placement, and therefore need to refresh the provider tree cache before
redriving the update. However, it would be wasteful and inefficient to
clear the *entire* cache, which may contain many separate trees (e.g.
ironic nodes or sharing providers) which should be unaffected by the
error.
:param rp_uuid: UUID of a resource provider, which may be anywhere in a
a tree hierarchy, i.e. need not be a root. For non-root
providers, we still clear the cache for the entire tree
including descendants, ancestors up to the root,
siblings/cousins and *their* ancestors/descendants.
"""
try:
uuids = self._provider_tree.get_provider_uuids_in_tree(rp_uuid)
except ValueError:
# If the provider isn't in the tree, it should also not be in the
# timer dict, so nothing to clear.
return
# get_provider_uuids_in_tree returns UUIDs in top-down order, so the
# first one is the root; and .remove() is recursive.
self._provider_tree.remove(uuids[0])
for uuid in uuids:
self._association_refresh_time.pop(uuid, None)
def _create_client(self):
"""Create the HTTP session accessing the placement service."""
# Flush provider tree and associations so we start from a clean slate.
self.clear_provider_cache(init=True)
client = self._adapter or utils.get_sdk_adapter('placement')
# Set accept header on every request to ensure we notify placement
# service of our response body media type preferences.
client.additional_headers = {'accept': 'application/json'}
return client
def get(self, url, version=None, global_request_id=None):
return self._client.get(url, microversion=version,
global_request_id=global_request_id)
def post(self, url, data, version=None, global_request_id=None):
# NOTE(sdague): using json= instead of data= sets the
# media type to application/json for us. Placement API is
# more sensitive to this than other APIs in the OpenStack
# ecosystem.
return self._client.post(url, json=data, microversion=version,
global_request_id=global_request_id)
def put(self, url, data, version=None, global_request_id=None):
# NOTE(sdague): using json= instead of data= sets the
# media type to application/json for us. Placement API is
# more sensitive to this than other APIs in the OpenStack
# ecosystem.
return self._client.put(url, json=data, microversion=version,
global_request_id=global_request_id)
def delete(self, url, version=None, global_request_id=None):
return self._client.delete(url, microversion=version,
global_request_id=global_request_id)
@safe_connect
def get_allocation_candidates(self, context, resources):
    """Ask placement for candidates that can satisfy a resource request.

    The allocation_requests are a collection of potential JSON objects that
    can be passed to the PUT /allocations/{consumer_uuid} Placement REST
    API to claim resources against one or more resource providers that meet
    the requested resource constraints.

    The provider summaries is a dict, keyed by resource provider UUID, of
    inventory and capacity information and traits for any resource
    provider involved in the allocation_requests.

    Example member_of (aggregates) value in resources:

        [('foo', 'bar'), ('baz',)]

    translates to:

        "Candidates are in either 'foo' or 'bar', but definitely in 'baz'"

    :param context: The security context
    :param nova.scheduler.utils.ResourceRequest resources:
        A ResourceRequest object representing the requested resources,
        traits, and aggregates from the request spec.
    :returns: A tuple of (allocation_requests, provider_summaries,
              allocation_request_version): a list of allocation_request
              dicts, a dict of provider information, and the microversion
              used to request this data from placement. If placement
              answers with anything other than 200 the tuple is
              (None, None, None). Because of @safe_connect, a bare None is
              returned when one of the handled connection errors occurs.
    """
    # Note that claim_resources() will use this version as well to
    # make allocations by `PUT /allocations/{consumer_uuid}`
    version = NEGATIVE_MEMBER_OF_VERSION
    url = "/allocation_candidates?%s" % resources.to_querystring()
    resp = self.get(url, version=version,
                    global_request_id=context.global_id)
    if resp.status_code != 200:
        failure = {
            'resource_request': str(resources),
            'status_code': resp.status_code,
            'err_text': resp.text,
        }
        LOG.error("Failed to retrieve allocation candidates from placement "
                  "API for filters: %(resource_request)s\n"
                  "Got %(status_code)d: %(err_text)s.", failure)
        return None, None, None
    body = resp.json()
    return (body['allocation_requests'], body['provider_summaries'],
            version)
@safe_connect
def _get_provider_aggregates(self, context, rp_uuid):
    """Fetch the aggregates a resource provider belongs to from placement.

    :param context: The security context
    :param rp_uuid: UUID of the resource provider to grab aggregates for.
    :return: A namedtuple comprising:
                * .aggregates: A set() of string aggregate UUIDs, which may
                  be empty if the specified provider is associated with no
                  aggregates.
                * .generation: The resource provider generation.
             Because of @safe_connect, None is returned instead when one of
             the handled connection errors occurs.
    :raise: ResourceProviderAggregateRetrievalFailed on any non-200
            response. In particular, we raise this exception (as opposed to
            returning None or the empty set()) if the specified resource
            provider does not exist.
    """
    resp = self.get("/resource_providers/%s/aggregates" % rp_uuid,
                    version=AGGREGATE_GENERATION_VERSION,
                    global_request_id=context.global_id)
    if resp.status_code != 200:
        failure = {
            'placement_req_id': get_placement_request_id(resp),
            'uuid': rp_uuid,
            'status_code': resp.status_code,
            'err_text': resp.text,
        }
        LOG.error("[%(placement_req_id)s] Failed to retrieve aggregates from "
                  "placement API for resource provider with UUID %(uuid)s. "
                  "Got %(status_code)d: %(err_text)s.", failure)
        raise exception.ResourceProviderAggregateRetrievalFailed(
            uuid=rp_uuid)
    body = resp.json()
    return AggInfo(aggregates=set(body['aggregates']),
                   generation=body['resource_provider_generation'])
def get_provider_traits(self, context, rp_uuid):
    """Fetch the traits set on a resource provider from placement.

    :param context: The security context
    :param rp_uuid: UUID of the resource provider to grab traits for.
    :return: A namedtuple comprising:
                * .traits: A set() of string trait names, which may be
                  empty if the specified provider has no traits.
                * .generation: The resource provider generation.
    :raise: ResourceProviderTraitRetrievalFailed on any non-200 response.
            In particular, we raise this exception (as opposed to returning
            None or the empty set()) if the specified resource provider
            does not exist.
    :raise: keystoneauth1.exceptions.ClientException if placement API
            communication fails.
    """
    resp = self.get("/resource_providers/%s/traits" % rp_uuid,
                    version='1.6', global_request_id=context.global_id)
    if resp.status_code != 200:
        failure = {
            'placement_req_id': get_placement_request_id(resp),
            'uuid': rp_uuid,
            'status_code': resp.status_code,
            'err_text': resp.text,
        }
        LOG.error(
            "[%(placement_req_id)s] Failed to retrieve traits from "
            "placement API for resource provider with UUID %(uuid)s. Got "
            "%(status_code)d: %(err_text)s.", failure)
        raise exception.ResourceProviderTraitRetrievalFailed(uuid=rp_uuid)
    body = resp.json()
    return TraitInfo(traits=set(body['traits']),
                     generation=body['resource_provider_generation'])
def get_resource_provider_name(self, context, uuid):
"""Return the name of a RP. It tries to use the internal of RPs or
falls back to calling placement directly.
:param context: The security context
:param uuid: UUID identifier for the resource provider to look up
:return: The name of the RP
:raise: ResourceProviderRetrievalFailed if the RP is not in the cache
and the communication with the placement is failed.
:raise: ResourceProviderNotFound if the RP does not exists.
"""
try:
return self._provider_tree.data(uuid).name
except ValueError:
rsp = self._get_resource_provider(context, uuid)
if rsp is None:
raise exception.ResourceProviderNotFound(name_or_uuid=uuid)
else:
return rsp['name']
@safe_connect
def _get_resource_provider(self, context, uuid):
    """Fetch a single resource provider record from placement by UUID.

    :param context: The security context
    :param uuid: UUID identifier for the resource provider to look up
    :return: A dict of resource provider information if found, or None if
             placement answers 404. Because of @safe_connect, None is also
             returned when one of the handled connection errors occurs.
    :raise: ResourceProviderRetrievalFailed on any other response status.
    """
    resp = self.get("/resource_providers/%s" % uuid,
                    version=NESTED_PROVIDER_API_VERSION,
                    global_request_id=context.global_id)
    if resp.status_code == 200:
        return resp.json()
    if resp.status_code == 404:
        return None
    # Anything else is unexpected: log the details and give up.
    failure = {
        'uuid': uuid,
        'status_code': resp.status_code,
        'err_text': resp.text,
        'placement_req_id': get_placement_request_id(resp),
    }
    LOG.error("[%(placement_req_id)s] Failed to retrieve resource "
              "provider record from placement API for UUID %(uuid)s. Got "
              "%(status_code)d: %(err_text)s.", failure)
    raise exception.ResourceProviderRetrievalFailed(uuid=uuid)
@safe_connect
def _get_sharing_providers(self, context, agg_uuids):
    """List sharing providers that belong to any of the given aggregates.

    Queries placement for resource providers that are members of at least
    one of the specified aggregates and that have the
    MISC_SHARES_VIA_AGGREGATE trait.

    :param context: The security context
    :param agg_uuids: Iterable of string UUIDs of aggregates to filter on.
    :return: A list of dicts of resource provider information, which may be
             empty if nothing matches. An empty list is returned without
             contacting placement when ``agg_uuids`` is empty. Because of
             @safe_connect, None is returned when one of the handled
             connection errors occurs.
    :raise: ResourceProviderRetrievalFailed on any non-200 response.
    """
    if not agg_uuids:
        return []
    aggs = ','.join(agg_uuids)
    query = "/resource_providers?member_of=in:%s&required=%s" % (
        aggs, os_traits.MISC_SHARES_VIA_AGGREGATE)
    resp = self.get(query, version='1.18',
                    global_request_id=context.global_id)
    if resp.status_code != 200:
        # The same text feeds both the log and the exception message.
        msg = _("[%(placement_req_id)s] Failed to retrieve sharing resource "
                "providers associated with the following aggregates from "
                "placement API: %(aggs)s. Got %(status_code)d: %(err_text)s.")
        failure = {
            'aggs': aggs,
            'status_code': resp.status_code,
            'err_text': resp.text,
            'placement_req_id': get_placement_request_id(resp),
        }
        LOG.error(msg, failure)
        raise exception.ResourceProviderRetrievalFailed(
            message=msg % failure)
    return resp.json()['resource_providers']
def get_providers_in_tree(self, context, uuid):
    """List every resource provider in the tree containing ``uuid``.

    :param context: The security context
    :param uuid: UUID identifier for the resource provider to look up
    :return: A list of dicts of resource provider information, which may be
             empty if no provider exists with the specified UUID.
    :raise: ResourceProviderRetrievalFailed on any non-200 response.
    :raise: keystoneauth1.exceptions.ClientException if placement API
            communication fails.
    """
    resp = self.get("/resource_providers?in_tree=%s" % uuid,
                    version=NESTED_PROVIDER_API_VERSION,
                    global_request_id=context.global_id)
    if resp.status_code != 200:
        # Some unexpected error
        failure = {
            'uuid': uuid,
            'status_code': resp.status_code,
            'err_text': resp.text,
            'placement_req_id': get_placement_request_id(resp),
        }
        LOG.error("[%(placement_req_id)s] Failed to retrieve resource provider "
                  "tree from placement API for UUID %(uuid)s. Got "
                  "%(status_code)d: %(err_text)s.", failure)
        raise exception.ResourceProviderRetrievalFailed(uuid=uuid)
    return resp.json()['resource_providers']
@safe_connect
def _create_resource_provider(self, context, uuid, name,
                              parent_provider_uuid=None):
    """Create a new resource provider record in placement.

    :param context: The security context
    :param uuid: UUID of the new resource provider
    :param name: Name of the resource provider
    :param parent_provider_uuid: Optional UUID of the immediate parent
    :return: A dict of resource provider information representing the
             newly-created resource provider. If placement reports a 409
             that is not a name conflict, the result of looking the
             provider up by UUID is returned instead. Because of
             @safe_connect, None is returned when one of the handled
             connection errors occurs.
    :raise: ResourceProviderCreationFailed or
            ResourceProviderRetrievalFailed on error.
    """
    payload = {'uuid': uuid, 'name': name}
    if parent_provider_uuid is not None:
        payload['parent_provider_uuid'] = parent_provider_uuid

    # Bug #1746075: First try the microversion that returns the new
    # provider's payload.
    resp = self.post("/resource_providers", payload,
                     version=POST_RPS_RETURNS_PAYLOAD_API_VERSION,
                     global_request_id=context.global_id)
    placement_req_id = get_placement_request_id(resp)

    if resp:
        LOG.info("[%(placement_req_id)s] Created resource provider record "
                 "via placement API for resource provider with UUID "
                 "%(uuid)s and name %(name)s.",
                 {'uuid': uuid,
                  'name': name,
                  'placement_req_id': placement_req_id})
        return resp.json()

    # TODO(efried): Push error codes from placement, and use 'em.
    is_uuid_conflict = (
        resp.status_code == 409 and
        'Conflicting resource provider name:' not in resp.text)
    if is_uuid_conflict:
        # Another thread concurrently created a resource provider with the
        # same UUID. Note it in the log and hand back the record that
        # _get_resource_provider() finds.
        LOG.info("[%(placement_req_id)s] Another thread already created a "
                 "resource provider with the UUID %(uuid)s. Grabbing that "
                 "record from the placement API.",
                 {'uuid': uuid, 'placement_req_id': placement_req_id})
        return self._get_resource_provider(context, uuid)

    # A provider with the same *name* already exists, or some other error.
    LOG.error("[%(placement_req_id)s] Failed to create resource provider "
              "record in placement API for UUID %(uuid)s. Got "
              "%(status_code)d: %(err_text)s.",
              {'uuid': uuid,
               'status_code': resp.status_code,
               'err_text': resp.text,
               'placement_req_id': placement_req_id})
    raise exception.ResourceProviderCreationFailed(name=name)
def _ensure_resource_provider(self, context, uuid, name=None,
parent_provider_uuid=None):
"""Ensures that the placement API has a record of a resource provider
with the supplied UUID. If not, creates the resource provider record in
the placement API for the supplied UUID, passing in a name for the
resource provider.
If found or created, the provider's UUID is returned from this method.
If the resource provider for the supplied uuid was not found and the
resource provider record could not be created in the placement API, an
exception is raised.
If this method returns successfully, callers are assured that the
placement API contains a record of the provider; and that the local
cache of resource provider information contains a record of:
- The specified provider
- All providers in its tree
- All providers associated via aggregate with all providers in said
tree
and for each of those providers:
- The UUIDs of its aggregates
- The trait strings associated with the provider
Note that if the provider did not exist prior to this call, the above
reduces to just the specified provider as a root, with no aggregates or
traits.
:param context: The security context
:param uuid: UUID identifier for the resource provider to ensure exists
:param name: Optional name for the resource provider if the record
does not exist. If empty, the name is set to the UUID
value
:param parent_provider_uuid: Optional UUID of the immediate parent,
which must have been previously _ensured.
:raise ResourceProviderCreationFailed: If we expected to be creating
providers, but couldn't.
:raise: keystoneauth1.exceptions.ClientException if placement API
communication fails.
"""
# NOTE(efried): We currently have no code path where we need to set the
# parent_provider_uuid on a previously-parent-less provider - so we do
# NOT handle that scenario here.
# If we already have the root provider in the cache, and it's not
# stale, don't refresh it; and use the cache to determine the
# descendants to (soft) refresh.
# NOTE(efried): This assumes the compute service only cares about
# providers it "owns". If that ever changes, we'll need a way to find
# out about out-of-band changes here. Options that have been
# brainstormed at this time:
# - Make this condition more frequently True
# - Some kind of notification subscription so a separate thread is
# alerted when <thing we care about happens in placement>.
# - "Cascading generations" - i.e. a change to a leaf node percolates
# generation bump up the tree so that we bounce 409 the next time we
# try to update anything and have to refresh.
if (self._provider_tree.exists(uuid) and
not self._associations_stale(uuid)):
uuids_to_refresh = [
u for u in self._provider_tree.get_provider_uuids(uuid)
if self._associations_stale(u)]
else:
# We either don't have it locally or it's stale. Pull or create it.
created_rp = None
rps_to_refresh = self.get_providers_in_tree(context, uuid)
if not rps_to_refresh:
created_rp = self._create_resource_provider(
context, uuid, name or uuid,
parent_provider_uuid=parent_provider_uuid)
# If @safe_connect can't establish a connection to the
# placement service, like if placement isn't running or
# nova-compute is mis-configured for authentication, we'll get
# None back and need to treat it like we couldn't create the
# provider (because we couldn't).
if created_rp is None:
raise exception.ResourceProviderCreationFailed(
name=name or uuid)
# Don't add the created_rp to rps_to_refresh. Since we just
# created it, it has no aggregates or traits.
# But do mark it as having just been "refreshed".
self._association_refresh_time[uuid] = time.time()
self._provider_tree.populate_from_iterable(
rps_to_refresh or [created_rp])
uuids_to_refresh = [rp['uuid'] for rp in rps_to_refresh]
# At this point, the whole tree exists in the local cache.
for uuid_to_refresh in uuids_to_refresh:
self._refresh_associations(context, uuid_to_refresh, force=True)
return uuid
@safe_connect
def _delete_provider(self, rp_uuid, global_request_id=None):
    """Delete a resource provider from placement and from the local caches.

    A 404 from placement is treated like success (minus the log message):
    the provider is already gone, so only the local caches need cleaning.

    :param rp_uuid: UUID of the resource provider to delete.
    :param global_request_id: Optional global request ID to forward.
    :return: None. (Because of @safe_connect, None is also what comes back
             when one of the handled connection errors occurs.)
    :raise: ResourceProviderInUse if placement answers 409.
    :raise: ResourceProviderDeletionFailed on any other failure.
    """
    resp = self.delete('/resource_providers/%s' % rp_uuid,
                       global_request_id=global_request_id)
    deleted = bool(resp)
    if not deleted and resp.status_code != 404:
        failure = {
            'placement_req_id': get_placement_request_id(resp),
            'uuid': rp_uuid,
            'status_code': resp.status_code,
            'err_text': resp.text
        }
        LOG.error("[%(placement_req_id)s] Failed to delete resource provider "
                  "with UUID %(uuid)s from the placement API. Got "
                  "%(status_code)d: %(err_text)s.", failure)
        # On conflict, the caller may wish to delete allocations and
        # redrive. (Note that this is not the same as a
        # PlacementAPIConflict case.)
        if resp.status_code == 409:
            raise exception.ResourceProviderInUse()
        raise exception.ResourceProviderDeletionFailed(uuid=rp_uuid)

    # No need to warn/raise on 404: we merely tried to delete something
    # which doesn't actually exist.
    if deleted:
        LOG.info("Deleted resource provider %s", rp_uuid)
    # clean the caches
    try:
        self._provider_tree.remove(rp_uuid)
    except ValueError:
        # The provider was not in the cached tree; nothing to remove.
        pass
    self._association_refresh_time.pop(rp_uuid, None)
def _get_inventory(self, context, rp_uuid):
url = '/resource_providers/%s/inventories' % rp_uuid
result = self.get(url, global_request_id=context.global_id)
if not result:
# TODO(efried): Log.
return None
return result.json()
def _refresh_and_get_inventory(self, context, rp_uuid):
"""Helper method that retrieves the current inventory for the supplied
resource provider according to the placement API.
If the cached generation of the resource provider is not the same as
the generation returned from the placement API, we update the cached
generation and attempt to update inventory if any exists, otherwise
return empty inventories.
"""
curr = self._get_inventory(context, rp_uuid)
if curr is None:
return None
LOG.debug('Updating ProviderTree inventory for provider %s from '
'_refresh_and_get_inventory using data: %s', rp_uuid,
curr['inventories'])
self._provider_tree.update_inventory(
rp_uuid, curr['inventories'],
generation=curr['resource_provider_generation'])
return curr
def _refresh_associations(self, context, rp_uuid, force=False,
refresh_sharing=True):
"""Refresh inventories, aggregates, traits, and (optionally) aggregate-
associated sharing providers for the specified resource provider uuid.
Only refresh if there has been no refresh during the lifetime of
this process, CONF.compute.resource_provider_association_refresh
seconds have passed, or the force arg has been set to True.
:param context: The security context
:param rp_uuid: UUID of the resource provider to check for fresh
inventories, aggregates, and traits
:param force: If True, force the refresh
:param refresh_sharing: If True, fetch all the providers associated
by aggregate with the specified provider,
including their inventories, traits, and
aggregates (but not *their* sharing providers).
:raise: On various placement API errors, one of:
- ResourceProviderAggregateRetrievalFailed
- ResourceProviderTraitRetrievalFailed
- ResourceProviderRetrievalFailed
:raise: keystoneauth1.exceptions.ClientException if placement API
communication fails.
"""
if force or self._associations_stale(rp_uuid):
# Refresh inventories
msg = "Refreshing inventories for resource provider %s"
LOG.debug(msg, rp_uuid)
self._refresh_and_get_inventory(context, rp_uuid)
# Refresh aggregates
agg_info = self._get_provider_aggregates(context, rp_uuid)
# If @safe_connect makes the above return None, this will raise
# TypeError. Good.
aggs, generation = agg_info.aggregates, agg_info.generation
msg = ("Refreshing aggregate associations for resource provider "
"%s, aggregates: %s")
LOG.debug(msg, rp_uuid, ','.join(aggs or ['None']))
# NOTE(efried): This will blow up if called for a RP that doesn't
# exist in our _provider_tree.
self._provider_tree.update_aggregates(
rp_uuid, aggs, generation=generation)
# Refresh traits
trait_info = self.get_provider_traits(context, rp_uuid)
traits, generation = trait_info.traits, trait_info.generation
msg = ("Refreshing trait associations for resource provider %s, "
"traits: %s")
LOG.debug(msg, rp_uuid, ','.join(traits or ['None']))
# NOTE(efried): This will blow up if called for a RP that doesn't
# exist in our _provider_tree.
self._provider_tree.update_traits(
rp_uuid, traits, generation=generation)
if refresh_sharing:
# Refresh providers associated by aggregate
for rp in self._get_sharing_providers(context, aggs):
if not self._provider_tree.exists(rp['uuid']):
# NOTE(efried): Right now sharing providers are always
# treated as roots. This is deliberate. From the
# context of this compute's RP, it doesn't matter if a
# sharing RP is part of a tree.
self._provider_tree.new_root(
rp['name'], rp['uuid'],
generation=rp['generation'])
# Now we have to (populate or) refresh that provider's
# traits, aggregates, and inventories (but not *its*
# aggregate-associated providers). No need to override
# force=True for newly-added providers - the missing
# timestamp will always trigger them to refresh.
self._refresh_associations(context, rp['uuid'],
force=force,
refresh_sharing=False)
self._association_refresh_time[rp_uuid] = time.time()
def _associations_stale(self, uuid):
    """Report whether the cached associations for a provider need a refresh.

    The decision uses the timestamp recorded for ``uuid`` in
    ``self._association_refresh_time`` (a missing entry counts as 0) and
    the CONF.compute.resource_provider_association_refresh interval:

    * If the interval is zero (periodic refresh disabled) and a refresh
      has already been recorded for this uuid, the answer is always False.
    * Otherwise the associations are stale when more than ``interval``
      seconds have elapsed since the recorded refresh. A uuid with no
      recorded refresh goes through this same comparison, using 0 as its
      timestamp, so it still gets loaded the first time.

    :param uuid: UUID of the resource provider to check.
    :return: True if the associations should be refreshed, else False.
    """
    interval = CONF.compute.resource_provider_association_refresh
    last_refresh = self._association_refresh_time.get(uuid, 0)
    already_loaded = last_refresh != 0

    # A zero interval switches periodic refreshing off, but the initial
    # load must still be let through, hence the already_loaded condition.
    # TODO(efried): If refresh is disabled, we could avoid touching the
    # _association_refresh_time dict anywhere, but that would take some
    # nontrivial refactoring.
    if interval == 0 and already_loaded:
        return False

    elapsed = time.time() - last_refresh
    return elapsed > interval
def get_provider_tree_and_ensure_root(self, context, rp_uuid, name=None,
                                      parent_provider_uuid=None):
    """Ensure the given provider is cached, then hand back a tree copy.

    The returned ProviderTree represents all providers which are in the
    same tree or in the same aggregate as the specified provider, including
    their aggregates, traits, and inventories.

    If the specified provider does not exist, it is created with the
    specified UUID, name, and parent provider (which *must* already exist).

    :param context: The security context
    :param rp_uuid: UUID of the resource provider for which to populate the
                    tree. (This doesn't need to be the UUID of the root.)
    :param name: Optional name for the resource provider if the record
                 does not exist. If empty, the name is set to the UUID
                 value
    :param parent_provider_uuid: Optional UUID of the immediate parent,
                                 which must have been previously _ensured.
    :return: A new ProviderTree object (a deep copy of the local cache, so
             callers may modify it without touching the cache).
    """
    # TODO(efried): We would like to have the caller handle create-and/or-
    # cache-if-not-already, but the resource tracker is currently
    # structured to handle initialization and update in a single path. At
    # some point this should be refactored, and this method can *just*
    # return a deep copy of the local _provider_tree cache.

    # Step 1: (re)populate the local ProviderTree cache.
    ensure_kwargs = {
        'name': name,
        'parent_provider_uuid': parent_provider_uuid,
    }
    self._ensure_resource_provider(context, rp_uuid, **ensure_kwargs)

    # Step 2: give the caller an independent snapshot, never the cache.
    tree_snapshot = copy.deepcopy(self._provider_tree)
    return tree_snapshot
def set_inventory_for_provider(self, context, rp_uuid, inv_data):
    """Given the UUID of a provider, set the inventory records for the
    provider to the supplied dict of resources.

    The provider must exist - this method does not attempt to create it.

    On success the local provider tree cache is updated with the inventory
    and provider generation returned by placement.

    :param context: The security context
    :param rp_uuid: The UUID of the provider whose inventory is to be
                    updated.
    :param inv_data: Dict, keyed by resource class name, of inventory data
                     to set for the provider. Use None or the empty dict
                     to remove all inventory for the provider.
    :raises: InventoryInUse if inv_data indicates removal of inventory in a
             resource class which has active allocations for this provider.
    :raises: InvalidResourceClass if inv_data contains a resource class
             which cannot be created.
    :raises: ResourceProviderUpdateConflict if the provider's generation
             doesn't match the generation in the cache. Callers may choose
             to retrieve the provider and its associations afresh and
             redrive this operation.
    :raises: ResourceProviderUpdateFailed on any other placement API
             failure, including getting no response at all from the
             placement service.
    """
    # NOTE(efried): This is here because _ensure_resource_class already has
    # @safe_connect, so we don't want to decorate this whole method with it
    @safe_connect
    def do_put(url, payload):
        # NOTE(vdrok): in microversion 1.26 it is allowed to have inventory
        # records with reserved value equal to total
        return self.put(
            url, payload, global_request_id=context.global_id,
            version=ALLOW_RESERVED_EQUAL_TOTAL_INVENTORY_VERSION)

    # If not different from what we've got, short out
    if not self._provider_tree.has_inventory_changed(rp_uuid, inv_data):
        LOG.debug('Inventory has not changed for provider %s based '
                  'on inventory data: %s', rp_uuid, inv_data)
        return

    # Normalize None to an empty dict *before* anything iterates over it:
    # set(None) would raise TypeError even though None is a documented way
    # of asking for all inventory to be removed.
    inv_data = inv_data or {}

    # Ensure non-standard resource classes exist, creating them if needed.
    self._ensure_resource_classes(context, set(inv_data))

    url = '/resource_providers/%s/inventories' % rp_uuid
    generation = self._provider_tree.data(rp_uuid).generation
    payload = {
        'resource_provider_generation': generation,
        'inventories': inv_data,
    }
    resp = do_put(url, payload)

    # NOTE(review): @safe_connect can make the wrapped call return None
    # when placement cannot be reached. Surface that as the documented
    # failure exception rather than an AttributeError on resp.status_code.
    if resp is None:
        raise exception.ResourceProviderUpdateFailed(
            url=url, error='No response from the placement service')

    if resp.status_code == 200:
        LOG.debug('Updated inventory for provider %s with generation %s '
                  'in Placement from set_inventory_for_provider using '
                  'data: %s', rp_uuid, generation, inv_data)
        # Refresh the cache from what placement actually stored, including
        # the new provider generation.
        body = resp.json()
        self._provider_tree.update_inventory(
            rp_uuid, body['inventories'],
            generation=body['resource_provider_generation'])
        return

    # Some error occurred; log it
    msg = ("[%(placement_req_id)s] Failed to update inventory to "
           "[%(inv_data)s] for resource provider with UUID %(uuid)s.  Got "
           "%(status_code)d: %(err_text)s")
    args = {
        'placement_req_id': get_placement_request_id(resp),
        'uuid': rp_uuid,
        'inv_data': str(inv_data),
        'status_code': resp.status_code,
        'err_text': resp.text,
    }
    LOG.error(msg, args)

    if resp.status_code == 409:
        # If a conflict attempting to remove inventory in a resource class
        # with active allocations, raise InventoryInUse
        err = resp.json()['errors'][0]
        # TODO(efried): If there's ever a lib exporting symbols for error
        # codes, use it.
        if err['code'] == 'placement.inventory.inuse':
            # The error detail includes the resource class and provider.
            raise exception.InventoryInUse(err['detail'])
        # Other conflicts are generation mismatch: raise conflict exception
        raise exception.ResourceProviderUpdateConflict(
            uuid=rp_uuid, generation=generation, error=resp.text)

    # Otherwise, raise generic exception
    raise exception.ResourceProviderUpdateFailed(url=url, error=resp.text)
@safe_connect
def _ensure_traits(self, context, traits):
    """Make sure all specified traits exist in the placement service.

    Existing traits are looked up with a single GET; each requested trait
    missing from that response is then created with its own PUT.

    :param context: The security context
    :param traits: Iterable of trait strings to ensure exist. It is
                   materialized once up front, so one-shot iterables such
                   as generators are handled correctly. None or an empty
                   iterable is a no-op.
    :raises: TraitCreationFailed if traits contains a trait that did not
             exist in placement, and couldn't be created.  When this
             exception is raised, it is possible that *some* of the
             requested traits were created.
    :raises: TraitRetrievalFailed if the initial query of existing traits
             was unsuccessful.  In this scenario, it is guaranteed that
             no traits were created.
    """
    # The names are walked twice below (once to build the query string and
    # once to work out what is missing). Without this copy a generator
    # would be exhausted by the first pass and nothing would be created.
    traits = list(traits or ())
    if not traits:
        return

    # Query for all the requested traits.  Whichever ones we *don't* get
    # back, we need to create.
    # NOTE(efried): We don't attempt to filter based on our local idea of
    # standard traits, which may not be in sync with what the placement
    # service knows.  If the caller tries to ensure a nonexistent
    # "standard" trait, they deserve the TraitCreationFailed exception
    # they'll get.
    get_resp = self.get('/traits?name=in:' + ','.join(traits),
                        version='1.6',
                        global_request_id=context.global_id)

    if get_resp.status_code != 200:
        # The initial GET failed
        msg = ("[%(placement_req_id)s] Failed to retrieve the list of "
               "traits. Got %(status_code)d: %(err_text)s")
        args = {
            'placement_req_id': get_placement_request_id(get_resp),
            'status_code': get_resp.status_code,
            'err_text': get_resp.text,
        }
        LOG.error(msg, args)
        raise exception.TraitRetrievalFailed(error=get_resp.text)

    traits_to_create = set(traits) - set(get_resp.json()['traits'])
    # Might be neat to have a batch create.  But creating multiple
    # traits will generally happen once, at initial startup, if at all.
    for trait in traits_to_create:
        put_resp = self.put('/traits/' + trait, None, version='1.6',
                            global_request_id=context.global_id)
        # Relies on the response object being falsey for an unsuccessful
        # request (presumably a requests.Response, whose truthiness
        # mirrors ``ok`` - TODO confirm).
        if not put_resp:
            raise exception.TraitCreationFailed(name=trait,
                                                error=put_resp.text)