/
sync_worker.go
1327 lines (1178 loc) · 47.1 KB
/
sync_worker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package cvo
import (
"context"
"errors"
"fmt"
"math/rand"
"reflect"
"sort"
"strings"
"sync"
"time"
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/time/rate"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/util/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/cluster-version-operator/lib/capability"
"github.com/openshift/cluster-version-operator/pkg/payload"
"github.com/openshift/cluster-version-operator/pkg/payload/precondition"
"github.com/openshift/library-go/pkg/manifest"
)
// ConfigSyncWorker abstracts how the image is synchronized to the server. Introduced for testing.
type ConfigSyncWorker interface {
	// Start runs the sync loop until ctx is cancelled; maxWorkers is passed
	// through to the payload apply step to bound parallelism.
	Start(ctx context.Context, maxWorkers int, cvoOptrName string, lister configlistersv1.ClusterVersionLister)
	// Update provides the desired target state to the worker and returns the
	// most recently recorded sync status.
	Update(ctx context.Context, generation int64, desired configv1.Update, config *configv1.ClusterVersion, state payload.State, cvoOptrName string) *SyncWorkerStatus
	// StatusCh exposes a buffered channel of status updates; events can be
	// dropped, so use it as a trigger to read the latest status.
	StatusCh() <-chan SyncWorkerStatus
	// Inform the sync worker about activity for a managed resource.
	NotifyAboutManagedResourceActivity(obj interface{}, msg string)
}
// PayloadInfo returns details about the payload when it was retrieved.
type PayloadInfo struct {
	// Directory is the path on disk where the payload is rooted.
	Directory string
	// Local is true if the payload was the payload associated with the operator image.
	Local bool
	// Verified is true if the payload was explicitly verified against the root of trust.
	// If unset and VerificationError is nil, the payload should be considered to not have
	// had verification attempted.
	Verified bool
	// VerificationError is any error returned by attempting to verify the payload.
	VerificationError error
}
// PayloadRetriever abstracts how a desired version is extracted to disk. Introduced for testing.
type PayloadRetriever interface {
	// RetrievePayload attempts to retrieve the desired payload, returning info about the payload
	// or an error. Retrieval may be slow (image pull); callers should pass a
	// cancellable ctx.
	RetrievePayload(ctx context.Context, desired configv1.Update) (PayloadInfo, error)
}
// StatusReporter abstracts how status is reported by the worker run method. Introduced for testing.
type StatusReporter interface {
	// Report records the status of applying the payload.
	Report(status SyncWorkerStatus)
	// ReportPayload records the status of retrieving/loading the payload.
	ReportPayload(payLoadStatus LoadPayloadStatus)
	// ValidPayloadStatus reports whether the previously recorded payload load
	// status still applies to the given update.
	ValidPayloadStatus(update configv1.Update) bool
}
// SyncWork represents the work that should be done in a sync iteration.
type SyncWork struct {
	// Generation is the ClusterVersion generation this work was derived from.
	Generation int64
	// Desired is the update (version/image/force) the worker should converge to.
	Desired configv1.Update
	// Overrides are the component overrides copied from the ClusterVersion spec.
	Overrides []configv1.ComponentOverride
	// State is the payload phase (initializing, updating, or reconciling).
	State payload.State
	// Completed is the number of times in a row we have synced this payload
	Completed int
	// Attempt is incremented each time we attempt to sync a payload and reset
	// when we change Generation/Desired or successfully synchronize.
	Attempt int
	// Capabilities holds enabled and implicitly enabled cluster capabilities.
	Capabilities capability.ClusterCapabilities
}
// Empty reports whether this work carries no release image and therefore
// represents "nothing to do".
func (w SyncWork) Empty() bool {
	return w.Desired.Image == ""
}
// LoadPayloadStatus is the outcome of the most recent attempt to retrieve,
// load, and verify a payload.
type LoadPayloadStatus struct {
	// Step names the load phase this status describes, e.g. "RetrievePayload",
	// "LoadPayload", "VerifyPayloadVersion", "PreconditionChecks", "PayloadLoaded".
	Step string
	// Message is a human-readable description of the step outcome.
	Message string
	// Failure is the error encountered during the step, or nil on success.
	Failure error
	// Update is the target update this load attempt was for.
	Update configv1.Update
	// Verified is true if the payload was verified against the root of trust.
	Verified bool
	// LastTransitionTime records when this status was produced.
	LastTransitionTime time.Time
}
// CapabilityStatus couples the cluster's capabilities status with the
// capabilities that were implicitly enabled to keep managed resources running.
type CapabilityStatus struct {
	// Status is the enabled/known capabilities as reported on ClusterVersion.
	Status configv1.ClusterVersionCapabilitiesStatus
	// ImplicitlyEnabledCaps are capabilities kept enabled because resources
	// they own were enabled by a previously applied payload.
	ImplicitlyEnabledCaps []configv1.ClusterVersionCapability
}
// SyncWorkerStatus is the status of the sync worker at a given time.
type SyncWorkerStatus struct {
	// Generation is the ClusterVersion generation this status corresponds to.
	Generation int64
	// Failure is the most recent sync error, or nil.
	Failure error
	// Done and Total track progress through the payload (Done of Total applied).
	Done int
	Total int
	// Completed counts consecutive successful syncs of this payload.
	Completed int
	// Reconciling is true once the payload state is reconciling (applied
	// successfully; the worker is only correcting drift).
	Reconciling bool
	// Initial presumably marks the initial (installation) sync — it is not
	// set anywhere in this chunk; confirm against the rest of the file.
	Initial bool
	// VersionHash identifies the loaded payload content — assumed to mirror
	// the payload's manifest hash; confirm where it is assigned.
	VersionHash string
	// Architecture is the CPU architecture of the loaded payload.
	Architecture string
	// LastProgress is the last time forward progress was observed (see
	// statusWrapper.Report).
	LastProgress time.Time
	// Actual is the release currently being applied.
	Actual configv1.Release
	// indicates if actual (current) release was verified
	Verified bool
	// loadPayloadStatus records the outcome of the most recent payload load;
	// managed separately from apply status (see updateApplyStatus).
	loadPayloadStatus LoadPayloadStatus
	// CapabilitiesStatus tracks enabled and implicitly enabled capabilities.
	CapabilitiesStatus CapabilityStatus
}
// DeepCopy copies the worker status.
// The value receiver already made w a shallow copy of the caller's struct, so
// returning its address yields an independent top-level struct. Note that
// reference-typed fields (errors, slices inside CapabilitiesStatus) are still
// shared with the original.
func (w SyncWorkerStatus) DeepCopy() *SyncWorkerStatus {
	return &w
}
// SyncWorker retrieves and applies the desired image, tracking the status for the parent to
// monitor. The worker accepts a desired state via Update() and works to keep that state in
// sync. Once a particular image version is synced, it will be updated no more often than
// minimumReconcileInterval.
//
// State transitions:
//
//	Initial: wait for valid Update(), report empty status
//	Update() -> Sync
//	Sync: attempt to invoke the apply() method
//	apply() returns an error -> Error
//	apply() returns nil -> Reconciling
//	Reconciling: invoke apply() no more often than reconcileInterval
//	Update() with different values -> Sync
//	apply() returns an error -> Error
//	apply() returns nil -> Reconciling
//	Error: backoff until we are attempting every reconcileInterval
//	apply() returns an error -> Error
//	apply() returns nil -> Reconciling
type SyncWorker struct {
	// backoff paces retries after failed sync attempts.
	backoff wait.Backoff
	// retriever extracts the desired payload to disk.
	retriever PayloadRetriever
	// builder applies payload manifests to the cluster.
	builder payload.ResourceBuilder
	// preconditions are checks run before a non-local payload is accepted.
	preconditions precondition.List
	// eventRecorder emits Kubernetes events for payload lifecycle steps.
	eventRecorder record.EventRecorder
	// minimumReconcileInterval is the minimum time between reconcile attempts, and is
	// used to define the maximum backoff interval when apply() returns an error.
	minimumReconcileInterval time.Duration
	// coordination between the sync loop and external callers
	notify chan string
	report chan SyncWorkerStatus
	// lock guards changes to these fields
	lock     sync.Mutex
	work     *SyncWork
	cancelFn func()
	status   SyncWorkerStatus
	// updated by the run method only
	payload *payload.Update
	// exclude is an identifier used to determine which
	// manifests should be excluded based on an annotation
	// of the form exclude.release.openshift.io/<identifier>=true
	exclude string
	// includeTechPreview is set to true when the CVO should create resources with the `release.openshift.io/feature-gate=TechPreviewNoUpgrade`
	includeTechPreview bool
	// clusterProfile selects which profile-annotated manifests are applied.
	clusterProfile string
}
// NewSyncWorker initializes a ConfigSyncWorker that will retrieve payloads to disk, apply them via builder
// to a server, and obey limits about how often to reconcile or retry on errors.
func NewSyncWorker(retriever PayloadRetriever, builder payload.ResourceBuilder, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, includeTechPreview bool, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker {
	worker := &SyncWorker{
		retriever:                retriever,
		builder:                  builder,
		backoff:                  backoff,
		eventRecorder:            eventRecorder,
		minimumReconcileInterval: reconcileInterval,
		exclude:                  exclude,
		includeTechPreview:       includeTechPreview,
		clusterProfile:           clusterProfile,
	}
	// notify carries at most one pending wake-up for the sync loop.
	worker.notify = make(chan string, 1)
	// report is a large buffered channel to improve local testing - most consumers should invoke
	// Status() or use the result of calling Update() instead because the channel can be out of date
	// if the reader is not fast enough.
	worker.report = make(chan SyncWorkerStatus, 500)
	return worker
}
// NewSyncWorkerWithPreconditions initializes a ConfigSyncWorker that will retrieve payloads to disk, apply them via builder
// to a server, and obey limits about how often to reconcile or retry on errors.
// It allows providing preconditions for loading payload.
func NewSyncWorkerWithPreconditions(retriever PayloadRetriever, builder payload.ResourceBuilder, preconditions precondition.List, reconcileInterval time.Duration, backoff wait.Backoff, exclude string, includeTechPreview bool, eventRecorder record.EventRecorder, clusterProfile string) *SyncWorker {
	// Build the base worker, then attach the payload-load preconditions.
	w := NewSyncWorker(retriever, builder, reconcileInterval, backoff, exclude, includeTechPreview, eventRecorder, clusterProfile)
	w.preconditions = preconditions
	return w
}
// StatusCh returns a channel that reports status from the worker. The channel is buffered and events
// can be lost, so this is best used as a trigger to read the latest status.
func (w *SyncWorker) StatusCh() <-chan SyncWorkerStatus {
	return w.report
}
// NotifyAboutManagedResourceActivity informs the sync worker about activity
// for a managed resource by nudging its notify channel. The channel holds at
// most one pending wake-up, so when one is already queued the new message is
// deliberately dropped — the loop will wake either way. obj is currently
// unused and kept only for interface compatibility.
func (w *SyncWorker) NotifyAboutManagedResourceActivity(obj interface{}, message string) {
	select {
	case w.notify <- message:
		klog.V(2).Infof("Notify the sync worker: %s", message)
	default:
		// Fix: the original used klog.V(2).Info with a printf-style format
		// string (the literal "%s" was printed, never the message — go vet
		// flags this) and carried a garbled, duplicated phrase. Use Infof
		// with a single coherent message.
		klog.V(2).Infof("The sync worker already has a pending notification, so no need to inform about: %s", message)
	}
}
// syncPayload retrieves, loads, and verifies the specified payload, aka sync's the payload, whenever there is no current
// payload or the current payload differs from the desired payload. Whenever a payload is sync'ed a check is made for
// implicitly enabled capabilities. For the purposes of the check made here, implicitly enabled capabilities are
// capabilities which are disabled in the desired payload but must remain enabled since the capability defines one or
// more resources which are enabled in the current payload. All such capabilities are returned along with any previously
// existing implicitly enabled capabilities. If no new implicitly enabled capabilities are found, just the previously
// existing implicitly enabled capabilities are returned.
func (w *SyncWorker) syncPayload(ctx context.Context, work *SyncWork,
	reporter StatusReporter) ([]configv1.ClusterVersionCapability, error) {
	implicitlyEnabledCaps := work.Capabilities.ImplicitlyEnabledCapabilities
	desired := configv1.Update{
		Version: work.Desired.Version,
		Image:   work.Desired.Image,
		Force:   work.Desired.Force,
	}
	klog.V(2).Infof("syncPayload: %s (force=%t)", versionStringFromUpdate(desired), work.Desired.Force)
	// cache the payload until the release image changes
	validPayload := w.payload
	if validPayload != nil && validPayload.Release.Image == desired.Image {
		// reset payload status to currently loaded payload if it no longer applies to desired target
		if !reporter.ValidPayloadStatus(desired) {
			klog.V(2).Info("Resetting payload status to currently loaded payload.")
			reporter.ReportPayload(LoadPayloadStatus{
				Failure:            nil,
				Step:               "PayloadLoaded",
				Message:            fmt.Sprintf("Payload loaded version=%q image=%q", desired.Version, desired.Image),
				Verified:           w.status.Verified,
				Update:             desired,
				LastTransitionTime: time.Now(),
			})
		}
		// possibly complain here if Version, etc. diverges from the payload content
		return implicitlyEnabledCaps, nil
	} else if validPayload == nil || !equalUpdate(configv1.Update{Image: validPayload.Release.Image}, configv1.Update{Image: desired.Image}) {
		// The desired image differs from the cached payload (or none is
		// cached): retrieve, verify, and load it from scratch, emitting an
		// event and a load-status report at each step.
		cvoObjectRef := &corev1.ObjectReference{APIVersion: "config.openshift.io/v1", Kind: "ClusterVersion", Name: "version", Namespace: "openshift-cluster-version"}
		msg := fmt.Sprintf("Retrieving and verifying payload version=%q image=%q", desired.Version, desired.Image)
		w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeNormal, "RetrievePayload", msg)
		reporter.ReportPayload(LoadPayloadStatus{
			Step:               "RetrievePayload",
			Message:            msg,
			Update:             desired,
			LastTransitionTime: time.Now(),
		})
		info, err := w.retriever.RetrievePayload(ctx, work.Desired)
		if err != nil {
			msg := fmt.Sprintf("Retrieving payload failed version=%q image=%q failure=%v", desired.Version, desired.Image, err)
			w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeWarning, "RetrievePayloadFailed", msg)
			reporter.ReportPayload(LoadPayloadStatus{
				Failure:            err,
				Step:               "RetrievePayload",
				Message:            msg,
				Update:             desired,
				LastTransitionTime: time.Now(),
			})
			return nil, err
		}
		if info.VerificationError != nil {
			// Verification failed but retrieval succeeded (e.g. a forced
			// update): surface the full, de-duplicated error chain as a
			// warning event without failing the sync.
			msg := ""
			for err := info.VerificationError; err != nil; err = errors.Unwrap(err) {
				details := err.Error()
				if !strings.Contains(msg, details) {
					// library-go/pkg/verify wraps the details, but does not include them
					// in the top-level error string. If we have an error like that,
					// include the details here.
					msg = fmt.Sprintf("%s\n%s", msg, details)
				}
			}
			w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeWarning, "RetrievePayload", msg)
		}
		w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeNormal, "LoadPayload", "Loading payload version=%q image=%q", desired.Version, desired.Image)
		payloadUpdate, err := payload.LoadUpdate(info.Directory, desired.Image, w.exclude, w.includeTechPreview, w.clusterProfile,
			capability.GetKnownCapabilities())
		if err != nil {
			msg := fmt.Sprintf("Loading payload failed version=%q image=%q failure=%v", desired.Version, desired.Image, err)
			w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeWarning, "LoadPayloadFailed", msg)
			reporter.ReportPayload(LoadPayloadStatus{
				Failure:            err,
				Step:               "LoadPayload",
				Message:            msg,
				Verified:           info.Verified,
				Update:             desired,
				LastTransitionTime: time.Now(),
			})
			return nil, err
		}
		payloadUpdate.VerifiedImage = info.Verified
		payloadUpdate.LoadedAt = time.Now()
		// Adopt the payload's version when none was requested; otherwise the
		// payload's version must match the requested one exactly.
		if work.Desired.Version == "" {
			work.Desired.Version = payloadUpdate.Release.Version
			desired.Version = payloadUpdate.Release.Version
		} else if payloadUpdate.Release.Version != work.Desired.Version {
			err = fmt.Errorf("release image version %s does not match the expected upstream version %s", payloadUpdate.Release.Version, work.Desired.Version)
			msg := fmt.Sprintf("Verifying payload failed version=%q image=%q failure=%v", work.Desired.Version, work.Desired.Image, err)
			w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeWarning, "VerifyPayloadVersionFailed", msg)
			reporter.ReportPayload(LoadPayloadStatus{
				Failure:            err,
				Step:               "VerifyPayloadVersion",
				Message:            msg,
				Verified:           info.Verified,
				Update:             desired,
				LastTransitionTime: time.Now(),
			})
			return nil, err
		}
		// need to make sure the payload is only set when the preconditions have been successful
		if len(w.preconditions) == 0 {
			klog.V(2).Info("No preconditions configured.")
		} else if info.Local {
			klog.V(2).Info("Skipping preconditions for a local operator image payload.")
		} else {
			if block, err := precondition.Summarize(w.preconditions.RunAll(ctx, precondition.ReleaseContext{
				DesiredVersion: payloadUpdate.Release.Version,
			}), work.Desired.Force); err != nil {
				klog.V(2).Infof("Precondition error (force %t, block %t): %v", work.Desired.Force, block, err)
				if block {
					// A blocking precondition failure rejects the payload.
					msg := fmt.Sprintf("Preconditions failed for payload loaded version=%q image=%q: %v", desired.Version, desired.Image, err)
					w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeWarning, "PreconditionBlock", msg)
					reporter.ReportPayload(LoadPayloadStatus{
						Failure:            err,
						Step:               "PreconditionChecks",
						Message:            msg,
						Verified:           info.Verified,
						Update:             desired,
						LastTransitionTime: time.Now(),
					})
					return nil, err
				} else {
					// Non-blocking (e.g. forced) failures only warn.
					w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeWarning, "PreconditionWarn", "precondition warning for payload loaded version=%q image=%q: %v", desired.Version, desired.Image, err)
				}
			}
			w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeNormal, "PreconditionsPassed", "preconditions passed for payload loaded version=%q image=%q", desired.Version, desired.Image)
		}
		// Compute capabilities that must stay implicitly enabled because the
		// previously loaded payload enabled resources they own.
		if w.payload != nil {
			implicitlyEnabledCaps = payload.GetImplicitlyEnabledCapabilities(payloadUpdate.Manifests, w.payload.Manifests,
				work.Capabilities)
		}
		w.payload = payloadUpdate
		msg = fmt.Sprintf("Payload loaded version=%q image=%q architecture=%q", desired.Version, desired.Image,
			payloadUpdate.Architecture)
		w.eventRecorder.Eventf(cvoObjectRef, corev1.EventTypeNormal, "PayloadLoaded", msg)
		reporter.ReportPayload(LoadPayloadStatus{
			Failure:            nil,
			Step:               "PayloadLoaded",
			Message:            msg,
			Verified:           info.Verified,
			Update:             desired,
			LastTransitionTime: time.Now(),
		})
		klog.V(2).Infof("Payload loaded from %s with hash %s, architecture %s", desired.Image, payloadUpdate.ManifestHash,
			payloadUpdate.Architecture)
	}
	return implicitlyEnabledCaps, nil
}
// loadUpdatedPayload retrieves the image. If successfully retrieved it updates payload otherwise it returns an error.
func (w *SyncWorker) loadUpdatedPayload(ctx context.Context, work *SyncWork,
	cvoOptrName string) ([]configv1.ClusterVersionCapability, error) {
	// The wrapper suppresses status reports that would rewind behind a
	// previously reported failure from later in the sync process.
	wrapped := &statusWrapper{w: w, previousStatus: w.status.DeepCopy()}
	caps, err := w.syncPayload(ctx, work, wrapped)
	if err != nil {
		klog.V(2).Infof("loadUpdatedPayload syncPayload err=%v", err)
		return nil, err
	}
	return caps, nil
}
// Update instructs the sync worker to start synchronizing the desired update. The reconciling boolean is
// ignored unless this is the first time that Update has been called. The returned status represents either
// the initial state or whatever the last recorded status was.
// TODO: in the future it may be desirable for changes that alter desired to wait briefly before returning,
// giving the sync loop the opportunity to observe our change and begin working towards it.
func (w *SyncWorker) Update(ctx context.Context, generation int64, desired configv1.Update, config *configv1.ClusterVersion,
	state payload.State, cvoOptrName string) *SyncWorkerStatus {
	w.lock.Lock()
	defer w.lock.Unlock()
	work := &SyncWork{
		Generation: generation,
		Desired:    desired,
		Overrides:  config.Spec.Overrides,
	}
	var priorCaps map[configv1.ClusterVersionCapability]struct{}
	// The sync worker’s generation should always be latest with every change.
	// If this is the first time through initialize priorCaps to the last known value of enabled capabilities.
	if w.work != nil {
		w.work.Generation = generation
		priorCaps = w.work.Capabilities.EnabledCapabilities
	} else {
		klog.V(2).Info("Initializing prior known value of enabled capabilities from ClusterVersion status.")
		priorCaps = capability.GetCapabilitiesAsMap(config.Status.Capabilities.EnabledCapabilities)
	}
	if work.Empty() {
		klog.V(2).Info("Update work has no release image; ignoring requested change")
		return w.status.DeepCopy()
	}
	work.Capabilities = capability.SetCapabilities(config, priorCaps)
	versionEqual, overridesEqual, capabilitiesEqual :=
		equalSyncWork(w.work, work, fmt.Sprintf("considering cluster version generation %d", generation))
	// needs to be set here since changes in implicitly enabled capabilities are not considered a "capabilities change"
	w.status.CapabilitiesStatus.ImplicitlyEnabledCaps = work.Capabilities.ImplicitlyEnabledCapabilities
	if versionEqual && overridesEqual && capabilitiesEqual {
		klog.V(2).Info("Update work is equal to current target; no change required")
		if !equalUpdate(w.work.Desired, w.status.loadPayloadStatus.Update) {
			// loadUpdatedPayload can block on payload retrieval, so drop the
			// lock around the call and re-acquire afterwards (the deferred
			// Unlock requires the lock to be held on return).
			w.lock.Unlock()
			// this will only reset payload status to currently loaded payload
			_, err := w.loadUpdatedPayload(ctx, w.work, cvoOptrName)
			w.lock.Lock()
			if err != nil {
				klog.Warningf("Error when attempting to reset payload status to currently loaded payload: %v.", err)
			}
		}
		return w.status.DeepCopy()
	}
	// initialize the reconciliation flag and the status the first time
	// update is invoked
	var oldDesired *configv1.Update
	if w.work == nil {
		work.State = state
		w.status.Generation = generation
		w.status.Reconciling = state.Reconciling()
		w.status.Actual = configv1.Release{
			Version: work.Desired.Version,
			Image:   work.Desired.Image,
		}
	} else {
		oldDesired = &w.work.Desired
	}
	// since oldDesired is not nil this is not the first time update is invoked and therefore w.work is not nil
	if !versionEqual && oldDesired != nil && state == payload.InitializingPayload {
		klog.Warningf("Ignoring detected version change from %v to %v during payload initialization", *oldDesired, work.Desired)
		w.work.Desired = *oldDesired
		if overridesEqual && capabilitiesEqual {
			return w.status.DeepCopy()
		}
	}
	// Drop the lock while retrieving/loading the payload (potentially slow);
	// re-acquire before touching worker state again.
	w.lock.Unlock()
	implicit, err := w.loadUpdatedPayload(ctx, work, cvoOptrName)
	w.lock.Lock()
	if err != nil {
		// save override and capability changes if not first time through
		if w.work != nil {
			w.work.Overrides = config.Spec.Overrides
			w.work.Capabilities = work.Capabilities
			w.status.CapabilitiesStatus.Status = capability.GetCapabilitiesStatus(w.work.Capabilities)
		}
		return w.status.DeepCopy()
	}
	if !versionEqual && oldDesired == nil {
		klog.Infof("Propagating initial target version %v to sync worker loop in state %s.", desired, state)
	}
	// update work to include desired version now that it has been successfully loaded
	w.work = work
	// Update capabilities settings and status to include any capabilities that were implicitly enabled due
	// to previously managed resources.
	w.work.Capabilities = capability.SetFromImplicitlyEnabledCapabilities(implicit, w.work.Capabilities)
	w.status.CapabilitiesStatus.ImplicitlyEnabledCaps = w.work.Capabilities.ImplicitlyEnabledCapabilities
	w.status.CapabilitiesStatus.Status = capability.GetCapabilitiesStatus(w.work.Capabilities)
	// Update syncWorker status with architecture of newly loaded payload.
	w.status.Architecture = w.payload.Architecture
	// notify the sync loop that we changed config
	if w.cancelFn != nil {
		klog.V(2).Info("Cancel the sync worker's current loop")
		w.cancelFn()
		w.cancelFn = nil
	}
	msg := "new work is available"
	select {
	case w.notify <- msg:
		klog.V(2).Info("Notify the sync worker that new work is available")
	default:
		klog.V(2).Info("The sync worker has already been notified that new work is available")
	}
	return w.status.DeepCopy()
}
// Start periodically invokes run, detecting whether content has changed.
// It is edge-triggered when Update() is invoked and level-driven after the
// apply() has succeeded for a given input (we are said to be "reconciling").
func (w *SyncWorker) Start(ctx context.Context, maxWorkers int, cvoOptrName string, lister configlistersv1.ClusterVersionLister) {
	klog.V(2).Infof("Start: starting sync worker")
	// work is the loop's private copy of the target state; calculateNext
	// refreshes it from w.work on each wake-up.
	work := &SyncWork{}
	wait.Until(func() {
		consecutiveErrors := 0
		errorInterval := w.minimumReconcileInterval / 16
		var next <-chan time.Time
		for {
			waitingToReconcile := work.State == payload.ReconcilingPayload
			// Wake up on cancellation, on the reconcile/backoff timer, or on
			// an external notification (Update or managed-resource activity).
			select {
			case <-ctx.Done():
				klog.V(2).Infof("Stopped worker")
				return
			case <-next:
				waitingToReconcile = false
				klog.V(2).Infof("Wait finished")
			case msg := <-w.notify:
				klog.V(2).Info(msg)
			}
			// determine whether we need to do work
			changed := w.calculateNext(work)
			if !changed && waitingToReconcile {
				klog.V(2).Infof("No change, waiting")
				continue
			}
			// until Update() has been called at least once, we do nothing
			if work.Empty() {
				next = time.After(w.minimumReconcileInterval)
				klog.V(2).Infof("No work, waiting")
				continue
			}
			// actually apply the image, allowing for calls to be cancelled
			err := func() error {
				var syncTimeout time.Duration
				switch work.State {
				case payload.InitializingPayload:
					// during initialization we want to show what operators are being
					// created, so time out syncs more often to show a snapshot of progress
					// TODO: allow status outside of sync
					syncTimeout = w.minimumReconcileInterval
				case payload.UpdatingPayload:
					// during updates we want to flag failures on any resources that -
					// for cluster operators that are not reporting failing the error
					// message will point users to which operator is upgrading
					syncTimeout = w.minimumReconcileInterval * 2
				default:
					// TODO: make reconciling run in parallel, processing every resource
					// once and accumulating errors, then reporting a summary of how
					// much drift we found, and then we can turn down the timeout
					syncTimeout = w.minimumReconcileInterval * 2
				}
				ctx, cancelFn := context.WithTimeout(ctx, syncTimeout)
				// Publish cancelFn so Update() can interrupt an in-flight sync
				// when the target changes.
				w.lock.Lock()
				w.cancelFn = cancelFn
				w.lock.Unlock()
				defer cancelFn()
				// reporter hides status updates that occur earlier than the previous failure,
				// so that we don't fail, then immediately start reporting an earlier status
				reporter := &statusWrapper{w: w, previousStatus: w.Status()}
				klog.V(2).Infof("Previous sync status: %#v", reporter.previousStatus)
				return w.apply(ctx, work, maxWorkers, reporter)
			}()
			if err != nil {
				// backoff wait
				// TODO: replace with wait.Backoff when 1.13 client-go is available
				consecutiveErrors++
				interval := w.minimumReconcileInterval
				if consecutiveErrors < 4 {
					// exponential backoff: errorInterval doubled per consecutive
					// error; from the 4th error on, fall back to the full
					// minimumReconcileInterval.
					interval = errorInterval
					for i := 0; i < consecutiveErrors; i++ {
						interval *= 2
					}
				}
				next = time.After(wait.Jitter(interval, 0.2))
				work.Completed = 0
				work.Attempt++
				utilruntime.HandleError(fmt.Errorf("unable to synchronize image (waiting %s): %v", interval, err))
				continue
			}
			if work.State != payload.ReconcilingPayload {
				klog.V(2).Infof("Sync succeeded, transitioning from %s to %s", work.State, payload.ReconcilingPayload)
			}
			work.Completed++
			work.Attempt = 0
			work.State = payload.ReconcilingPayload
			next = time.After(w.minimumReconcileInterval)
		}
	}, 10*time.Millisecond, ctx.Done())
	klog.V(2).Infof("Worker shut down")
}
// statusWrapper prevents a newer status update from overwriting a previous
// failure from later in the sync process.
type statusWrapper struct {
	// w is the worker whose status is updated when a report arrives.
	w *SyncWorker
	// previousStatus is the snapshot taken when the wrapper was created; it is
	// used to decide whether an incoming report represents forward progress.
	previousStatus *SyncWorkerStatus
}
// ValidPayloadStatus reports whether the previously recorded payload load
// status still applies to the given update, i.e. the recorded image matches
// the update's image byte-for-byte or by digest.
func (w *statusWrapper) ValidPayloadStatus(update configv1.Update) bool {
	return equalDigest(w.previousStatus.loadPayloadStatus.Update.Image, update.Image)
}
// ReportPayload reports payload load status.
func (w *statusWrapper) ReportPayload(payloadStatus LoadPayloadStatus) {
	// NOTE(review): this mutates the shared previousStatus snapshot in place,
	// so later ValidPayloadStatus/Report calls on this wrapper observe the new
	// load status — confirm that is intended rather than copying first.
	status := w.previousStatus
	status.loadPayloadStatus = payloadStatus
	w.w.updateLoadStatus(*status)
}
// Report reports payload application status. It does not overwrite payload load status and capabilities status
// since payload application does not update these statuses they could therefore be out-of-date.
func (w *statusWrapper) Report(status SyncWorkerStatus) {
	p := w.previousStatus
	// Fraction of manifests applied in the incoming and previous reports.
	var fractionComplete float32
	if status.Total > 0 {
		fractionComplete = float32(status.Done) / float32(status.Total)
	}
	var previousFractionComplete float32
	if p.Total > 0 {
		previousFractionComplete = float32(p.Done) / float32(p.Total)
	}
	// Drop a success report that is less complete than a prior failure for the
	// same image — it originated earlier in the sync loop and would mask the
	// failure.
	if p.Failure != nil && status.Failure == nil {
		if p.Actual.Image == status.Actual.Image {
			if fractionComplete < previousFractionComplete {
				klog.V(2).Infof("Dropping status report from earlier in sync loop")
				return
			}
		}
	}
	// Forward progress: more manifests done, another completed sync, or a
	// clean switch to a new image.
	if fractionComplete > previousFractionComplete || status.Completed > p.Completed || (status.Failure == nil && status.Actual.Image != p.Actual.Image) {
		status.LastProgress = time.Now()
	}
	if status.Generation == 0 {
		// Preserve the last known generation when the report carries none.
		status.Generation = p.Generation
	} else if status.Generation < p.Generation {
		klog.Warningf("Received a Generation(%d) lower than previously known Generation(%d), this is most probably an internal error", status.Generation, p.Generation)
	}
	w.w.updateApplyStatus(status)
}
// calculateNext updates the passed work object with the desired next state and
// returns true if any changes were made. The reconciling flag is set the first
// time work transitions from empty to not empty (as a result of someone invoking
// Update).
//
// NOTE(review): w.work is dereferenced unconditionally in the Empty() branch
// and for Generation at the bottom, while the copy block guards with
// `w.work != nil`. This assumes Update() has populated w.work before the sync
// loop observes a notification — confirm that invariant holds for
// notifications originating from NotifyAboutManagedResourceActivity.
func (w *SyncWorker) calculateNext(work *SyncWork) bool {
	w.lock.Lock()
	defer w.lock.Unlock()
	sameVersion, sameOverrides, sameCapabilities := equalSyncWork(work, w.work, "calculating next work")
	changed := !sameVersion || !sameOverrides || !sameCapabilities
	// if this is the first time through the loop, initialize reconciling to
	// the state Update() calculated (to allow us to start in reconciling)
	if work.Empty() {
		work.State = w.work.State
		work.Attempt = 0
	} else if changed && work.State != payload.InitializingPayload {
		klog.V(2).Infof("Work changed, transitioning from %s to %s", work.State, payload.UpdatingPayload)
		work.State = payload.UpdatingPayload
		work.Attempt = 0
	}
	// Copy the authoritative target from the worker into the loop's work object.
	if w.work != nil {
		work.Desired = w.work.Desired
		work.Overrides = w.work.Overrides
		work.Capabilities = w.work.Capabilities
	}
	work.Generation = w.work.Generation
	return changed
}
// equalUpdate reports whether two updates are semantically equivalent, i.e.
// they carry the same Force flag and the same image pullspec.
//
// We require complete pullspec equality, not just digest equality,
// because we want to go through the usual update process even if it's
// only the registry portion of the pullspec which has changed.
func equalUpdate(a, b configv1.Update) bool {
	if a.Force != b.Force {
		return false
	}
	return a.Image == b.Image
}
// equalDigest reports whether the two pullspecs are identical, or are both
// by-digest pullspecs whose digests match.
func equalDigest(pullspecA string, pullspecB string) bool {
	if pullspecA == pullspecB {
		return true
	}
	digest := splitDigest(pullspecA)
	if digest == "" {
		// pullspecA is not by-digest and the strings differ.
		return false
	}
	return digest == splitDigest(pullspecB)
}
// splitDigest returns the pullspec's digest, or an empty string if
// the pullspec is not by-digest (e.g. it is by-tag, or implicit).
func splitDigest(pullspec string) string {
	// A by-digest pullspec contains exactly one "@" separating the
	// repository from the digest; anything else is not by-digest.
	if strings.Count(pullspec, "@") != 1 {
		return ""
	}
	_, digest, _ := strings.Cut(pullspec, "@")
	return digest
}
// equalSyncWork compares a and b field group by field group and reports
// whether the desired release, the overrides, and the capabilities agree.
// Any detected differences are logged at verbosity 2 together with the
// supplied context string describing when the comparison happened.
func equalSyncWork(a, b *SyncWork, context string) (equalVersion, equalOverrides, equalCapabilities bool) {
	// Identical pointers (including both nil) are trivially equal.
	if a == b {
		return true, true, true
	}
	// Exactly one side nil: nothing can match.
	if a == nil || b == nil {
		return false, false, false
	}
	versionMatch := equalUpdate(a.Desired, b.Desired)
	overridesMatch := reflect.DeepEqual(a.Overrides, b.Overrides)
	capabilitiesErr := a.Capabilities.Equal(&b.Capabilities)
	var changes []string
	if !versionMatch {
		changes = append(changes, fmt.Sprintf("version changed (from %v to %v)", a.Desired, b.Desired))
	}
	if !overridesMatch {
		changes = append(changes, fmt.Sprintf("overrides changed (%v to %v)", a.Overrides, b.Overrides))
	}
	if capabilitiesErr != nil {
		changes = append(changes, fmt.Sprintf("capabilities changed (%v)", capabilitiesErr))
	}
	if len(changes) != 0 {
		klog.V(2).Infof("Detected while %s: %s", context, strings.Join(changes, ", "))
	}
	return versionMatch, overridesMatch, capabilitiesErr == nil
}
// updateApplyStatus records the current status of the payload apply sync action for
// observation by others. It sends a copy of the update to the report channel for improved
// testability. It sets Generation, Failure, Done, Total, Completed, Reconciling, Initial,
// VersionHash, LastProgress, Actual, and Verified statuses which are managed by the payload
// apply sync action.
func (w *SyncWorker) updateApplyStatus(update SyncWorkerStatus) {
	w.lock.Lock()
	defer w.lock.Unlock()
	// do not overwrite these status values which are not managed by apply;
	// they are owned by the payload load path (updateLoadStatus)
	update.loadPayloadStatus = w.status.loadPayloadStatus
	update.CapabilitiesStatus = w.status.CapabilitiesStatus
	klog.V(6).Infof("Payload apply status change %#v", update)
	w.status = update
	// best-effort, non-blocking publish: a full report channel drops the
	// update (logged at high verbosity) rather than stalling the sync loop
	select {
	case w.report <- update:
	default:
		if klog.V(6).Enabled() {
			klog.Infof("Status report channel was full %#v", update)
		}
	}
}
// updateLoadStatus records the current status of the payload load sync action for
// observation by others. It sends a copy of the update to the report channel for improved
// testability. It sets Generation, Reconciling, Actual, Verified, payload load, and
// capabilities statuses which are managed by the payload load sync action.
func (w *SyncWorker) updateLoadStatus(update SyncWorkerStatus) {
	w.lock.Lock()
	defer w.lock.Unlock()
	// do not overwrite these status values which are not managed by load;
	// they are owned by the payload apply path (updateApplyStatus)
	update.Failure = w.status.Failure
	update.Done = w.status.Done
	update.Total = w.status.Total
	update.Completed = w.status.Completed
	update.Initial = w.status.Initial
	update.VersionHash = w.status.VersionHash
	update.LastProgress = w.status.LastProgress
	klog.V(6).Infof("Payload load status change %#v", update)
	w.status = update
	// best-effort, non-blocking publish: a full report channel drops the
	// update (logged at high verbosity) rather than stalling the sync loop
	select {
	case w.report <- update:
	default:
		if klog.V(6).Enabled() {
			klog.Infof("Status report channel was full %#v", update)
		}
	}
}
// Desired returns the state the SyncWorker is trying to achieve. It
// returns the zero-value Update when no work has been requested yet.
func (w *SyncWorker) Desired() configv1.Update {
	w.lock.Lock()
	defer w.lock.Unlock()
	var desired configv1.Update
	if w.work != nil {
		desired = w.work.Desired
	}
	return desired
}
// Status returns a copy of the current worker status. The copy is taken
// under the lock so callers get a consistent snapshot they may freely
// mutate without racing the worker.
func (w *SyncWorker) Status() *SyncWorkerStatus {
	w.lock.Lock()
	defer w.lock.Unlock()
	snapshot := w.status.DeepCopy()
	return snapshot
}
// apply applies the current payload to the server, executing in parallel if maxWorkers is set greater
// than 1, returning an error if the update could not be completely applied. The status is updated as we
// progress. Cancelling the context will abort the execution of apply.
func (w *SyncWorker) apply(ctx context.Context, work *SyncWork, maxWorkers int, reporter StatusReporter) error {
klog.V(2).Infof("apply: %s on generation %d in state %s at attempt %d", work.Desired.Version, work.Generation, work.State, work.Attempt)
if work.Attempt == 0 {
payload.InitCOUpdateStartTimes()
}
payloadUpdate := w.payload
// encapsulate status reporting in a threadsafe updater
total := len(payloadUpdate.Manifests)
cr := &consistentReporter{
status: SyncWorkerStatus{
Generation: work.Generation,
Initial: work.State.Initializing(),
Reconciling: work.State.Reconciling(),
VersionHash: payloadUpdate.ManifestHash,
Architecture: w.status.Architecture,
Actual: payloadUpdate.Release,
Verified: payloadUpdate.VerifiedImage,
},
completed: work.Completed,
version: payloadUpdate.Release.Version,
total: total,
reporter: reporter,
}
var tasks []*payload.Task
backoff := w.backoff
if backoff.Steps == 0 {
return fmt.Errorf("SyncWorker requires at least one backoff step to apply any manifests")
}
if backoff.Steps > 1 && work.State == payload.InitializingPayload {
backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second, Cap: 15 * time.Second}
}
for i := range payloadUpdate.Manifests {
tasks = append(tasks, &payload.Task{
Index: i + 1,
Total: total,
Manifest: &payloadUpdate.Manifests[i],
Backoff: backoff,
})
}
graph := payload.NewTaskGraph(tasks)
graph.Split(payload.SplitOnJobs)
var precreateObjects bool
switch work.State {
case payload.InitializingPayload:
// Create every component in parallel to maximize reaching steady
// state.
graph.Parallelize(payload.FlattenByNumberAndComponent)
maxWorkers = len(graph.Nodes)
precreateObjects = true
case payload.ReconcilingPayload:
// Run the graph in random order during reconcile so that we don't
// hang on any particular component - we seed from the number of
// times we've attempted this particular payload, so a particular
// payload always syncs in a reproducible order. We permute the
// same way for 8 successive attempts, shifting the tasks for each
// of those attempts to try to cover as much of the payload as
// possible within that interval.
steps := 8
epoch, iteration := work.Attempt/steps, work.Attempt%steps
r := rand.New(rand.NewSource(int64(epoch)))
graph.Parallelize(payload.ShiftOrder(payload.PermuteOrder(payload.FlattenByNumberAndComponent, r), iteration, steps))
maxWorkers = 2
default:
// perform an orderly roll out by payload order, using some parallelization
// but avoiding out of order creation so components have some base
graph.Parallelize(payload.ByNumberAndComponent)
precreateObjects = true
}
capabilities := capability.GetCapabilitiesStatus(work.Capabilities)
var reportEffectErrors []error
// update each object
errs := payload.RunGraph(ctx, graph, maxWorkers, func(ctx context.Context, tasks []*payload.Task) error {
// in specific modes, attempt to precreate a set of known types (currently ClusterOperator) without
// retries
if precreateObjects {
for _, task := range tasks {
if err := ctx.Err(); err != nil {
return cr.ContextError(err)
}
if task.Manifest.GVK != configv1.SchemeGroupVersion.WithKind("ClusterOperator") {
continue
}
ov, ok := getOverrideForManifest(work.Overrides, task.Manifest)
if ok && ov.Unmanaged {
klog.V(2).Infof("Skipping precreation of %s as unmanaged", task)
continue
}
if err := task.Manifest.Include(nil, nil, nil, &capabilities); err != nil {
klog.V(2).Infof("Skipping precreation of %s: %s", task, err)
continue
}
if err := w.builder.Apply(ctx, task.Manifest, payload.PrecreatingPayload); err != nil {
klog.V(2).Infof("Unable to precreate resource %s: %v", task, err)
continue
}
klog.V(2).Infof("Precreated resource %s", task)
}
}
for _, task := range tasks {
if err := ctx.Err(); err != nil {
return cr.ContextError(err)
}
cr.Update()
klog.V(2).Infof("Running sync for %s", task)
ov, ok := getOverrideForManifest(work.Overrides, task.Manifest)
if ok && ov.Unmanaged {
klog.V(2).Infof("Skipping %s as unmanaged", task)
continue
}
if err := task.Manifest.Include(nil, nil, nil, &capabilities); err != nil {
klog.V(2).Infof("Skipping %s: %s", task, err)
continue
}
if err := task.Run(ctx, payloadUpdate.Release.Version, w.builder, work.State); err != nil {
if uErr, ok := err.(*payload.UpdateError); ok && uErr.UpdateEffect == payload.UpdateEffectReport {
// do not fail the task on this manifest, just record it for later complaining
reportEffectErrors = append(reportEffectErrors, err)
} else {
return err
}
}
cr.Inc()