-
Notifications
You must be signed in to change notification settings - Fork 127
/
bootstrap.go
189 lines (161 loc) · 7.14 KB
/
bootstrap.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
package ceohelpers
import (
"context"
"fmt"
configv1listers "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/library-go/pkg/operator/bootstrap"
"github.com/openshift/library-go/pkg/operator/v1helpers"
corev1listers "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
)
// BootstrapScalingStrategy names a policy describing the invariants which will
// be enforced when scaling the etcd cluster. The recognized values are the
// *Strategy constants declared alongside this type.
type BootstrapScalingStrategy string
const (
// HAScalingStrategy means the etcd cluster will only be scaled up when at least
// 3 nodes are available so that HA is enforced at all times. This rule applies
// during bootstrapping and the steady state.
//
// This is the default strategy.
HAScalingStrategy BootstrapScalingStrategy = "HAScalingStrategy"
// DelayedHAScalingStrategy means that during bootstrapping, the etcd cluster will
// be allowed to scale when at least 2 members are available (which is not HA),
// but after bootstrapping any further scaling will require 3 nodes in the same
// way as HAScalingStrategy.
//
// This strategy is selected by adding the `openshift.io/delayed-ha-bootstrap`
// annotation to the openshift-etcd namespace.
DelayedHAScalingStrategy BootstrapScalingStrategy = "DelayedHAScalingStrategy"
// BootstrapInPlaceStrategy means that the bootstrap node will never exist
// during the lifecycle of the cluster. Bootkube will run on a live ISO;
// afterwards, the node will pivot into the manifests generated during that
// process.
//
// This strategy is selected by observing the existence of the `bootstrapInPlace`
// root key in the install-config.
//
// NOTE(review): GetBootstrapScalingStrategy in this file never returns this
// value; confirm where (or whether) it is still selected.
BootstrapInPlaceStrategy BootstrapScalingStrategy = "BootstrapInPlaceStrategy"
// UnsafeScalingStrategy means scaling will occur without regard to nodes or
// any effect on quorum. Use of this strategy isn't officially tested or supported,
// but is made available for ad-hoc use.
//
// This strategy is selected by setting unsupportedConfigOverrides on the
// operator config. GetBootstrapScalingStrategy also selects it when
// IsSingleNodeTopology reports a single-node control plane.
UnsafeScalingStrategy BootstrapScalingStrategy = "UnsafeScalingStrategy"
)
const (
// DelayedHABootstrapScalingStrategyAnnotation is an annotation on the openshift-etcd
// namespace which, if present, indicates that the DelayedHAScalingStrategy
// should be used. Only the presence of the key is checked by
// GetBootstrapScalingStrategy; the annotation value is not read there.
DelayedHABootstrapScalingStrategyAnnotation = "openshift.io/delayed-ha-bootstrap"
)
// GetBootstrapScalingStrategy determines the scaling strategy to use.
//
// Inputs are consulted in a fixed order, and the first failure is returned
// (wrapped) together with an empty strategy:
//
//  1. the static pod operator spec, which is handed to isUnsupportedUnsafeEtcd,
//  2. the operatorclient.TargetNamespace namespace and its annotations,
//  3. the control plane topology as reported by IsSingleNodeTopology.
//
// The result is UnsafeScalingStrategy when isUnsupportedUnsafeEtcd reports true
// or the topology is single-node, DelayedHAScalingStrategy when the namespace
// carries DelayedHABootstrapScalingStrategyAnnotation (its value is ignored),
// and HAScalingStrategy otherwise.
func GetBootstrapScalingStrategy(staticPodClient v1helpers.StaticPodOperatorClient, namespaceLister corev1listers.NamespaceLister, infraLister configv1listers.InfrastructureLister) (BootstrapScalingStrategy, error) {
	// Returned alongside every error; callers must not interpret it.
	const unset BootstrapScalingStrategy = ""

	spec, _, _, err := staticPodClient.GetStaticPodOperatorState()
	if err != nil {
		return unset, fmt.Errorf("failed to get operator state: %w", err)
	}

	unsafeOverride, err := isUnsupportedUnsafeEtcd(spec)
	if err != nil {
		// NOTE: despite the message text, no strategy is assumed here; the
		// caller receives the empty strategy along with the error.
		return unset, fmt.Errorf("couldn't determine etcd unsupported override status, assuming default HA scaling strategy: %w", err)
	}

	ns, err := namespaceLister.Get(operatorclient.TargetNamespace)
	if err != nil {
		return unset, fmt.Errorf("failed to get %s namespace: %w", operatorclient.TargetNamespace, err)
	}
	_, delayedHA := ns.Annotations[DelayedHABootstrapScalingStrategyAnnotation]

	isSingleNode, err := IsSingleNodeTopology(infraLister)
	if err != nil {
		return unset, fmt.Errorf("failed to get control plane topology: %w", err)
	}

	// Precedence: unsafe/single-node wins over the delayed-HA annotation,
	// which in turn wins over the HA default.
	if unsafeOverride || isSingleNode {
		return UnsafeScalingStrategy, nil
	}
	if delayedHA {
		return DelayedHAScalingStrategy, nil
	}
	return HAScalingStrategy, nil
}
// CheckSafeToScaleCluster implements the bootstrap scaling strategy invariants.
// A nil result means cluster conditions make it safe to scale the etcd cluster
// under the scaling strategy in use; any other result is an error explaining
// why scaling is unsafe (or why that could not be determined).
//
// Scaling is reported safe without further checks while IsBootstrapComplete
// reports false, and whenever the strategy is UnsafeScalingStrategy. For
// HAScalingStrategy and DelayedHAScalingStrategy the current member health is
// fetched from etcdClient and must pass etcdcli.IsQuorumFaultTolerantErr.
func CheckSafeToScaleCluster(
	configmapLister corev1listers.ConfigMapLister,
	staticPodClient v1helpers.StaticPodOperatorClient,
	namespaceLister corev1listers.NamespaceLister,
	infraLister configv1listers.InfrastructureLister,
	etcdClient etcdcli.AllMemberLister) error {
	done, err := IsBootstrapComplete(configmapLister, staticPodClient, etcdClient)
	if err != nil {
		return fmt.Errorf("CheckSafeToScaleCluster failed to determine bootstrap status: %w", err)
	}
	if !done {
		// While bootstrapping, scaling is always considered safe.
		return nil
	}

	strategy, err := GetBootstrapScalingStrategy(staticPodClient, namespaceLister, infraLister)
	if err != nil {
		return fmt.Errorf("CheckSafeToScaleCluster failed to get bootstrap scaling strategy: %w", err)
	}

	// requiredNodes only feeds the log line at the end of the function; the
	// actual gate is the quorum fault tolerance check below.
	var requiredNodes int
	switch strategy {
	case UnsafeScalingStrategy:
		return nil
	case HAScalingStrategy, DelayedHAScalingStrategy:
		requiredNodes = 3
	default:
		return fmt.Errorf("CheckSafeToScaleCluster unrecognized scaling strategy %q", strategy)
	}

	health, err := etcdClient.MemberHealth(context.Background())
	if err != nil {
		return fmt.Errorf("CheckSafeToScaleCluster couldn't determine member health: %w", err)
	}
	// Returned unwrapped so callers see the quorum error as-is.
	if err := etcdcli.IsQuorumFaultTolerantErr(health); err != nil {
		return err
	}

	klog.V(4).Infof("node count %d satisfies minimum of %d required by the %s bootstrap scaling strategy", len(health.GetHealthyMembers()), requiredNodes, strategy)
	return nil
}
// IsBootstrapComplete returns true if bootstrap has completed.
//
// Three conditions are checked in order, and the first one that fails yields
// (false, nil):
//
//  1. bootstrap.IsBootstrapComplete reports true for configmapLister,
//  2. the static pod operator status has a non-zero LatestAvailableRevision
//     and every entry in NodeStatuses is at that revision,
//  3. no etcd member named "etcd-bootstrap" remains in the member list.
//
// A failure of the first check is returned as-is; failures to read the
// operator state or to list etcd members are returned wrapped, with false.
func IsBootstrapComplete(configmapLister corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient, etcdClient etcdcli.AllMemberLister) (bool, error) {
	const bootstrapMemberName = "etcd-bootstrap"

	// Cheap check first: has the installer marked bootstrapping as done by
	// creating the configmap?
	markedDone, err := bootstrap.IsBootstrapComplete(configmapLister)
	if err != nil || !markedDone {
		return markedDone, err
	}

	// Next, make sure the revision rollout has settled on every node.
	_, status, _, err := staticPodClient.GetStaticPodOperatorState()
	if err != nil {
		return false, fmt.Errorf("failed to get static pod operator state: %w", err)
	}
	latest := status.LatestAvailableRevision
	if latest == 0 {
		return false, nil
	}
	for _, node := range status.NodeStatuses {
		if node.CurrentRevision == latest {
			continue
		}
		klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", latest)
		return false, nil
	}

	// Finally, the etcd-bootstrap member must have left the cluster membership.
	members, err := etcdClient.MemberList(context.Background())
	if err != nil {
		return false, fmt.Errorf("IsBootstrapComplete couldn't list the etcd cluster members: %w", err)
	}
	for _, member := range members {
		if member.Name == bootstrapMemberName {
			klog.V(4).Infof("(etcd-bootstrap) member is still present in the etcd cluster membership")
			return false, nil
		}
	}

	return true, nil
}