Bug 1832923: Forget bootstrap etcd member IP after bootstrap #367

Merged
1 change: 1 addition & 0 deletions go.mod
@@ -24,4 +24,5 @@ require (
 	k8s.io/client-go v0.18.3
 	k8s.io/component-base v0.18.3
 	k8s.io/klog v1.0.0
+	k8s.io/utils v0.0.0-20200324210504-a9aa75ae1b89
 )
20 changes: 8 additions & 12 deletions pkg/etcdcli/etcdcli.go
@@ -74,28 +74,24 @@ func (g *etcdClientGetter) getEtcdClient() (*clientv3.Client, error) {
 
 	network, err := g.networkLister.Get("cluster")
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("failed to list cluster network: %w", err)
 	}
 
 	etcdEndpoints := []string{}
 	nodes, err := g.nodeLister.List(labels.Set{"node-role.kubernetes.io/master": ""}.AsSelector())
 	for _, node := range nodes {
 		internalIP, err := dnshelpers.GetEscapedPreferredInternalIPAddressForNodeName(network, node)
 		if err != nil {
-			return nil, err
+			return nil, fmt.Errorf("failed to get internal IP for node: %w", err)
 		}
 		etcdEndpoints = append(etcdEndpoints, fmt.Sprintf("https://%s:2379", internalIP))
 	}
 
-	hostEtcd, err := g.configmapsLister.ConfigMaps(operatorclient.TargetNamespace).Get("etcd-endpoints")
+	configmap, err := g.configmapsLister.ConfigMaps(operatorclient.TargetNamespace).Get("etcd-endpoints")
 	if err != nil {
-		return nil, err
-	}
-	bootstrapIP, ok := hostEtcd.Annotations[BootstrapIPAnnotationKey]
-	if !ok {
-		klog.V(2).Infof("configmaps/etcd-endpoints is missing annotation %s", BootstrapIPAnnotationKey)
+		return nil, fmt.Errorf("failed to list endpoints: %w", err)
 	}
-	if bootstrapIP != "" {
+	if bootstrapIP, ok := configmap.Annotations[BootstrapIPAnnotationKey]; ok && bootstrapIP != "" {
 		// escape if IPv6
 		if net.ParseIP(bootstrapIP).To4() == nil {
 			bootstrapIP = "[" + bootstrapIP + "]"
@@ -112,11 +108,11 @@ func (g *etcdClientGetter) getEtcdClient() (*clientv3.Client, error) {
 
 	c, err := getEtcdClient(etcdEndpoints)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("failed to get etcd client: %w", err)
 	}
 	if g.cachedClient != nil {
 		if err := g.cachedClient.Close(); err != nil {
-			utilruntime.HandleError(err)
+			utilruntime.HandleError(fmt.Errorf("failed to close cached client: %w", err))
 		}
 	}
 	g.cachedClient = c
@@ -146,7 +142,7 @@ func getEtcdClient(endpoints []string) (*clientv3.Client, error) {
 
 	cli, err := clientv3.New(*cfg)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("failed to make etcd client for endpoints %v: %w", endpoints, err)
 	}
 	return cli, err
 }
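The recurring change in pkg/etcdcli/etcdcli.go wraps returned errors with %w instead of passing them through bare. A minimal standalone sketch (not code from this PR) of what %w preserves: the original cause stays matchable with errors.Is and errors.As after wrapping.

	package main

	import (
		"errors"
		"fmt"
		"os"
	)

	// load wraps its failure with %w so the caller can still inspect the cause.
	func load() error {
		if _, err := os.Open("/nonexistent"); err != nil {
			return fmt.Errorf("failed to load config: %w", err)
		}
		return nil
	}

	func main() {
		err := load()
		fmt.Println(err)                            // failed to load config: open /nonexistent: no such file or directory
		fmt.Println(errors.Is(err, os.ErrNotExist)) // true: the cause survives wrapping
	}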
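The bootstrap-IP branch above brackets IPv6 literals by hand before joining host and port. A minimal sketch of the same escaping using the standard library's net.JoinHostPort; the endpointURL helper is hypothetical and not part of this PR.

	package main

	import (
		"fmt"
		"net"
	)

	// endpointURL builds an etcd client URL. net.JoinHostPort brackets IPv6
	// literals automatically, the same escaping the getter performs by hand.
	func endpointURL(ip string) string {
		return "https://" + net.JoinHostPort(ip, "2379")
	}

	func main() {
		fmt.Println(endpointURL("10.0.139.12"))        // https://10.0.139.12:2379
		fmt.Println(endpointURL("fd2e:6f44:5dd8::16")) // https://[fd2e:6f44:5dd8::16]:2379
	}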
93 changes: 79 additions & 14 deletions pkg/operator/etcdendpointscontroller/etcdendpointscontroller.go
@@ -13,50 +13,50 @@ import (
 	"github.com/openshift/library-go/pkg/operator/v1helpers"
 	operatorv1helpers "github.com/openshift/library-go/pkg/operator/v1helpers"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/client-go/kubernetes"
 	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
 	corev1listers "k8s.io/client-go/listers/core/v1"
+	"k8s.io/klog"
 
+	"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
 	"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
 )
 
 // EtcdEndpointsController maintains a configmap resource with
 // IP addresses for etcd. It should never depend on DNS directly or transitively.
 type EtcdEndpointsController struct {
-	operatorClient  v1helpers.OperatorClient
+	operatorClient  v1helpers.StaticPodOperatorClient
 	nodeLister      corev1listers.NodeLister
 	configmapLister corev1listers.ConfigMapLister
 	configmapClient corev1client.ConfigMapsGetter
 }
 
 func NewEtcdEndpointsController(
-	operatorClient v1helpers.OperatorClient,
+	operatorClient v1helpers.StaticPodOperatorClient,
 	eventRecorder events.Recorder,
 	kubeClient kubernetes.Interface,
 	kubeInformers operatorv1helpers.KubeInformersForNamespaces,
 ) factory.Controller {
-	kubeInformersForTargetNamespace := kubeInformers.InformersFor(operatorclient.TargetNamespace)
-	configmapsInformer := kubeInformersForTargetNamespace.Core().V1().ConfigMaps()
-	kubeInformersForCluster := kubeInformers.InformersFor("")
-	nodeInformer := kubeInformersForCluster.Core().V1().Nodes()
+	nodeInformer := kubeInformers.InformersFor("").Core().V1().Nodes()
 
 	c := &EtcdEndpointsController{
 		operatorClient:  operatorClient,
 		nodeLister:      nodeInformer.Lister(),
-		configmapLister: configmapsInformer.Lister(),
+		configmapLister: kubeInformers.ConfigMapLister(),
 		configmapClient: kubeClient.CoreV1(),
 	}
 	return factory.New().ResyncEvery(time.Minute).WithInformers(
 		operatorClient.Informer(),
-		configmapsInformer.Informer(),
+		kubeInformers.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps().Informer(),
Review thread on this change:

Contributor: Curious what is the difference between the former and the latter?

Contributor Author: Functionally equivalent; both cause sync to trigger on configmap change in the openshift-etcd namespace.

Contributor: OK so why change :)

Contributor Author: Because configmapsInformer was only used in one place (so what's the point of a local to begin with), and also had a misleading name (it isn't an informer for all configmaps, it's only for the openshift-etcd namespace).

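For context on the thread above, a sketch of why the two registrations are interchangeable, assuming library-go's per-namespace shared informer factories cache one informer per resource type; sameInformer is an illustrative helper, not code from this PR.

	// sameInformer illustrates the equivalence: both access paths yield the one
	// shared informer the factory caches for ConfigMaps in the target namespace,
	// so registering either with WithInformers triggers the same sync behavior.
	func sameInformer(kubeInformers operatorv1helpers.KubeInformersForNamespaces) bool {
		configmapsInformer := kubeInformers.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps()
		former := configmapsInformer.Informer()
		latter := kubeInformers.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps().Informer()
		return former == latter // true: the factory returns the cached instance
	}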
 		nodeInformer.Informer(),
 	).WithSync(c.sync).ToController("EtcdEndpointsController", eventRecorder.WithComponentSuffix("etcd-endpoints-controller"))
 }
 
 func (c *EtcdEndpointsController) sync(ctx context.Context, syncCtx factory.SyncContext) error {
-	err := c.syncConfigMap(ctx, syncCtx.Recorder())
+	err := c.syncConfigMap(syncCtx.Recorder())
 
 	if err != nil {
 		_, _, updateErr := v1helpers.UpdateStatus(c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
@@ -83,9 +83,29 @@ func (c *EtcdEndpointsController) sync(ctx context.Context, syncCtx factory.SyncContext) error {
 	return nil
 }
 
-func (c *EtcdEndpointsController) syncConfigMap(ctx context.Context, recorder events.Recorder) error {
+func (c *EtcdEndpointsController) syncConfigMap(recorder events.Recorder) error {
+	bootstrapComplete, err := isBootstrapComplete(c.configmapLister, c.operatorClient)
+	if err != nil {
+		return fmt.Errorf("couldn't determine bootstrap status: %w", err)
+	}
+
 	required := configMapAsset()
+
+	// If the bootstrap IP is present on the existing configmap, either copy it
+	// forward or remove it if possible so clients can forget about it.
+	if existing, err := c.configmapLister.ConfigMaps(operatorclient.TargetNamespace).Get("etcd-endpoints"); err == nil {
+		if existingIP, hasExistingIP := existing.Annotations[etcdcli.BootstrapIPAnnotationKey]; hasExistingIP {
+			if bootstrapComplete {
+				// remove the annotation: the trailing "-" tells the apply merge logic to delete it
+				required.Annotations[etcdcli.BootstrapIPAnnotationKey+"-"] = existingIP
+			} else {
+				required.Annotations[etcdcli.BootstrapIPAnnotationKey] = existingIP
+			}
+		}
+	} else if !errors.IsNotFound(err) {
+		return fmt.Errorf("couldn't get configmap %s/%s: %w", operatorclient.TargetNamespace, "etcd-endpoints", err)
+	}
 
 	// create endpoint addresses for each node
 	nodes, err := c.nodeLister.List(labels.Set{"node-role.kubernetes.io/master": ""}.AsSelector())
 	if err != nil {
@@ -112,15 +132,60 @@ func (c *EtcdEndpointsController) syncConfigMap(ctx context.Context, recorder events.Recorder) error {
 
 	required.Data = endpointAddresses
 
-	_, _, err = resourceapply.ApplyConfigMap(c.configmapClient, recorder, required)
-	return err
+	// Apply endpoint updates
+	if _, _, err := resourceapply.ApplyConfigMap(c.configmapClient, recorder, required); err != nil {
+		return err
+	}
+
+	return nil
 }
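The trailing "-" on the annotation key relies on the metadata merge performed by resourceapply.ApplyConfigMap, where a required key ending in "-" is treated as a request to delete that key from the live object. A rough standalone sketch of that merge rule as assumed here (not library-go's actual code; the annotation key below is a placeholder for etcdcli.BootstrapIPAnnotationKey):

	package main

	import (
		"fmt"
		"strings"
	)

	// mergeAnnotations sketches the assumed merge rule: a required key ending
	// in "-" deletes the corresponding annotation instead of setting it.
	func mergeAnnotations(existing, required map[string]string) map[string]string {
		out := map[string]string{}
		for k, v := range existing {
			out[k] = v
		}
		for k, v := range required {
			if strings.HasSuffix(k, "-") {
				delete(out, strings.TrimSuffix(k, "-"))
				continue
			}
			out[k] = v
		}
		return out
	}

	func main() {
		existing := map[string]string{"example.com/bootstrap-ip": "10.0.0.5"}
		// after bootstrap completes, the controller asks for deletion:
		required := map[string]string{"example.com/bootstrap-ip-": "10.0.0.5"}
		fmt.Println(mergeAnnotations(existing, required)) // map[]: the bootstrap IP is forgotten
	}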

 func configMapAsset() *corev1.ConfigMap {
 	return &corev1.ConfigMap{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      "etcd-endpoints",
-			Namespace: operatorclient.TargetNamespace,
+			Name:        "etcd-endpoints",
+			Namespace:   operatorclient.TargetNamespace,
+			Annotations: map[string]string{},
 		},
 	}
 }

+// isBootstrapComplete returns true if bootstrap has completed. This is used to
+// indicate whether it's safe for clients to forget about the bootstrap member IP.
+func isBootstrapComplete(configMapClient corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient) (bool, error) {
+	// do a cheap check first to see whether bootstrapping has already been
+	// reported complete
+	bootstrapFinishedConfigMap, err := configMapClient.ConfigMaps("kube-system").Get("bootstrap")
+	if err != nil {
+		if errors.IsNotFound(err) {
Review comment on this line:

Contributor: style: prefer

	if errors.IsNotFound(err) {
		return
	}
	if err != nil {
		// different return
	}

+			// If the resource was deleted (e.g. by an admin) after bootstrap is actually complete,
+			// this is a false negative.
+			klog.V(4).Infof("bootstrap considered incomplete because the kube-system/bootstrap configmap wasn't found")
+			return false, nil
+		}
+		// We don't know, give up quickly.
+		return false, fmt.Errorf("failed to get configmap %s/%s: %w", "kube-system", "bootstrap", err)
+	}
+
+	if status, ok := bootstrapFinishedConfigMap.Data["status"]; !ok || status != "complete" {
+		// bootstrap isn't torn down yet; nothing to do
+		klog.V(4).Infof("bootstrap considered incomplete because status is %q", status)
+		return false, nil
+	}
+
+	// now check that the revision rollout has stabilized
+	_, status, _, err := staticPodClient.GetStaticPodOperatorState()
+	if err != nil {
+		return false, fmt.Errorf("failed to get static pod operator state: %w", err)
+	}
+	if status.LatestAvailableRevision == 0 {
+		return false, nil
+	}
+	for _, curr := range status.NodeStatuses {
+		if curr.CurrentRevision != status.LatestAvailableRevision {
+			klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", status.LatestAvailableRevision)
+			return false, nil
+		}
+	}
+	return true, nil
+}
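The final loop reduces to a simple predicate: every node must be on the latest available revision. A standalone sketch with simplified types (nodeStatus and revisionsStable are illustrative, not the operator's API):

	package main

	import "fmt"

	// nodeStatus mirrors only the fields the check needs; the real operator
	// uses operatorv1.NodeStatus from the OpenShift API.
	type nodeStatus struct {
		name            string
		currentRevision int32
	}

	// revisionsStable reports whether every node has converged on the latest
	// available revision, the same condition isBootstrapComplete enforces.
	func revisionsStable(latest int32, nodes []nodeStatus) bool {
		if latest == 0 {
			// no revision rolled out yet
			return false
		}
		for _, n := range nodes {
			if n.currentRevision != latest {
				return false
			}
		}
		return true
	}

	func main() {
		nodes := []nodeStatus{{"master-0", 3}, {"master-1", 3}, {"master-2", 2}}
		fmt.Println(revisionsStable(3, nodes)) // false: master-2 still on revision 2
		nodes[2].currentRevision = 3
		fmt.Println(revisionsStable(3, nodes)) // true: all nodes converged
	}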