From 3278bec23226c77ac7fb8a662cfd96f2b3551fcb Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Wed, 29 May 2024 19:50:35 +0200 Subject: [PATCH 1/2] ETCD-606: Batch bundle revision rollout (#1268) * ETCD-606: Batch bundle revision rollout This PR creates a new "etcd-all-bundles" configmap, similar to "etcd-all-certs", which allows us to transactionally create static pod rollouts on bundle changes. This is a pre-requisite to allow us to gate leaf certificate changes right after a bundle changes. Signed-off-by: Thomas Jungblut * fix env Signed-off-by: Thomas Jungblut --------- Signed-off-by: Thomas Jungblut --- bindata/etcd/pod.yaml | 10 ++-- bindata/etcd/restore-pod.yaml | 4 +- pkg/cmd/render/certs_test.go | 46 +++++++++------ pkg/etcdenvvar/envvarcontroller_test.go | 2 +- pkg/etcdenvvar/etcd_env.go | 2 +- pkg/operator/etcd_assets/bindata.go | 14 ++--- .../etcdcertsignercontroller.go | 56 ++++++++++++++++++- .../etcdcertsignercontroller_test.go | 45 ++++++++++++--- pkg/operator/starter.go | 11 +--- pkg/tlshelpers/tlshelpers.go | 1 + 10 files changed, 138 insertions(+), 53 deletions(-) diff --git a/bindata/etcd/pod.yaml b/bindata/etcd/pod.yaml index fa322ce5a..1b94d519f 100644 --- a/bindata/etcd/pod.yaml +++ b/bindata/etcd/pod.yaml @@ -145,7 +145,7 @@ ${COMPUTED_ENV_VARS} # this has a non-zero return code if the command is non-zero. If you use an export first, it doesn't and you # will succeed when you should fail. ETCD_INITIAL_CLUSTER=$(discover-etcd-initial-cluster \ - --cacert=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \ + --cacert=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --cert=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --key=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key \ --endpoints=${ALL_ETCD_ENDPOINTS} \ @@ -175,11 +175,11 @@ ${COMPUTED_ENV_VARS} --initial-advertise-peer-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2380 \ --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.crt \ --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.key \ - --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \ + --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --client-cert-auth=true \ --peer-cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --peer-key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key \ - --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \ + --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --peer-client-cert-auth=true \ --advertise-client-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2379 \ --listen-client-urls=https://${LISTEN_ON_ALL_IPS}:2379,unixs://${NODE_NODE_ENVVAR_NAME_IP}:0 \ @@ -255,8 +255,8 @@ ${COMPUTED_ENV_VARS} --key-file /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-metrics-NODE_NAME.key \ --cert /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --cert-file /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-metrics-NODE_NAME.crt \ - --cacert /etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \ - --trusted-ca-file /etc/kubernetes/static-pod-certs/configmaps/etcd-metrics-proxy-serving-ca/ca-bundle.crt \ + --cacert /etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ + --trusted-ca-file /etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/metrics-ca-bundle.crt \ --listen-cipher-suites TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_AES_128_GCM_SHA256,TLS_AES_256_GCM_SHA384,TLS_CHACHA20_POLY1305_SHA256 env: ${COMPUTED_ENV_VARS} diff --git a/bindata/etcd/restore-pod.yaml b/bindata/etcd/restore-pod.yaml index 4a3a5d715..0863d8d7c 100644 --- a/bindata/etcd/restore-pod.yaml +++ b/bindata/etcd/restore-pod.yaml @@ -89,11 +89,11 @@ spec: --initial-advertise-peer-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2380 \ --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.crt \ --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.key \ - --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \ + --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --client-cert-auth=true \ --peer-cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --peer-key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key \ - --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \ + --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --peer-client-cert-auth=true \ --advertise-client-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2379 \ --listen-client-urls=https://${LISTEN_ON_ALL_IPS}:2379 \ diff --git a/pkg/cmd/render/certs_test.go b/pkg/cmd/render/certs_test.go index 039f377c4..79b41b9fb 100644 --- a/pkg/cmd/render/certs_test.go +++ b/pkg/cmd/render/certs_test.go @@ -23,7 +23,7 @@ func TestCertSingleNode(t *testing.T) { require.NoError(t, err) require.Equal(t, 11, len(secrets)) - require.Equal(t, 9, len(bundles)) + require.Equal(t, 10, len(bundles)) assertCertificateCorrectness(t, secrets) assertBundleCorrectness(t, secrets, bundles) @@ -40,7 +40,7 @@ func TestCertsMultiNode(t *testing.T) { require.NoError(t, err) require.Equal(t, 17, len(secrets)) - require.Equal(t, 9, len(bundles)) + require.Equal(t, 10, len(bundles)) assertCertificateCorrectness(t, secrets) assertBundleCorrectness(t, secrets, bundles) @@ -116,27 +116,41 @@ func assertBundleCorrectness(t *testing.T, secrets []corev1.Secret, bundles []co } for _, bundle := range bundles { - bundleCerts, err := cert.ParseCertsPEM([]byte(bundle.Data["ca-bundle.crt"])) - require.NoError(t, err) + if bundle.Name == tlshelpers.EtcdAllBundlesConfigMapName { + serverBundleCerts, err := cert.ParseCertsPEM([]byte(bundle.Data["server-ca-bundle.crt"])) + require.NoError(t, err) + assertMatchingBundles(t, signerMap[tlshelpers.EtcdSignerCertSecretName], serverBundleCerts) - var signers []*x509.Certificate - if strings.Contains(bundle.Name, "metric") { - signers = signerMap[tlshelpers.EtcdMetricsSignerCertSecretName] + metricsBundleCerts, err := cert.ParseCertsPEM([]byte(bundle.Data["metrics-ca-bundle.crt"])) + require.NoError(t, err) + assertMatchingBundles(t, signerMap[tlshelpers.EtcdMetricsSignerCertSecretName], metricsBundleCerts) } else { - signers = signerMap[tlshelpers.EtcdSignerCertSecretName] - } + bundleCerts, err := cert.ParseCertsPEM([]byte(bundle.Data["ca-bundle.crt"])) + require.NoError(t, err) - sort.SliceStable(signers, func(i, j int) bool { - return bytes.Compare(signers[i].Raw, signers[j].Raw) < 0 - }) - sort.SliceStable(bundleCerts, func(i, j int) bool { - return bytes.Compare(bundleCerts[i].Raw, bundleCerts[j].Raw) < 0 - }) + var signers []*x509.Certificate + if strings.Contains(bundle.Name, "metric") { + signers = signerMap[tlshelpers.EtcdMetricsSignerCertSecretName] + } else { + signers = signerMap[tlshelpers.EtcdSignerCertSecretName] + } - require.Equal(t, signers, bundleCerts) + assertMatchingBundles(t, signers, bundleCerts) + } } } +func assertMatchingBundles(t *testing.T, expectedBundle []*x509.Certificate, actualBundle []*x509.Certificate) { + sort.SliceStable(expectedBundle, func(i, j int) bool { + return bytes.Compare(expectedBundle[i].Raw, expectedBundle[j].Raw) < 0 + }) + sort.SliceStable(actualBundle, func(i, j int) bool { + return bytes.Compare(actualBundle[i].Raw, actualBundle[j].Raw) < 0 + }) + + require.Equal(t, expectedBundle, actualBundle) +} + // we only check that all certificates were generated, the correctness is verified above func assertNodeSpecificCertificates(t *testing.T, node *corev1.Node, secrets []corev1.Secret) { expectedSet := sets.NewString(fmt.Sprintf("etcd-peer-%s", node.Name), diff --git a/pkg/etcdenvvar/envvarcontroller_test.go b/pkg/etcdenvvar/envvarcontroller_test.go index 22855ae6d..423989c84 100644 --- a/pkg/etcdenvvar/envvarcontroller_test.go +++ b/pkg/etcdenvvar/envvarcontroller_test.go @@ -37,7 +37,7 @@ var ( defaultEnvResult = map[string]string{ "ALL_ETCD_ENDPOINTS": "https://192.168.2.0:2379,https://192.168.2.1:2379,https://192.168.2.2:2379", "ETCDCTL_API": "3", - "ETCDCTL_CACERT": "/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt", + "ETCDCTL_CACERT": "/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt", "ETCDCTL_CERT": "/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt", "ETCDCTL_ENDPOINTS": "https://192.168.2.0:2379,https://192.168.2.1:2379,https://192.168.2.2:2379", "ETCDCTL_KEY": "/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key", diff --git a/pkg/etcdenvvar/etcd_env.go b/pkg/etcdenvvar/etcd_env.go index 1f1d6e379..e92922e1c 100644 --- a/pkg/etcdenvvar/etcd_env.go +++ b/pkg/etcdenvvar/etcd_env.go @@ -126,7 +126,7 @@ func getEtcdctlEnvVars(envVarContext envVarContext) (map[string]string, error) { } return map[string]string{ "ETCDCTL_API": "3", - "ETCDCTL_CACERT": "/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt", + "ETCDCTL_CACERT": "/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt", "ETCDCTL_CERT": "/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt", "ETCDCTL_KEY": "/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key", "ETCDCTL_ENDPOINTS": endpoints, diff --git a/pkg/operator/etcd_assets/bindata.go b/pkg/operator/etcd_assets/bindata.go index bc2a1e8b7..f8e67500d 100644 --- a/pkg/operator/etcd_assets/bindata.go +++ b/pkg/operator/etcd_assets/bindata.go @@ -1061,7 +1061,7 @@ ${COMPUTED_ENV_VARS} # this has a non-zero return code if the command is non-zero. If you use an export first, it doesn't and you # will succeed when you should fail. ETCD_INITIAL_CLUSTER=$(discover-etcd-initial-cluster \ - --cacert=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \ + --cacert=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --cert=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --key=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key \ --endpoints=${ALL_ETCD_ENDPOINTS} \ @@ -1091,11 +1091,11 @@ ${COMPUTED_ENV_VARS} --initial-advertise-peer-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2380 \ --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.crt \ --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.key \ - --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \ + --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --client-cert-auth=true \ --peer-cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --peer-key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key \ - --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \ + --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --peer-client-cert-auth=true \ --advertise-client-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2379 \ --listen-client-urls=https://${LISTEN_ON_ALL_IPS}:2379,unixs://${NODE_NODE_ENVVAR_NAME_IP}:0 \ @@ -1171,8 +1171,8 @@ ${COMPUTED_ENV_VARS} --key-file /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-metrics-NODE_NAME.key \ --cert /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --cert-file /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-metrics-NODE_NAME.crt \ - --cacert /etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \ - --trusted-ca-file /etc/kubernetes/static-pod-certs/configmaps/etcd-metrics-proxy-serving-ca/ca-bundle.crt \ + --cacert /etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ + --trusted-ca-file /etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/metrics-ca-bundle.crt \ --listen-cipher-suites TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_AES_128_GCM_SHA256,TLS_AES_256_GCM_SHA384,TLS_CHACHA20_POLY1305_SHA256 env: ${COMPUTED_ENV_VARS} @@ -1462,11 +1462,11 @@ spec: --initial-advertise-peer-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2380 \ --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.crt \ --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-NODE_NAME.key \ - --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \ + --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --client-cert-auth=true \ --peer-cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.crt \ --peer-key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-peer-NODE_NAME.key \ - --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \ + --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-all-bundles/server-ca-bundle.crt \ --peer-client-cert-auth=true \ --advertise-client-urls=https://${NODE_NODE_ENVVAR_NAME_IP}:2379 \ --listen-client-urls=https://${LISTEN_ON_ALL_IPS}:2379 \ diff --git a/pkg/operator/etcdcertsigner/etcdcertsignercontroller.go b/pkg/operator/etcdcertsigner/etcdcertsignercontroller.go index 4cbca95b2..284a5990d 100644 --- a/pkg/operator/etcdcertsigner/etcdcertsignercontroller.go +++ b/pkg/operator/etcdcertsigner/etcdcertsignercontroller.go @@ -69,6 +69,7 @@ type EtcdCertSignerController struct { secretInformer corev1informers.SecretInformer secretLister corev1listers.SecretLister secretClient corev1client.SecretsGetter + configMapClient corev1client.ConfigMapsGetter quorumChecker ceohelpers.QuorumChecker certConfig *certConfig @@ -151,6 +152,7 @@ func NewEtcdCertSignerController( secretInformer: secretInformer, secretLister: secretLister, secretClient: secretClient, + configMapClient: cmGetter, quorumChecker: quorumChecker, certConfig: certCfg, signerExpirationGauge: signerExpirationGauge, @@ -231,7 +233,8 @@ func (c *EtcdCertSignerController) syncAllMasterCertificates(ctx context.Context return fmt.Errorf("error on ensuring metrics-signer cert: %w", err) } - signerBundle, metricsSignerBundle, err := c.ensureBundles(ctx, []*crypto.CA{signerCaPair, newSignerCaPair}, []*crypto.CA{metricsSignerCaPair, newMetricsSignerCaPair}) + signerBundle, metricsSignerBundle, err := c.ensureBundles(ctx, recorder, + []*crypto.CA{signerCaPair, newSignerCaPair}, []*crypto.CA{metricsSignerCaPair, newMetricsSignerCaPair}) if err != nil { return fmt.Errorf("error on ensuring bundles: %w", err) } @@ -308,8 +311,8 @@ func (c *EtcdCertSignerController) syncAllMasterCertificates(ctx context.Context // ensureBundles will take the metrics and server CA certificates and ensure the bundle is updated. // This will cause a revision rollout if a change in the bundle was detected. -// TODO(thomas): that bundle rollout is not transactional: it's very likely the first update will already trigger a new revision, the metrics could come in a follow-up -func (c *EtcdCertSignerController) ensureBundles(ctx context.Context, serverCA []*crypto.CA, metricsCA []*crypto.CA) (serverBundle []*x509.Certificate, metricsBundle []*x509.Certificate, err error) { +func (c *EtcdCertSignerController) ensureBundles(ctx context.Context, recorder events.Recorder, + serverCA []*crypto.CA, metricsCA []*crypto.CA) (serverBundle []*x509.Certificate, metricsBundle []*x509.Certificate, err error) { for _, ca := range serverCA { serverBundle, err = c.certConfig.signerCaBundle.EnsureConfigMapCABundle(ctx, ca) if err != nil { @@ -317,6 +320,11 @@ func (c *EtcdCertSignerController) ensureBundles(ctx context.Context, serverCA [ } } + serverCaBytes, err := crypto.EncodeCertificates(serverBundle...) + if err != nil { + return nil, nil, fmt.Errorf("could not encode server bundle: %w", err) + } + for _, ca := range metricsCA { metricsBundle, err = c.certConfig.metricsSignerCaBundle.EnsureConfigMapCABundle(ctx, ca) if err != nil { @@ -324,6 +332,48 @@ func (c *EtcdCertSignerController) ensureBundles(ctx context.Context, serverCA [ } } + metricsCaBytes, err := crypto.EncodeCertificates(metricsBundle...) + if err != nil { + return nil, nil, fmt.Errorf("could not encode metrics bundle: %w", err) + } + + // Write a configmap that aggregates all certs for all nodes for the static + // pod controller to watch. A single configmap ensures that a bundle change + // (e.g. node addition or cert rotation) triggers at most one static pod + // rollout. If multiple configmaps were written, the static pod controller + // might initiate rollout before all configmaps had been updated. + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: operatorclient.TargetNamespace, + Name: tlshelpers.EtcdAllBundlesConfigMapName, + Annotations: map[string]string{ + apiannotations.OpenShiftComponent: "Etcd", + }, + }, + Data: map[string]string{ + "server-ca-bundle.crt": string(serverCaBytes), + "metrics-ca-bundle.crt": string(metricsCaBytes), + }, + } + + safe, err := c.quorumChecker.IsSafeToUpdateRevision() + if err != nil { + return nil, nil, fmt.Errorf("EtcdCertSignerController.ensureBundles can't evaluate whether quorum is safe: %w", err) + } + + if !safe { + return nil, nil, fmt.Errorf("skipping EtcdCertSignerController.ensureBundles reconciliation due to insufficient quorum") + } + _, changed, err := resourceapply.ApplyConfigMap(ctx, c.configMapClient, recorder, configMap) + if err != nil { + return nil, nil, fmt.Errorf("could not apply bundle configmap: %w", err) + } + + if changed { + // TODO(thomas): set the rollout bundle revision in the CEO status + // TODO(thomas): we need to notify the outside caller that it's not safe to rotate the leafs just yet + } + return } diff --git a/pkg/operator/etcdcertsigner/etcdcertsignercontroller_test.go b/pkg/operator/etcdcertsigner/etcdcertsignercontroller_test.go index e3baa9c73..12168e5e7 100644 --- a/pkg/operator/etcdcertsigner/etcdcertsignercontroller_test.go +++ b/pkg/operator/etcdcertsigner/etcdcertsignercontroller_test.go @@ -62,10 +62,11 @@ func TestSyncAllMasters(t *testing.T) { fakeKubeClient, controller, recorder := setupController(t, []runtime.Object{}) require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap := allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps := allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 3, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) assertExpirationMetric(t) } @@ -75,10 +76,11 @@ func TestNewNodeAdded(t *testing.T) { require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap := allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps := allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 3, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) _, err := fakeKubeClient.CoreV1().Nodes().Create(context.TODO(), u.FakeNode("master-3", u.WithMasterLabel(), u.WithNodeInternalIP("10.0.0.4")), metav1.CreateOptions{}) @@ -86,10 +88,11 @@ func TestNewNodeAdded(t *testing.T) { require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap = allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps = allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 4, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) } @@ -98,10 +101,11 @@ func TestNodeChangingIPs(t *testing.T) { require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap := allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps := allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 3, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) n, err := fakeKubeClient.CoreV1().Nodes().Get(context.TODO(), "master-1", metav1.GetOptions{}) @@ -113,10 +117,11 @@ func TestNodeChangingIPs(t *testing.T) { require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap = allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps = allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 3, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) } @@ -125,10 +130,11 @@ func TestClientCertsRemoval(t *testing.T) { require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap := allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps := allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 3, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) oldClientCert, err := fakeKubeClient.CoreV1().Secrets(operatorclient.TargetNamespace).Get(context.TODO(), tlshelpers.EtcdClientCertSecretName, metav1.GetOptions{}) @@ -144,10 +150,11 @@ func TestClientCertsRemoval(t *testing.T) { // this should regenerate the certificates require.NoError(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) - nodes, secretMap = allNodesAndSecrets(t, fakeKubeClient) + nodes, secretMap, configMaps = allNodesAndSecrets(t, fakeKubeClient) require.Equal(t, 3, len(nodes.Items)) assertNodeCerts(t, nodes, secretMap) assertStaticPodAllCerts(t, nodes, secretMap) + assertStaticPodAllBundles(t, configMaps) assertClientCerts(t, secretMap) // test that the secrets actually differ and the cert was regenerated require.NotEqual(t, oldClientCert.Data, secretMap[tlshelpers.EtcdClientCertSecretName]) @@ -162,7 +169,7 @@ func TestSecretApplyFailureSyncError(t *testing.T) { require.Error(t, controller.Sync(context.TODO(), factory.NewSyncContext("test", recorder))) } -func allNodesAndSecrets(t *testing.T, fakeKubeClient *fake.Clientset) (*corev1.NodeList, map[string]corev1.Secret) { +func allNodesAndSecrets(t *testing.T, fakeKubeClient *fake.Clientset) (*corev1.NodeList, map[string]corev1.Secret, map[string]corev1.ConfigMap) { nodes, err := fakeKubeClient.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) require.NoError(t, err) secrets, err := fakeKubeClient.CoreV1().Secrets(operatorclient.TargetNamespace).List(context.Background(), metav1.ListOptions{}) @@ -172,7 +179,15 @@ func allNodesAndSecrets(t *testing.T, fakeKubeClient *fake.Clientset) (*corev1.N for _, secret := range secrets.Items { secretMap[secret.Name] = secret } - return nodes, secretMap + + configMaps, err := fakeKubeClient.CoreV1().ConfigMaps(operatorclient.TargetNamespace).List(context.Background(), metav1.ListOptions{}) + require.NoError(t, err) + + configMapsMap := map[string]corev1.ConfigMap{} + for _, cm := range configMaps.Items { + configMapsMap[cm.Name] = cm + } + return nodes, secretMap, configMapsMap } func assertStaticPodAllCerts(t *testing.T, nodes *corev1.NodeList, secretMap map[string]corev1.Secret) { @@ -196,6 +211,18 @@ func assertStaticPodAllCerts(t *testing.T, nodes *corev1.NodeList, secretMap map }) } +func assertStaticPodAllBundles(t *testing.T, configMaps map[string]corev1.ConfigMap) { + cmName := tlshelpers.EtcdAllBundlesConfigMapName + t.Run(cmName, func(t *testing.T) { + allBundles, ok := configMaps[cmName] + require.Truef(t, ok, "expected configmaps/%s to exist", cmName) + // always should have server and metrics bundle + require.Equal(t, 2, len(allBundles.Data)) + require.NotNil(t, allBundles.Data["server-ca-bundle.crt"]) + require.NotNil(t, allBundles.Data["metrics-ca-bundle.crt"]) + }) +} + func assertNodeCerts(t *testing.T, nodes *corev1.NodeList, secretMap map[string]corev1.Secret) { // Cert secret per type per node for _, node := range nodes.Items { diff --git a/pkg/operator/starter.go b/pkg/operator/starter.go index 6650fd2ba..884a43c4d 100644 --- a/pkg/operator/starter.go +++ b/pkg/operator/starter.go @@ -615,12 +615,8 @@ func getEnabledDisabledFeatures(features featuregates.FeatureGate) ([]string, [] // the first element should be the configmap that contains the static pod manifest var RevisionConfigMaps = []revision.RevisionResource{ {Name: "etcd-pod"}, - - {Name: "etcd-serving-ca"}, - {Name: "etcd-peer-client-ca"}, - {Name: "etcd-metrics-proxy-serving-ca"}, - {Name: "etcd-metrics-proxy-client-ca"}, {Name: "etcd-endpoints"}, + {Name: "etcd-all-bundles"}, } // RevisionSecrets is a list of secrets that are directly copied for the current values. A different actor/controller modifies these. @@ -631,10 +627,7 @@ var RevisionSecrets = []revision.RevisionResource{ var CertConfigMaps = []installer.UnrevisionedResource{ {Name: "restore-etcd-pod"}, {Name: "etcd-scripts"}, - {Name: "etcd-serving-ca"}, - {Name: "etcd-peer-client-ca"}, - {Name: "etcd-metrics-proxy-serving-ca"}, - {Name: "etcd-metrics-proxy-client-ca"}, + {Name: "etcd-all-bundles"}, } var CertSecrets = []installer.UnrevisionedResource{ diff --git a/pkg/tlshelpers/tlshelpers.go b/pkg/tlshelpers/tlshelpers.go index 3eece55ac..15d331b4e 100644 --- a/pkg/tlshelpers/tlshelpers.go +++ b/pkg/tlshelpers/tlshelpers.go @@ -37,6 +37,7 @@ const ( EtcdMetricsSignerCertSecretName = "etcd-metric-signer" EtcdMetricsSignerCaBundleConfigMapName = "etcd-metrics-ca-bundle" EtcdAllCertsSecretName = "etcd-all-certs" + EtcdAllBundlesConfigMapName = "etcd-all-bundles" EtcdClientCertSecretName = "etcd-client" EtcdMetricsClientCertSecretName = "etcd-metric-client" ) From 7e71119e9d4a036381029284151bf5b433cdceb6 Mon Sep 17 00:00:00 2001 From: Thomas Jungblut Date: Mon, 10 Jun 2024 09:13:04 +0200 Subject: [PATCH 2/2] Throttle all operator events To avoid spamming pathological events due to hot looping controllers, we're setting a more strict limit on the amount of events emitted. Signed-off-by: Thomas Jungblut --- pkg/cmd/operator/cmd.go | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/operator/cmd.go b/pkg/cmd/operator/cmd.go index 8d42e50fb..0cba261f0 100644 --- a/pkg/cmd/operator/cmd.go +++ b/pkg/cmd/operator/cmd.go @@ -3,18 +3,21 @@ package operator import ( "context" "fmt" - "net/http" - "github.com/openshift/cluster-etcd-operator/pkg/operator" "github.com/openshift/cluster-etcd-operator/pkg/version" "github.com/openshift/library-go/pkg/controller/controllercmd" "github.com/spf13/cobra" + corev1 "k8s.io/api/core/v1" "k8s.io/apiserver/pkg/server/healthz" + "k8s.io/client-go/tools/record" + "net/http" + "strings" ) func NewOperator() *cobra.Command { cmd := controllercmd. NewControllerCommandConfig("openshift-cluster-etcd-operator", version.Get(), operator.RunOperator). + WithEventRecorderOptions(EtcdOperatorCorrelatorOptions()). WithHealthChecks(healthz.NamedCheck("controller-aliveness", func(_ *http.Request) error { if !operator.AlivenessChecker.Alive() { return fmt.Errorf("found unhealthy aliveness check, returning error") @@ -27,3 +30,17 @@ func NewOperator() *cobra.Command { return cmd } + +// EtcdOperatorCorrelatorOptions is a very strict correlator policy to avoid spamming etcd/apiserver with duplicated events +func EtcdOperatorCorrelatorOptions() record.CorrelatorOptions { + return record.CorrelatorOptions{ + // only allow the same event five times in 10m + MaxEvents: 5, + MaxIntervalInSeconds: 600, + BurstSize: 5, // default: 25 (change allows a single source to send 5 events about object per minute) + QPS: 1. / 300., // default: 1/300 (change allows refill rate to 1 new event every 300s) + KeyFunc: func(event *corev1.Event) (aggregateKey string, localKey string) { + return strings.Join([]string{event.Type, event.Reason, event.Message}, "_"), event.Message + }, + } +}