Skip to content

Commit

Permalink
Merge pull request #1209 from tjungblu/gc_old_node_certs
Browse files Browse the repository at this point in the history
ETCD-518: garbage collect unused dynamic certs
  • Loading branch information
openshift-merge-bot[bot] committed Mar 7, 2024
2 parents 5b9240f + d3610f5 commit 2f6e1de
Show file tree
Hide file tree
Showing 5 changed files with 370 additions and 11 deletions.
187 changes: 187 additions & 0 deletions pkg/operator/etcdcertcleaner/etcd_cert_cleaner_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
package etcdcertcleaner

import (
"context"
"fmt"
"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/health"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
"github.com/openshift/cluster-etcd-operator/pkg/tlshelpers"
"github.com/openshift/library-go/pkg/controller/factory"
"github.com/openshift/library-go/pkg/operator/events"
"github.com/openshift/library-go/pkg/operator/v1helpers"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
corev1informers "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
corev1listers "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"strings"
"time"
)

const DeletionGracePeriodAnnotation = "openshift.io/ceo-delete-grace-start"
const DeletionGracePeriod = 24 * 3 * time.Hour

var dynamicCertPrefixes = []string{
tlshelpers.GetPeerClientSecretNameForNode(""),
tlshelpers.GetServingSecretNameForNode(""),
tlshelpers.GetServingMetricsSecretNameForNode(""),
}

// EtcdCertCleanerController will clean up old and unused dynamically generated ("node-based") secrets.
// This is to avoid filling up etcd in environments where the control plane is rotated/scaled up and down very often.
type EtcdCertCleanerController struct {
eventRecorder events.Recorder
kubeClient kubernetes.Interface
operatorClient v1helpers.StaticPodOperatorClient
masterNodeLister corev1listers.NodeLister
masterNodeSelector labels.Selector
secretInformer corev1informers.SecretInformer
secretLister corev1listers.SecretLister
secretClient corev1client.SecretsGetter
quorumChecker ceohelpers.QuorumChecker
}

func NewEtcdCertCleanerController(
livenessChecker *health.MultiAlivenessChecker,
kubeClient kubernetes.Interface,
operatorClient v1helpers.StaticPodOperatorClient,
kubeInformers v1helpers.KubeInformersForNamespaces,
masterNodeLister corev1listers.NodeLister,
masterNodeSelector labels.Selector,
eventRecorder events.Recorder,
) factory.Controller {
secretInformer := kubeInformers.InformersFor(operatorclient.TargetNamespace).Core().V1().Secrets()
secretLister := secretInformer.Lister()
secretClient := v1helpers.CachedSecretGetter(kubeClient.CoreV1(), kubeInformers)

c := &EtcdCertCleanerController{
eventRecorder: eventRecorder,
kubeClient: kubeClient,
operatorClient: operatorClient,
masterNodeLister: masterNodeLister,
masterNodeSelector: masterNodeSelector,
secretInformer: secretInformer,
secretLister: secretLister,
secretClient: secretClient,
}

// we estimate the rate of control plane scales to maybe 1-2 a day, so running this once an hour with 6h of health threshold
// should be more than sufficient. Note that this controller does not sync on any informers.
syncer := health.NewCheckingSyncWrapper(c.sync, 6*time.Hour)
livenessChecker.Add("EtcdCertCleanerController", syncer)
return factory.New().ResyncEvery(1*time.Hour).WithSync(syncer.Sync).ToController("EtcdCertCleanerController", c.eventRecorder)
}

// sync will try to find unused cert secrets which are then marked for deletion, after three days the secrets are deleted.
// This control loop does not implement "undelete". In case the secrets become managed by CertSignerController again, the
// deletion label is getting removed automatically - in the worst case the cert is getting recreated entirely.
func (c *EtcdCertCleanerController) sync(ctx context.Context, _ factory.SyncContext) error {
secrets, err := c.findUnusedNodeBasedSecrets(ctx)
if err != nil {
return err
}

for _, secretToDelete := range secrets {
klog.Infof("deleting secret [%s] detected unused for %s", secretToDelete.Name, DeletionGracePeriod)
err := c.secretClient.Secrets(operatorclient.TargetNamespace).Delete(ctx, secretToDelete.Name, v1.DeleteOptions{})
if err != nil {
return fmt.Errorf("EtcdCertCleanerController: could not delete unused secret [%s], error was: %w", secretToDelete.Name, err)
}
}

return nil
}

func (c *EtcdCertCleanerController) findUnusedNodeBasedSecrets(ctx context.Context) ([]*corev1.Secret, error) {
secrets, err := c.secretLister.List(labels.Everything())
if err != nil {
return nil, fmt.Errorf("EtcdCertCleanerController: could not list secrets, error was: %w", err)
}

secrets = filterNodeBasedSecrets(secrets)
nodes, err := c.masterNodeLister.List(c.masterNodeSelector)
if err != nil {
return nil, fmt.Errorf("EtcdCertCleanerController: could not list control plane nodes, error was: %w", err)
}

var existingNodeSecrets []*corev1.Secret
for _, n := range nodes {
for i := 0; i < len(secrets); i++ {
if strings.HasSuffix(secrets[i].Name, n.Name) {
existingNodeSecrets = append(existingNodeSecrets, secrets[i])
}
}
}

for _, secretToUntag := range existingNodeSecrets {
if secretToUntag.Annotations == nil {
secretToUntag.Annotations = map[string]string{}
}
delete(secretToUntag.Annotations, DeletionGracePeriodAnnotation)
klog.Infof("unused secret [%s] became used again, removing label for grace period", secretToUntag.Name)
_, err := c.secretClient.Secrets(operatorclient.TargetNamespace).Update(ctx, secretToUntag, v1.UpdateOptions{})
if err != nil {
return nil, fmt.Errorf("EtcdCertCleanerController: could not untag used secret [%s], error was: %w", secretToUntag.Name, err)
}
}

var goneNodeSecrets []*corev1.Secret
for i := 0; i < len(secrets); i++ {
nodeFound := false
for _, n := range nodes {
if strings.HasSuffix(secrets[i].Name, n.Name) {
nodeFound = true
break
}
}

if !nodeFound {
goneNodeSecrets = append(goneNodeSecrets, secrets[i])
}
}

var secretsRipeForDeletion []*corev1.Secret
for _, secretToTag := range goneNodeSecrets {
if secretToTag.Annotations == nil {
secretToTag.Annotations = map[string]string{}
}

if _, ok := secretToTag.Annotations[DeletionGracePeriodAnnotation]; !ok {
klog.Infof("unused secret [%s] detected, label for grace period of %s", secretToTag.Name, DeletionGracePeriod)
secretToTag.Annotations[DeletionGracePeriodAnnotation] = time.Now().Format(time.RFC3339)
_, err := c.secretClient.Secrets(operatorclient.TargetNamespace).Update(ctx, secretToTag, v1.UpdateOptions{})
if err != nil {
return nil, fmt.Errorf("EtcdCertCleanerController: could not tag unused secret [%s], error was: %w", secretToTag.Name, err)
}
} else {
parsed, err := time.Parse(time.RFC3339, secretToTag.Annotations[DeletionGracePeriodAnnotation])
if err != nil {
return nil, fmt.Errorf("EtcdCertCleanerController: could not parse secret deletion grace period [%s], error was: %w", secretToTag.Name, err)
}

if time.Now().After(parsed.Add(DeletionGracePeriod)) {
secretsRipeForDeletion = append(secretsRipeForDeletion, secretToTag.DeepCopy())
}
}
}

return secretsRipeForDeletion, nil
}

func filterNodeBasedSecrets(secrets []*corev1.Secret) []*corev1.Secret {
var filtered []*corev1.Secret
for _, s := range secrets {
for _, prefix := range dynamicCertPrefixes {
if strings.HasPrefix(s.Name, prefix) {
filtered = append(filtered, s.DeepCopy())
break
}
}
}

return filtered
}
159 changes: 159 additions & 0 deletions pkg/operator/etcdcertcleaner/etcd_cert_cleaner_controller_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package etcdcertcleaner

import (
"context"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
u "github.com/openshift/cluster-etcd-operator/pkg/testutils"
"github.com/openshift/cluster-etcd-operator/pkg/tlshelpers"
"github.com/openshift/library-go/pkg/operator/v1helpers"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
k8stesting "k8s.io/client-go/testing"
"testing"
"time"
)

func TestPreDeleteLabelHappyPath(t *testing.T) {
cleaner := newMockCleaner(t, []runtime.Object{
u.FakeNode("master-0", u.WithMasterLabel()),
u.FakeSecret(
operatorclient.TargetNamespace,
tlshelpers.GetPeerClientSecretNameForNode("master-1"),
map[string][]byte{}),
})
err := cleaner.sync(context.Background(), nil)
require.NoError(t, err)

// secret should NOT be deleted
updated := false
deleted := false
fx := cleaner.kubeClient.(*fake.Clientset)
for _, action := range fx.Actions() {
if action.Matches("delete", "secrets") {
require.Equal(t, "etcd-peer-master-1", action.(k8stesting.DeleteAction).GetName())
deleted = true
} else if action.Matches("update", "secrets") {
// secret should be relabeled with the grace annotation though
obj := action.(k8stesting.UpdateAction).GetObject().(*corev1.Secret)
require.Equal(t, "etcd-peer-master-1", obj.Name)
require.Contains(t, obj.Annotations, DeletionGracePeriodAnnotation)
updated = true
}
}

require.Truef(t, updated, "expected secret to be updated, but only found actions: %v", fx.Actions())
require.Falsef(t, deleted, "expected secret to not be deleted, but only found actions: %v", fx.Actions())
}

func TestDeleteHappyPath(t *testing.T) {
cleaner := newMockCleaner(t, []runtime.Object{
u.FakeNode("master-0", u.WithMasterLabel()),
u.FakeSecretWithAnnotations(
operatorclient.TargetNamespace,
tlshelpers.GetPeerClientSecretNameForNode("master-1"),
map[string][]byte{},
map[string]string{DeletionGracePeriodAnnotation: time.Now().Add(DeletionGracePeriod * -2).Format(time.RFC3339)}),
})
err := cleaner.sync(context.Background(), nil)
require.NoError(t, err)

// secret should have been deleted
deleted := false
fx := cleaner.kubeClient.(*fake.Clientset)
for _, action := range fx.Actions() {
if action.Matches("delete", "secrets") {
require.Equal(t, "etcd-peer-master-1", action.(k8stesting.DeleteAction).GetName())
deleted = true
}
}

require.Truef(t, deleted, "expected secret to be deleted, but only found actions: %v", fx.Actions())
}

func TestNodeCameBackWithinGrace(t *testing.T) {
cleaner := newMockCleaner(t, []runtime.Object{
u.FakeNode("master-0", u.WithMasterLabel()),
u.FakeSecretWithAnnotations(
operatorclient.TargetNamespace,
tlshelpers.GetPeerClientSecretNameForNode("master-0"),
map[string][]byte{},
map[string]string{DeletionGracePeriodAnnotation: time.Now().Add(-DeletionGracePeriod / 2).Format(time.RFC3339)}),
})
err := cleaner.sync(context.Background(), nil)
require.NoError(t, err)

// secret should NOT be deleted
updated := false
deleted := false
fx := cleaner.kubeClient.(*fake.Clientset)
for _, action := range fx.Actions() {
if action.Matches("delete", "secrets") {
require.Equal(t, "etcd-peer-master-0", action.(k8stesting.DeleteAction).GetName())
deleted = true
} else if action.Matches("update", "secrets") {
// secret should be relabeled and have the grace annotation removed again
obj := action.(k8stesting.UpdateAction).GetObject().(*corev1.Secret)
require.Equal(t, "etcd-peer-master-0", obj.Name)
require.NotContains(t, obj.Annotations, DeletionGracePeriodAnnotation)
updated = true
}
}

require.Truef(t, updated, "expected secret to be updated, but only found actions: %v", fx.Actions())
require.Falsef(t, deleted, "expected secret to not be deleted, but only found actions: %v", fx.Actions())
}

func TestDeleteNoDanglingSecrets(t *testing.T) {
cleaner := newMockCleaner(t, []runtime.Object{
u.FakeNode("master-0", u.WithMasterLabel()),
u.FakeSecret(operatorclient.TargetNamespace, tlshelpers.GetPeerClientSecretNameForNode("master-0"), map[string][]byte{}),
u.FakeNode("master-1", u.WithMasterLabel()),
u.FakeSecret(operatorclient.TargetNamespace, tlshelpers.GetPeerClientSecretNameForNode("master-1"), map[string][]byte{}),
u.FakeNode("master-2", u.WithMasterLabel()),
u.FakeSecret(operatorclient.TargetNamespace, tlshelpers.GetPeerClientSecretNameForNode("master-2"), map[string][]byte{}),
})
secrets, err := cleaner.findUnusedNodeBasedSecrets(context.Background())
require.NoError(t, err)
require.Equal(t, 0, len(secrets))
}

func TestDeleteNoSecrets(t *testing.T) {
cleaner := newMockCleaner(t, []runtime.Object{
u.FakeNode("master-0", u.WithMasterLabel()),
u.FakeNode("master-1", u.WithMasterLabel()),
u.FakeNode("master-2", u.WithMasterLabel()),
})
secrets, err := cleaner.findUnusedNodeBasedSecrets(context.Background())
require.NoError(t, err)
require.Equal(t, 0, len(secrets))
}

func newMockCleaner(t *testing.T, objects []runtime.Object) *EtcdCertCleanerController {
fakeKubeClient := fake.NewSimpleClientset(objects...)
kubeInformers := v1helpers.NewKubeInformersForNamespaces(fakeKubeClient, "", operatorclient.TargetNamespace)
nodeSelector, err := labels.Parse("node-role.kubernetes.io/master")
require.NoError(t, err)

cc := &EtcdCertCleanerController{
kubeClient: fakeKubeClient,
secretClient: fakeKubeClient.CoreV1(),
secretLister: kubeInformers.InformersFor(operatorclient.TargetNamespace).Core().V1().Secrets().Lister(),
masterNodeLister: kubeInformers.InformersFor("").Core().V1().Nodes().Lister(),
masterNodeSelector: nodeSelector,
}

stopChan := make(chan struct{})
t.Cleanup(func() {
close(stopChan)
})

kubeInformers.Start(stopChan)
for ns := range kubeInformers.Namespaces() {
kubeInformers.InformersFor(ns).WaitForCacheSync(stopChan)
}

return cc
}
12 changes: 12 additions & 0 deletions pkg/operator/starter.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package operator
import (
"context"
"fmt"
"github.com/openshift/cluster-etcd-operator/pkg/operator/etcdcertcleaner"
"os"
"regexp"
"time"
Expand Down Expand Up @@ -364,6 +365,16 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
quorumChecker,
)

etcdCertCleanerController := etcdcertcleaner.NewEtcdCertCleanerController(
AlivenessChecker,
coreClient,
operatorClient,
kubeInformersForNamespaces,
masterNodeLister,
masterNodeLabelSelector,
controllerContext.EventRecorder,
)

etcdEndpointsController := etcdendpointscontroller.NewEtcdEndpointsController(
AlivenessChecker,
operatorClient,
Expand Down Expand Up @@ -560,6 +571,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
go staticResourceController.Run(ctx, 1)
go targetConfigReconciler.Run(ctx, 1)
go etcdCertSignerController.Run(ctx, 1)
go etcdCertCleanerController.Run(ctx, 1)
go etcdEndpointsController.Run(ctx, 1)
go resourceSyncController.Run(ctx, 1)
go statusController.Run(ctx, 1)
Expand Down
12 changes: 12 additions & 0 deletions pkg/testutils/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,18 @@ func FakeSecret(namespace, name string, cert map[string][]byte) *corev1.Secret {
return secret
}

func FakeSecretWithAnnotations(namespace, name string, cert map[string][]byte, annotations map[string]string) *corev1.Secret {
secret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
Name: name,
Annotations: annotations,
},
Data: cert,
}
return secret
}

func ClusterConfigConfigMap(maxLearner int) *corev1.ConfigMap {
installConfig := map[string]interface{}{
"ControlPlane": map[string]interface{}{
Expand Down

0 comments on commit 2f6e1de

Please sign in to comment.