From ba0842134ce21ad77874adc5cb1ebe7cf5794283 Mon Sep 17 00:00:00 2001 From: Salvatore Dario Minonne Date: Mon, 20 Apr 2026 14:17:46 +0200 Subject: [PATCH 1/2] fix(aws): prevent IAM resource leaks during cluster destroy When destroying an AWS cluster, failures at any step in the cleanup chain would cause all subsequent steps to be skipped, leaking IAM resources (OIDC providers, IAM roles, instance profiles). Change destroyPlatformSpecifics, DestroyIAM, DestroyOIDCResources, and DestroySharedVPCRoles to collect errors and attempt all cleanup steps instead of returning on the first failure. This ensures that e.g. an infrastructure destroy failure does not prevent IAM cleanup, and a single role deletion failure does not skip the remaining roles. Co-Authored-By: Claude Opus 4.6 --- cmd/cluster/aws/destroy.go | 8 ++- cmd/infra/aws/destroy_iam.go | 95 ++++++++++++++++--------------- cmd/infra/aws/destroy_iam_test.go | 9 ++- 3 files changed, 60 insertions(+), 52 deletions(-) diff --git a/cmd/cluster/aws/destroy.go b/cmd/cluster/aws/destroy.go index 602d9f1f26c..2c5b9f3359f 100644 --- a/cmd/cluster/aws/destroy.go +++ b/cmd/cluster/aws/destroy.go @@ -75,6 +75,8 @@ func destroyPlatformSpecifics(ctx context.Context, o *core.DestroyOptions) error } } + var errs []error + o.Log.Info("Destroying infrastructure", "infraID", infraID) destroyInfraOpts := awsinfra.DestroyInfraOptions{ Region: region, @@ -91,7 +93,7 @@ func destroyPlatformSpecifics(ctx context.Context, o *core.DestroyOptions) error PrivateZonesInClusterAccount: o.AWSPlatform.PrivateZonesInClusterAccount, } if err := destroyInfraOpts.Run(ctx); err != nil { - return fmt.Errorf("failed to destroy infrastructure: %w", err) + errs = append(errs, fmt.Errorf("failed to destroy infrastructure: %w", err)) } if !o.AWSPlatform.PreserveIAM { @@ -106,10 +108,10 @@ func destroyPlatformSpecifics(ctx context.Context, o *core.DestroyOptions) error PrivateZonesInClusterAccount: o.AWSPlatform.PrivateZonesInClusterAccount, } if err := destroyOpts.Run(ctx); err != nil { - return fmt.Errorf("failed to destroy IAM: %w", err) + errs = append(errs, fmt.Errorf("failed to destroy IAM: %w", err)) } } - return nil + return errors.NewAggregate(errs) } func DestroyCluster(ctx context.Context, o *core.DestroyOptions) error { diff --git a/cmd/infra/aws/destroy_iam.go b/cmd/infra/aws/destroy_iam.go index 99b56a4e1b1..19e978627cb 100644 --- a/cmd/infra/aws/destroy_iam.go +++ b/cmd/infra/aws/destroy_iam.go @@ -16,6 +16,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/iam" iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" + utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/wait" "github.com/go-logr/logr" @@ -95,29 +96,31 @@ func (o *DestroyIAMOptions) DestroyIAM(ctx context.Context) error { o.Retryer = awsConfig() }) - err = o.DestroyOIDCResources(ctx, iamClient) - if err != nil { - return err + // Attempt all IAM cleanup steps even if some fail, to avoid leaking + // IAM resources when one step encounters an error. + var errs []error + + if err = o.DestroyOIDCResources(ctx, iamClient); err != nil { + errs = append(errs, err) } - err = o.DestroyWorkerInstanceProfile(ctx, iamClient) - if err != nil { - return err + if err = o.DestroyWorkerInstanceProfile(ctx, iamClient); err != nil { + errs = append(errs, err) } if o.VPCOwnerCredentialsOpts.AWSCredentialsFile != "" { vpcOwnerAWSSession, err := o.VPCOwnerCredentialsOpts.GetSession(ctx, "cli-destroy-iam", nil, o.Region) if err != nil { - return err + return utilerrors.NewAggregate(append(errs, err)) } vpcOwnerIAMClient := iam.NewFromConfig(*vpcOwnerAWSSession, func(o *iam.Options) { o.Retryer = awsConfig() }) if err = o.DestroySharedVPCRoles(ctx, iamClient, vpcOwnerIAMClient); err != nil { - return err + errs = append(errs, err) } } - return nil + return utilerrors.NewAggregate(errs) } func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient awsapi.IAMAPI) error { @@ -126,6 +129,8 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient return err } + var errs []error + for _, provider := range oidcProviderList.OpenIDConnectProviderList { // OIDC Provider ARN is of the form arn:aws:iam:::oidc-provider/hypershift-ci-2-oidc.s3.us-east-1.amazonaws.com/ arnTokens := strings.Split(*provider.Arn, "/") @@ -138,7 +143,7 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient var nse *iamtypes.NoSuchEntityException if !errors.As(err, &nse) { o.Log.Error(err, "Error deleting OIDC provider", "providerARN", provider.Arn) - return err + errs = append(errs, fmt.Errorf("failed to delete OIDC provider %s: %w", *provider.Arn, err)) } } else { o.Log.Info("Deleted OIDC provider", "providerARN", provider.Arn) @@ -150,43 +155,39 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient // Delete the shared role removed := false if removed, err = o.DestroyOIDCRole(ctx, iamClient, "shared-role"); err != nil { - return err + return utilerrors.NewAggregate(append(errs, err)) } if removed { // The cluster was created with a single shared role, so we are done. // Save on additional API calls and just return here. - return nil - } - // Delete individual component roles - if err = o.DestroyOIDCRoleWithRetry(ctx, iamClient, "openshift-ingress"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "openshift-image-registry"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "aws-ebs-csi-driver-controller"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "cloud-controller"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "node-pool"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "control-plane-operator"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "cloud-network-config-controller"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "kms-provider"); err != nil { - return err - } - if _, err = o.DestroyOIDCRole(ctx, iamClient, "karpenter"); err != nil { - return err + return utilerrors.NewAggregate(errs) + } + // Delete individual component roles. Attempt all deletions even if some + // fail, to avoid leaking IAM roles. + roleNames := []string{ + "openshift-ingress", + "openshift-image-registry", + "aws-ebs-csi-driver-controller", + "cloud-controller", + "node-pool", + "control-plane-operator", + "cloud-network-config-controller", + "kms-provider", + "karpenter", + } + for _, name := range roleNames { + if name == "openshift-ingress" { + if err := o.DestroyOIDCRoleWithRetry(ctx, iamClient, name); err != nil { + errs = append(errs, err) + } + } else { + if _, err := o.DestroyOIDCRole(ctx, iamClient, name); err != nil { + errs = append(errs, err) + } + } } - return nil + return utilerrors.NewAggregate(errs) } // DestroyOIDCRoleWithRetry retries the entire DestroyOIDCRole operation if it fails due to attached policies @@ -368,16 +369,16 @@ func (o *DestroyIAMOptions) DestroyWorkerInstanceProfile(ctx context.Context, cl } func (o *DestroyIAMOptions) DestroySharedVPCRoles(ctx context.Context, iamClient, vpcOwnerIAMClient awsapi.IAMAPI) error { - var err error ingressRoleClient := vpcOwnerIAMClient if o.PrivateZonesInClusterAccount { ingressRoleClient = iamClient } - if _, err = o.DestroyOIDCRole(ctx, ingressRoleClient, "shared-vpc-ingress"); err != nil { - return err + var errs []error + if _, err := o.DestroyOIDCRole(ctx, ingressRoleClient, "shared-vpc-ingress"); err != nil { + errs = append(errs, err) } - if _, err = o.DestroyOIDCRole(ctx, vpcOwnerIAMClient, "shared-vpc-control-plane"); err != nil { - return err + if _, err := o.DestroyOIDCRole(ctx, vpcOwnerIAMClient, "shared-vpc-control-plane"); err != nil { + errs = append(errs, err) } - return nil + return utilerrors.NewAggregate(errs) } diff --git a/cmd/infra/aws/destroy_iam_test.go b/cmd/infra/aws/destroy_iam_test.go index 004664869a9..98423e5110d 100644 --- a/cmd/infra/aws/destroy_iam_test.go +++ b/cmd/infra/aws/destroy_iam_test.go @@ -466,7 +466,7 @@ func TestDestroyOIDCResources(t *testing.T) { errorContains: "api error", }, { - name: "When DeleteOpenIDConnectProvider fails with a non-NSE error it should return the error", + name: "When DeleteOpenIDConnectProvider fails with a non-NSE error it should still attempt role cleanup and return the error", setupMock: func(m *awsapi.MockIAMAPI) { gomock.InOrder( m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()). @@ -477,6 +477,9 @@ func TestDestroyOIDCResources(t *testing.T) { }, nil), m.EXPECT().DeleteOpenIDConnectProvider(gomock.Any(), gomock.Any(), gomock.Any()). Return(nil, errors.New("permission denied")), + // continues to role cleanup; shared-role + 9 component roles all not found + m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil, noSuchEntity()).Times(totalRoleChecks), ) }, expectError: true, @@ -546,12 +549,14 @@ func TestDestroySharedVPCRoles(t *testing.T) { }, }, { - name: "When destroying the ingress role fails it should return the error", + name: "When destroying the ingress role fails it should still attempt control-plane role and return the error", privateZonesInCluster: false, setupIAMMock: func(_ *awsapi.MockIAMAPI) {}, setupVPCOwnerMock: func(m *awsapi.MockIAMAPI) { m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). Return(nil, errors.New("api error")) + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(cpRoleName)}, gomock.Any()). + Return(nil, noSuchEntity()) }, expectError: true, errorContains: "cannot check for existing role", From 100b927b686382f8d2802528133991cf8011a236 Mon Sep 17 00:00:00 2001 From: Salvatore Dario Minonne Date: Mon, 27 Apr 2026 14:15:28 +0200 Subject: [PATCH 2/2] fix(aws): address review feedback on IAM destroy error aggregation Fix shared-role early return that still skipped per-component role cleanup on error, extract openshift-ingress retry handling out of the main role loop into a separate roleNamesWithRetry list, and add tests for multiple simultaneous deletion failures to verify proper error aggregation. Co-Authored-By: Claude Opus 4.6 --- cmd/infra/aws/destroy_iam.go | 25 ++++++---- cmd/infra/aws/destroy_iam_test.go | 77 +++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 14 deletions(-) diff --git a/cmd/infra/aws/destroy_iam.go b/cmd/infra/aws/destroy_iam.go index 19e978627cb..1f70facca8a 100644 --- a/cmd/infra/aws/destroy_iam.go +++ b/cmd/infra/aws/destroy_iam.go @@ -155,7 +155,7 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient // Delete the shared role removed := false if removed, err = o.DestroyOIDCRole(ctx, iamClient, "shared-role"); err != nil { - return utilerrors.NewAggregate(append(errs, err)) + errs = append(errs, err) } if removed { // The cluster was created with a single shared role, so we are done. @@ -164,8 +164,19 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient } // Delete individual component roles. Attempt all deletions even if some // fail, to avoid leaking IAM roles. - roleNames := []string{ + // + // Roles that require retry (e.g. due to eventually-consistent policy + // detachment) are handled first via DestroyOIDCRoleWithRetry. + roleNamesWithRetry := []string{ "openshift-ingress", + } + for _, name := range roleNamesWithRetry { + if err := o.DestroyOIDCRoleWithRetry(ctx, iamClient, name); err != nil { + errs = append(errs, err) + } + } + + roleNames := []string{ "openshift-image-registry", "aws-ebs-csi-driver-controller", "cloud-controller", @@ -176,14 +187,8 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient "karpenter", } for _, name := range roleNames { - if name == "openshift-ingress" { - if err := o.DestroyOIDCRoleWithRetry(ctx, iamClient, name); err != nil { - errs = append(errs, err) - } - } else { - if _, err := o.DestroyOIDCRole(ctx, iamClient, name); err != nil { - errs = append(errs, err) - } + if _, err := o.DestroyOIDCRole(ctx, iamClient, name); err != nil { + errs = append(errs, err) } } diff --git a/cmd/infra/aws/destroy_iam_test.go b/cmd/infra/aws/destroy_iam_test.go index 98423e5110d..18d755ff4c4 100644 --- a/cmd/infra/aws/destroy_iam_test.go +++ b/cmd/infra/aws/destroy_iam_test.go @@ -393,10 +393,11 @@ func TestDestroyOIDCResources(t *testing.T) { const totalRoleChecks = 10 tests := []struct { - name string - setupMock func(*awsapi.MockIAMAPI) - expectError bool - errorContains string + name string + setupMock func(*awsapi.MockIAMAPI) + expectError bool + errorContains string + errorContainsAll []string }{ { name: "When OIDC provider matches infraID and shared role exists it should delete the provider and return early", @@ -485,6 +486,54 @@ func TestDestroyOIDCResources(t *testing.T) { expectError: true, errorContains: "permission denied", }, + { + name: "When shared-role fails it should still attempt all component role deletions and aggregate errors", + setupMock: func(m *awsapi.MockIAMAPI) { + sharedRoleName := testInfraID + "-shared-role" + ingressRoleName := testInfraID + "-openshift-ingress" + gomock.InOrder( + m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()). + Return(&iam.ListOpenIDConnectProvidersOutput{}, nil), + // shared-role GetRole fails + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(sharedRoleName)}, gomock.Any()). + Return(nil, errors.New("shared-role api error")), + // continues to component roles; openshift-ingress also fails + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). + Return(nil, errors.New("ingress api error")), + // remaining 8 component roles not found + m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil, noSuchEntity()).Times(8), + ) + }, + expectError: true, + errorContainsAll: []string{"shared-role api error", "ingress api error"}, + }, + { + name: "When multiple component role deletions fail it should aggregate all errors", + setupMock: func(m *awsapi.MockIAMAPI) { + sharedRoleName := testInfraID + "-shared-role" + ingressRoleName := testInfraID + "-openshift-ingress" + registryRoleName := testInfraID + "-openshift-image-registry" + gomock.InOrder( + m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()). + Return(&iam.ListOpenIDConnectProvidersOutput{}, nil), + // shared-role not found + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(sharedRoleName)}, gomock.Any()). + Return(nil, noSuchEntity()), + // openshift-ingress fails + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). + Return(nil, errors.New("ingress api error")), + // openshift-image-registry fails + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(registryRoleName)}, gomock.Any()). + Return(nil, errors.New("registry api error")), + // remaining 7 component roles not found + m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil, noSuchEntity()).Times(7), + ) + }, + expectError: true, + errorContainsAll: []string{"ingress api error", "registry api error"}, + }, } for _, tt := range tests { @@ -502,6 +551,9 @@ func TestDestroyOIDCResources(t *testing.T) { if tt.errorContains != "" { g.Expect(err.Error()).To(ContainSubstring(tt.errorContains)) } + for _, substr := range tt.errorContainsAll { + g.Expect(err.Error()).To(ContainSubstring(substr)) + } } else { g.Expect(err).NotTo(HaveOccurred()) } @@ -522,6 +574,7 @@ func TestDestroySharedVPCRoles(t *testing.T) { setupVPCOwnerMock func(*awsapi.MockIAMAPI) expectError bool errorContains string + errorContainsAll []string }{ { name: "When PrivateZonesInClusterAccount is false ingress role should use vpcOwnerClient", @@ -576,6 +629,19 @@ func TestDestroySharedVPCRoles(t *testing.T) { expectError: true, errorContains: "cannot check for existing role", }, + { + name: "When both ingress and control-plane role deletions fail it should aggregate all errors", + privateZonesInCluster: false, + setupIAMMock: func(_ *awsapi.MockIAMAPI) {}, + setupVPCOwnerMock: func(m *awsapi.MockIAMAPI) { + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). + Return(nil, errors.New("ingress api error")) + m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(cpRoleName)}, gomock.Any()). + Return(nil, errors.New("cp api error")) + }, + expectError: true, + errorContainsAll: []string{"ingress api error", "cp api error"}, + }, } for _, tt := range tests { @@ -599,6 +665,9 @@ func TestDestroySharedVPCRoles(t *testing.T) { if tt.errorContains != "" { g.Expect(err.Error()).To(ContainSubstring(tt.errorContains)) } + for _, substr := range tt.errorContainsAll { + g.Expect(err.Error()).To(ContainSubstring(substr)) + } } else { g.Expect(err).NotTo(HaveOccurred()) }