Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions cmd/cluster/aws/destroy.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ func destroyPlatformSpecifics(ctx context.Context, o *core.DestroyOptions) error
}
}

var errs []error

o.Log.Info("Destroying infrastructure", "infraID", infraID)
destroyInfraOpts := awsinfra.DestroyInfraOptions{
Region: region,
Expand All @@ -91,7 +93,7 @@ func destroyPlatformSpecifics(ctx context.Context, o *core.DestroyOptions) error
PrivateZonesInClusterAccount: o.AWSPlatform.PrivateZonesInClusterAccount,
}
if err := destroyInfraOpts.Run(ctx); err != nil {
return fmt.Errorf("failed to destroy infrastructure: %w", err)
errs = append(errs, fmt.Errorf("failed to destroy infrastructure: %w", err))
}

if !o.AWSPlatform.PreserveIAM {
Expand All @@ -106,10 +108,10 @@ func destroyPlatformSpecifics(ctx context.Context, o *core.DestroyOptions) error
PrivateZonesInClusterAccount: o.AWSPlatform.PrivateZonesInClusterAccount,
}
if err := destroyOpts.Run(ctx); err != nil {
return fmt.Errorf("failed to destroy IAM: %w", err)
errs = append(errs, fmt.Errorf("failed to destroy IAM: %w", err))
}
}
return nil
return errors.NewAggregate(errs)
}

func DestroyCluster(ctx context.Context, o *core.DestroyOptions) error {
Expand Down
96 changes: 51 additions & 45 deletions cmd/infra/aws/destroy_iam.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/aws/aws-sdk-go-v2/service/iam"
iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types"

utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -95,29 +96,31 @@ func (o *DestroyIAMOptions) DestroyIAM(ctx context.Context) error {
o.Retryer = awsConfig()
})

err = o.DestroyOIDCResources(ctx, iamClient)
if err != nil {
return err
// Attempt all IAM cleanup steps even if some fail, to avoid leaking
// IAM resources when one step encounters an error.
var errs []error

if err = o.DestroyOIDCResources(ctx, iamClient); err != nil {
errs = append(errs, err)
}
err = o.DestroyWorkerInstanceProfile(ctx, iamClient)
if err != nil {
return err
if err = o.DestroyWorkerInstanceProfile(ctx, iamClient); err != nil {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inside the DestroyWorkerInstanceProfile function, we're still returning early instead of using the error aggregation pattern. Does it make sense to move it to the same strategy, or should we keep the return-early approach?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm unsure. I would say here we have to return immediately.

errs = append(errs, err)
}

if o.VPCOwnerCredentialsOpts.AWSCredentialsFile != "" {
vpcOwnerAWSSession, err := o.VPCOwnerCredentialsOpts.GetSession(ctx, "cli-destroy-iam", nil, o.Region)
if err != nil {
return err
return utilerrors.NewAggregate(append(errs, err))
}
Comment on lines 110 to 114
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai Bot Apr 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

shared-vpc-ingress can still leak when VPC-owner session creation fails.

At Line 113, returning immediately skips any shared-VPC cleanup attempt. In PrivateZonesInClusterAccount=true, shared-vpc-ingress lives in the cluster account and can still be deleted with iamClient, but this path never attempts it.

Suggested fix
  if o.VPCOwnerCredentialsOpts.AWSCredentialsFile != "" {
      vpcOwnerAWSSession, err := o.VPCOwnerCredentialsOpts.GetSession(ctx, "cli-destroy-iam", nil, o.Region)
      if err != nil {
-         return utilerrors.NewAggregate(append(errs, err))
+         errs = append(errs, fmt.Errorf("failed to create VPC owner IAM session: %w", err))
+         // In private-zones mode, ingress role is in cluster account; still attempt cleanup.
+         if o.PrivateZonesInClusterAccount {
+             if _, ingressErr := o.DestroyOIDCRole(ctx, iamClient, "shared-vpc-ingress"); ingressErr != nil {
+                 errs = append(errs, ingressErr)
+             }
+         }
+         return utilerrors.NewAggregate(errs)
      }
      vpcOwnerIAMClient := iam.NewFromConfig(*vpcOwnerAWSSession, func(o *iam.Options) {
          o.Retryer = awsConfig()
      })
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cmd/infra/aws/destroy_iam.go` around lines 110 - 114, When creating the
VPC-owner AWS session fails (the block using
VPCOwnerCredentialsOpts.AWSCredentialsFile and GetSession which currently
returns early with utilerrors.NewAggregate), do not return immediately; instead
append the session error into errs and continue so we still attempt shared-VPC
cleanup using the cluster-account iamClient when PrivateZonesInClusterAccount is
true. Update the code around vpcOwnerAWSSession/GetSession to collect the error
(add it to errs) and proceed, guarding any use of vpcOwnerAWSSession when nil,
and ensure the shared-VPC ingress deletion path (which uses iamClient and the
PrivateZonesInClusterAccount flag) runs even if GetSession failed.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a valid observation, but I'm intentionally not addressing it in this PR. Here's why:

  1. When GetSession for the VPC owner fails, it indicates a fundamental credentials problem. Proceeding with partial cleanup (only the ingress role) while silently skipping the shared-vpc-control-plane role cleanup would leave things in an inconsistent state.

  2. The error aggregation pattern in DestroyIAM already ensures the VPC session error is reported to the caller. The outer Run method retries on retryable errors, so transient credential issues will be retried.

  3. The PrivateZonesInClusterAccount + VPC owner session failure is an edge case that would require refactoring DestroyIAM to make GetSession mockable for proper testing. That's out of scope for this bug fix PR.

If we see this scenario causing real leaks in production, we can address it in a follow-up.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sdminonne, thanks for the detailed explanation — the reasoning is sound. You're right that attempting partial shared-VPC cleanup when the VPC-owner credentials are fundamentally broken could leave things in a worse, harder-to-reason-about state than a clean failure that gets retried. The existing retry loop in Run() is the appropriate safety net for transient credential issues.

Would you like me to open a follow-up GitHub issue to track the PrivateZonesInClusterAccount + VPC-owner session-failure edge case so it doesn't get lost?


📝 Learnings were identified and not saved (knowledge base disabled). Enable

vpcOwnerIAMClient := iam.NewFromConfig(*vpcOwnerAWSSession, func(o *iam.Options) {
o.Retryer = awsConfig()
})
if err = o.DestroySharedVPCRoles(ctx, iamClient, vpcOwnerIAMClient); err != nil {
return err
errs = append(errs, err)
}
}

return nil
return utilerrors.NewAggregate(errs)
}

func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient awsapi.IAMAPI) error {
Expand All @@ -126,6 +129,8 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient
return err
}

var errs []error

for _, provider := range oidcProviderList.OpenIDConnectProviderList {
// OIDC Provider ARN is of the form arn:aws:iam::<account-id>:oidc-provider/hypershift-ci-2-oidc.s3.us-east-1.amazonaws.com/<infra-id>
arnTokens := strings.Split(*provider.Arn, "/")
Expand All @@ -138,7 +143,7 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient
var nse *iamtypes.NoSuchEntityException
if !errors.As(err, &nse) {
o.Log.Error(err, "Error deleting OIDC provider", "providerARN", provider.Arn)
return err
errs = append(errs, fmt.Errorf("failed to delete OIDC provider %s: %w", *provider.Arn, err))
}
} else {
o.Log.Info("Deleted OIDC provider", "providerARN", provider.Arn)
Expand All @@ -150,43 +155,44 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient
// Delete the shared role
removed := false
if removed, err = o.DestroyOIDCRole(ctx, iamClient, "shared-role"); err != nil {
return err
errs = append(errs, err)
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if removed {
// The cluster was created with a single shared role, so we are done.
// Save on additional API calls and just return here.
return nil
}
// Delete individual component roles
if err = o.DestroyOIDCRoleWithRetry(ctx, iamClient, "openshift-ingress"); err != nil {
return err
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "openshift-image-registry"); err != nil {
return err
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "aws-ebs-csi-driver-controller"); err != nil {
return err
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "cloud-controller"); err != nil {
return err
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "node-pool"); err != nil {
return err
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "control-plane-operator"); err != nil {
return err
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "cloud-network-config-controller"); err != nil {
return err
return utilerrors.NewAggregate(errs)
}
// Delete individual component roles. Attempt all deletions even if some
// fail, to avoid leaking IAM roles.
//
// Roles that require retry (e.g. due to eventually-consistent policy
// detachment) are handled first via DestroyOIDCRoleWithRetry.
roleNamesWithRetry := []string{
"openshift-ingress",
}
for _, name := range roleNamesWithRetry {
if err := o.DestroyOIDCRoleWithRetry(ctx, iamClient, name); err != nil {
errs = append(errs, err)
}
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "kms-provider"); err != nil {
return err

roleNames := []string{
"openshift-image-registry",
"aws-ebs-csi-driver-controller",
"cloud-controller",
"node-pool",
"control-plane-operator",
"cloud-network-config-controller",
"kms-provider",
"karpenter",
}
if _, err = o.DestroyOIDCRole(ctx, iamClient, "karpenter"); err != nil {
return err
for _, name := range roleNames {
if _, err := o.DestroyOIDCRole(ctx, iamClient, name); err != nil {
errs = append(errs, err)
}
}

return nil
return utilerrors.NewAggregate(errs)
}

// DestroyOIDCRoleWithRetry retries the entire DestroyOIDCRole operation if it fails due to attached policies
Expand Down Expand Up @@ -368,16 +374,16 @@ func (o *DestroyIAMOptions) DestroyWorkerInstanceProfile(ctx context.Context, cl
}

func (o *DestroyIAMOptions) DestroySharedVPCRoles(ctx context.Context, iamClient, vpcOwnerIAMClient awsapi.IAMAPI) error {
var err error
ingressRoleClient := vpcOwnerIAMClient
if o.PrivateZonesInClusterAccount {
ingressRoleClient = iamClient
}
if _, err = o.DestroyOIDCRole(ctx, ingressRoleClient, "shared-vpc-ingress"); err != nil {
return err
var errs []error
if _, err := o.DestroyOIDCRole(ctx, ingressRoleClient, "shared-vpc-ingress"); err != nil {
errs = append(errs, err)
}
if _, err = o.DestroyOIDCRole(ctx, vpcOwnerIAMClient, "shared-vpc-control-plane"); err != nil {
return err
if _, err := o.DestroyOIDCRole(ctx, vpcOwnerIAMClient, "shared-vpc-control-plane"); err != nil {
errs = append(errs, err)
}
return nil
return utilerrors.NewAggregate(errs)
}
86 changes: 80 additions & 6 deletions cmd/infra/aws/destroy_iam_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,11 @@ func TestDestroyOIDCResources(t *testing.T) {
const totalRoleChecks = 10

tests := []struct {
name string
setupMock func(*awsapi.MockIAMAPI)
expectError bool
errorContains string
name string
setupMock func(*awsapi.MockIAMAPI)
expectError bool
errorContains string
errorContainsAll []string
}{
{
name: "When OIDC provider matches infraID and shared role exists it should delete the provider and return early",
Expand Down Expand Up @@ -466,7 +467,7 @@ func TestDestroyOIDCResources(t *testing.T) {
errorContains: "api error",
},
{
name: "When DeleteOpenIDConnectProvider fails with a non-NSE error it should return the error",
name: "When DeleteOpenIDConnectProvider fails with a non-NSE error it should still attempt role cleanup and return the error",
setupMock: func(m *awsapi.MockIAMAPI) {
gomock.InOrder(
m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()).
Expand All @@ -477,11 +478,62 @@ func TestDestroyOIDCResources(t *testing.T) {
}, nil),
m.EXPECT().DeleteOpenIDConnectProvider(gomock.Any(), gomock.Any(), gomock.Any()).
Return(nil, errors.New("permission denied")),
// continues to role cleanup; shared-role + 9 component roles all not found
m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()).
Return(nil, noSuchEntity()).Times(totalRoleChecks),
)
},
expectError: true,
errorContains: "permission denied",
},
{
name: "When shared-role fails it should still attempt all component role deletions and aggregate errors",
setupMock: func(m *awsapi.MockIAMAPI) {
sharedRoleName := testInfraID + "-shared-role"
ingressRoleName := testInfraID + "-openshift-ingress"
gomock.InOrder(
m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()).
Return(&iam.ListOpenIDConnectProvidersOutput{}, nil),
// shared-role GetRole fails
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(sharedRoleName)}, gomock.Any()).
Return(nil, errors.New("shared-role api error")),
// continues to component roles; openshift-ingress also fails
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()).
Return(nil, errors.New("ingress api error")),
// remaining 8 component roles not found
m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()).
Return(nil, noSuchEntity()).Times(8),
)
},
expectError: true,
errorContainsAll: []string{"shared-role api error", "ingress api error"},
},
{
name: "When multiple component role deletions fail it should aggregate all errors",
setupMock: func(m *awsapi.MockIAMAPI) {
sharedRoleName := testInfraID + "-shared-role"
ingressRoleName := testInfraID + "-openshift-ingress"
registryRoleName := testInfraID + "-openshift-image-registry"
gomock.InOrder(
m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()).
Return(&iam.ListOpenIDConnectProvidersOutput{}, nil),
// shared-role not found
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(sharedRoleName)}, gomock.Any()).
Return(nil, noSuchEntity()),
// openshift-ingress fails
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()).
Return(nil, errors.New("ingress api error")),
// openshift-image-registry fails
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(registryRoleName)}, gomock.Any()).
Return(nil, errors.New("registry api error")),
// remaining 7 component roles not found
m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()).
Return(nil, noSuchEntity()).Times(7),
)
},
expectError: true,
errorContainsAll: []string{"ingress api error", "registry api error"},
},
}

for _, tt := range tests {
Expand All @@ -499,6 +551,9 @@ func TestDestroyOIDCResources(t *testing.T) {
if tt.errorContains != "" {
g.Expect(err.Error()).To(ContainSubstring(tt.errorContains))
}
for _, substr := range tt.errorContainsAll {
g.Expect(err.Error()).To(ContainSubstring(substr))
}
} else {
g.Expect(err).NotTo(HaveOccurred())
}
Expand All @@ -519,6 +574,7 @@ func TestDestroySharedVPCRoles(t *testing.T) {
setupVPCOwnerMock func(*awsapi.MockIAMAPI)
expectError bool
errorContains string
errorContainsAll []string
}{
{
name: "When PrivateZonesInClusterAccount is false ingress role should use vpcOwnerClient",
Expand Down Expand Up @@ -546,12 +602,14 @@ func TestDestroySharedVPCRoles(t *testing.T) {
},
},
{
name: "When destroying the ingress role fails it should return the error",
name: "When destroying the ingress role fails it should still attempt control-plane role and return the error",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add test cases for multiple simultaneous failures in the role deletion loop to verify all errors are properly aggregated

privateZonesInCluster: false,
setupIAMMock: func(_ *awsapi.MockIAMAPI) {},
setupVPCOwnerMock: func(m *awsapi.MockIAMAPI) {
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()).
Return(nil, errors.New("api error"))
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(cpRoleName)}, gomock.Any()).
Return(nil, noSuchEntity())
},
expectError: true,
errorContains: "cannot check for existing role",
Expand All @@ -571,6 +629,19 @@ func TestDestroySharedVPCRoles(t *testing.T) {
expectError: true,
errorContains: "cannot check for existing role",
},
{
name: "When both ingress and control-plane role deletions fail it should aggregate all errors",
privateZonesInCluster: false,
setupIAMMock: func(_ *awsapi.MockIAMAPI) {},
setupVPCOwnerMock: func(m *awsapi.MockIAMAPI) {
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()).
Return(nil, errors.New("ingress api error"))
m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(cpRoleName)}, gomock.Any()).
Return(nil, errors.New("cp api error"))
},
expectError: true,
errorContainsAll: []string{"ingress api error", "cp api error"},
},
}

for _, tt := range tests {
Expand All @@ -594,6 +665,9 @@ func TestDestroySharedVPCRoles(t *testing.T) {
if tt.errorContains != "" {
g.Expect(err.Error()).To(ContainSubstring(tt.errorContains))
}
for _, substr := range tt.errorContainsAll {
g.Expect(err.Error()).To(ContainSubstring(substr))
}
} else {
g.Expect(err).NotTo(HaveOccurred())
}
Expand Down