-
Notifications
You must be signed in to change notification settings - Fork 475
OCPBUGS-83868: prevent IAM resource leaks during cluster destroy #8303
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ import ( | |
| "github.com/aws/aws-sdk-go-v2/service/iam" | ||
| iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" | ||
|
|
||
| utilerrors "k8s.io/apimachinery/pkg/util/errors" | ||
| "k8s.io/apimachinery/pkg/util/wait" | ||
|
|
||
| "github.com/go-logr/logr" | ||
|
|
@@ -95,29 +96,31 @@ func (o *DestroyIAMOptions) DestroyIAM(ctx context.Context) error { | |
| o.Retryer = awsConfig() | ||
| }) | ||
|
|
||
| err = o.DestroyOIDCResources(ctx, iamClient) | ||
| if err != nil { | ||
| return err | ||
| // Attempt all IAM cleanup steps even if some fail, to avoid leaking | ||
| // IAM resources when one step encounters an error. | ||
| var errs []error | ||
|
|
||
| if err = o.DestroyOIDCResources(ctx, iamClient); err != nil { | ||
| errs = append(errs, err) | ||
| } | ||
| err = o.DestroyWorkerInstanceProfile(ctx, iamClient) | ||
| if err != nil { | ||
| return err | ||
| if err = o.DestroyWorkerInstanceProfile(ctx, iamClient); err != nil { | ||
| errs = append(errs, err) | ||
| } | ||
|
|
||
| if o.VPCOwnerCredentialsOpts.AWSCredentialsFile != "" { | ||
| vpcOwnerAWSSession, err := o.VPCOwnerCredentialsOpts.GetSession(ctx, "cli-destroy-iam", nil, o.Region) | ||
| if err != nil { | ||
| return err | ||
| return utilerrors.NewAggregate(append(errs, err)) | ||
| } | ||
|
Comment on lines
110
to
114
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
At Line 113, returning immediately skips any shared-VPC cleanup attempt. In Suggested fix if o.VPCOwnerCredentialsOpts.AWSCredentialsFile != "" {
vpcOwnerAWSSession, err := o.VPCOwnerCredentialsOpts.GetSession(ctx, "cli-destroy-iam", nil, o.Region)
if err != nil {
- return utilerrors.NewAggregate(append(errs, err))
+ errs = append(errs, fmt.Errorf("failed to create VPC owner IAM session: %w", err))
+ // In private-zones mode, ingress role is in cluster account; still attempt cleanup.
+ if o.PrivateZonesInClusterAccount {
+ if _, ingressErr := o.DestroyOIDCRole(ctx, iamClient, "shared-vpc-ingress"); ingressErr != nil {
+ errs = append(errs, ingressErr)
+ }
+ }
+ return utilerrors.NewAggregate(errs)
}
vpcOwnerIAMClient := iam.NewFromConfig(*vpcOwnerAWSSession, func(o *iam.Options) {
o.Retryer = awsConfig()
})🤖 Prompt for AI Agents
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a valid observation, but I'm intentionally not addressing it in this PR. Here's why:
If we see this scenario causing real leaks in production, we can address it in a follow-up.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Would you like me to open a follow-up GitHub issue to track the
|
||
| vpcOwnerIAMClient := iam.NewFromConfig(*vpcOwnerAWSSession, func(o *iam.Options) { | ||
| o.Retryer = awsConfig() | ||
| }) | ||
| if err = o.DestroySharedVPCRoles(ctx, iamClient, vpcOwnerIAMClient); err != nil { | ||
| return err | ||
| errs = append(errs, err) | ||
| } | ||
| } | ||
|
|
||
| return nil | ||
| return utilerrors.NewAggregate(errs) | ||
| } | ||
|
|
||
| func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient awsapi.IAMAPI) error { | ||
|
|
@@ -126,6 +129,8 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient | |
| return err | ||
| } | ||
|
|
||
| var errs []error | ||
|
|
||
| for _, provider := range oidcProviderList.OpenIDConnectProviderList { | ||
| // OIDC Provider ARN is of the form arn:aws:iam::<account-id>:oidc-provider/hypershift-ci-2-oidc.s3.us-east-1.amazonaws.com/<infra-id> | ||
| arnTokens := strings.Split(*provider.Arn, "/") | ||
|
|
@@ -138,7 +143,7 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient | |
| var nse *iamtypes.NoSuchEntityException | ||
| if !errors.As(err, &nse) { | ||
| o.Log.Error(err, "Error deleting OIDC provider", "providerARN", provider.Arn) | ||
| return err | ||
| errs = append(errs, fmt.Errorf("failed to delete OIDC provider %s: %w", *provider.Arn, err)) | ||
| } | ||
| } else { | ||
| o.Log.Info("Deleted OIDC provider", "providerARN", provider.Arn) | ||
|
|
@@ -150,43 +155,44 @@ func (o *DestroyIAMOptions) DestroyOIDCResources(ctx context.Context, iamClient | |
| // Delete the shared role | ||
| removed := false | ||
| if removed, err = o.DestroyOIDCRole(ctx, iamClient, "shared-role"); err != nil { | ||
| return err | ||
| errs = append(errs, err) | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| if removed { | ||
| // The cluster was created with a single shared role, so we are done. | ||
| // Save on additional API calls and just return here. | ||
| return nil | ||
| } | ||
| // Delete individual component roles | ||
| if err = o.DestroyOIDCRoleWithRetry(ctx, iamClient, "openshift-ingress"); err != nil { | ||
| return err | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "openshift-image-registry"); err != nil { | ||
| return err | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "aws-ebs-csi-driver-controller"); err != nil { | ||
| return err | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "cloud-controller"); err != nil { | ||
| return err | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "node-pool"); err != nil { | ||
| return err | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "control-plane-operator"); err != nil { | ||
| return err | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "cloud-network-config-controller"); err != nil { | ||
| return err | ||
| return utilerrors.NewAggregate(errs) | ||
| } | ||
| // Delete individual component roles. Attempt all deletions even if some | ||
| // fail, to avoid leaking IAM roles. | ||
| // | ||
| // Roles that require retry (e.g. due to eventually-consistent policy | ||
| // detachment) are handled first via DestroyOIDCRoleWithRetry. | ||
| roleNamesWithRetry := []string{ | ||
| "openshift-ingress", | ||
| } | ||
| for _, name := range roleNamesWithRetry { | ||
| if err := o.DestroyOIDCRoleWithRetry(ctx, iamClient, name); err != nil { | ||
| errs = append(errs, err) | ||
| } | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "kms-provider"); err != nil { | ||
| return err | ||
|
|
||
| roleNames := []string{ | ||
| "openshift-image-registry", | ||
| "aws-ebs-csi-driver-controller", | ||
| "cloud-controller", | ||
| "node-pool", | ||
| "control-plane-operator", | ||
| "cloud-network-config-controller", | ||
| "kms-provider", | ||
| "karpenter", | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, iamClient, "karpenter"); err != nil { | ||
| return err | ||
| for _, name := range roleNames { | ||
| if _, err := o.DestroyOIDCRole(ctx, iamClient, name); err != nil { | ||
| errs = append(errs, err) | ||
| } | ||
| } | ||
|
|
||
| return nil | ||
| return utilerrors.NewAggregate(errs) | ||
| } | ||
|
|
||
| // DestroyOIDCRoleWithRetry retries the entire DestroyOIDCRole operation if it fails due to attached policies | ||
|
|
@@ -368,16 +374,16 @@ func (o *DestroyIAMOptions) DestroyWorkerInstanceProfile(ctx context.Context, cl | |
| } | ||
|
|
||
| func (o *DestroyIAMOptions) DestroySharedVPCRoles(ctx context.Context, iamClient, vpcOwnerIAMClient awsapi.IAMAPI) error { | ||
| var err error | ||
| ingressRoleClient := vpcOwnerIAMClient | ||
| if o.PrivateZonesInClusterAccount { | ||
| ingressRoleClient = iamClient | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, ingressRoleClient, "shared-vpc-ingress"); err != nil { | ||
| return err | ||
| var errs []error | ||
| if _, err := o.DestroyOIDCRole(ctx, ingressRoleClient, "shared-vpc-ingress"); err != nil { | ||
| errs = append(errs, err) | ||
| } | ||
| if _, err = o.DestroyOIDCRole(ctx, vpcOwnerIAMClient, "shared-vpc-control-plane"); err != nil { | ||
| return err | ||
| if _, err := o.DestroyOIDCRole(ctx, vpcOwnerIAMClient, "shared-vpc-control-plane"); err != nil { | ||
| errs = append(errs, err) | ||
| } | ||
| return nil | ||
| return utilerrors.NewAggregate(errs) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -393,10 +393,11 @@ func TestDestroyOIDCResources(t *testing.T) { | |
| const totalRoleChecks = 10 | ||
|
|
||
| tests := []struct { | ||
| name string | ||
| setupMock func(*awsapi.MockIAMAPI) | ||
| expectError bool | ||
| errorContains string | ||
| name string | ||
| setupMock func(*awsapi.MockIAMAPI) | ||
| expectError bool | ||
| errorContains string | ||
| errorContainsAll []string | ||
| }{ | ||
| { | ||
| name: "When OIDC provider matches infraID and shared role exists it should delete the provider and return early", | ||
|
|
@@ -466,7 +467,7 @@ func TestDestroyOIDCResources(t *testing.T) { | |
| errorContains: "api error", | ||
| }, | ||
| { | ||
| name: "When DeleteOpenIDConnectProvider fails with a non-NSE error it should return the error", | ||
| name: "When DeleteOpenIDConnectProvider fails with a non-NSE error it should still attempt role cleanup and return the error", | ||
| setupMock: func(m *awsapi.MockIAMAPI) { | ||
| gomock.InOrder( | ||
| m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()). | ||
|
|
@@ -477,11 +478,62 @@ func TestDestroyOIDCResources(t *testing.T) { | |
| }, nil), | ||
| m.EXPECT().DeleteOpenIDConnectProvider(gomock.Any(), gomock.Any(), gomock.Any()). | ||
| Return(nil, errors.New("permission denied")), | ||
| // continues to role cleanup; shared-role + 9 component roles all not found | ||
| m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()). | ||
| Return(nil, noSuchEntity()).Times(totalRoleChecks), | ||
| ) | ||
| }, | ||
| expectError: true, | ||
| errorContains: "permission denied", | ||
| }, | ||
| { | ||
| name: "When shared-role fails it should still attempt all component role deletions and aggregate errors", | ||
| setupMock: func(m *awsapi.MockIAMAPI) { | ||
| sharedRoleName := testInfraID + "-shared-role" | ||
| ingressRoleName := testInfraID + "-openshift-ingress" | ||
| gomock.InOrder( | ||
| m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()). | ||
| Return(&iam.ListOpenIDConnectProvidersOutput{}, nil), | ||
| // shared-role GetRole fails | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(sharedRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("shared-role api error")), | ||
| // continues to component roles; openshift-ingress also fails | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("ingress api error")), | ||
| // remaining 8 component roles not found | ||
| m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()). | ||
| Return(nil, noSuchEntity()).Times(8), | ||
| ) | ||
| }, | ||
| expectError: true, | ||
| errorContainsAll: []string{"shared-role api error", "ingress api error"}, | ||
| }, | ||
| { | ||
| name: "When multiple component role deletions fail it should aggregate all errors", | ||
| setupMock: func(m *awsapi.MockIAMAPI) { | ||
| sharedRoleName := testInfraID + "-shared-role" | ||
| ingressRoleName := testInfraID + "-openshift-ingress" | ||
| registryRoleName := testInfraID + "-openshift-image-registry" | ||
| gomock.InOrder( | ||
| m.EXPECT().ListOpenIDConnectProviders(gomock.Any(), gomock.Any(), gomock.Any()). | ||
| Return(&iam.ListOpenIDConnectProvidersOutput{}, nil), | ||
| // shared-role not found | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(sharedRoleName)}, gomock.Any()). | ||
| Return(nil, noSuchEntity()), | ||
| // openshift-ingress fails | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("ingress api error")), | ||
| // openshift-image-registry fails | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(registryRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("registry api error")), | ||
| // remaining 7 component roles not found | ||
| m.EXPECT().GetRole(gomock.Any(), gomock.Any(), gomock.Any()). | ||
| Return(nil, noSuchEntity()).Times(7), | ||
| ) | ||
| }, | ||
| expectError: true, | ||
| errorContainsAll: []string{"ingress api error", "registry api error"}, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
|
|
@@ -499,6 +551,9 @@ func TestDestroyOIDCResources(t *testing.T) { | |
| if tt.errorContains != "" { | ||
| g.Expect(err.Error()).To(ContainSubstring(tt.errorContains)) | ||
| } | ||
| for _, substr := range tt.errorContainsAll { | ||
| g.Expect(err.Error()).To(ContainSubstring(substr)) | ||
| } | ||
| } else { | ||
| g.Expect(err).NotTo(HaveOccurred()) | ||
| } | ||
|
|
@@ -519,6 +574,7 @@ func TestDestroySharedVPCRoles(t *testing.T) { | |
| setupVPCOwnerMock func(*awsapi.MockIAMAPI) | ||
| expectError bool | ||
| errorContains string | ||
| errorContainsAll []string | ||
| }{ | ||
| { | ||
| name: "When PrivateZonesInClusterAccount is false ingress role should use vpcOwnerClient", | ||
|
|
@@ -546,12 +602,14 @@ func TestDestroySharedVPCRoles(t *testing.T) { | |
| }, | ||
| }, | ||
| { | ||
| name: "When destroying the ingress role fails it should return the error", | ||
| name: "When destroying the ingress role fails it should still attempt control-plane role and return the error", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add test cases for multiple simultaneous failures in the role deletion loop to verify all errors are properly aggregated |
||
| privateZonesInCluster: false, | ||
| setupIAMMock: func(_ *awsapi.MockIAMAPI) {}, | ||
| setupVPCOwnerMock: func(m *awsapi.MockIAMAPI) { | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("api error")) | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(cpRoleName)}, gomock.Any()). | ||
| Return(nil, noSuchEntity()) | ||
| }, | ||
| expectError: true, | ||
| errorContains: "cannot check for existing role", | ||
|
|
@@ -571,6 +629,19 @@ func TestDestroySharedVPCRoles(t *testing.T) { | |
| expectError: true, | ||
| errorContains: "cannot check for existing role", | ||
| }, | ||
| { | ||
| name: "When both ingress and control-plane role deletions fail it should aggregate all errors", | ||
| privateZonesInCluster: false, | ||
| setupIAMMock: func(_ *awsapi.MockIAMAPI) {}, | ||
| setupVPCOwnerMock: func(m *awsapi.MockIAMAPI) { | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(ingressRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("ingress api error")) | ||
| m.EXPECT().GetRole(gomock.Any(), &iam.GetRoleInput{RoleName: aws.String(cpRoleName)}, gomock.Any()). | ||
| Return(nil, errors.New("cp api error")) | ||
| }, | ||
| expectError: true, | ||
| errorContainsAll: []string{"ingress api error", "cp api error"}, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
|
|
@@ -594,6 +665,9 @@ func TestDestroySharedVPCRoles(t *testing.T) { | |
| if tt.errorContains != "" { | ||
| g.Expect(err.Error()).To(ContainSubstring(tt.errorContains)) | ||
| } | ||
| for _, substr := range tt.errorContainsAll { | ||
| g.Expect(err.Error()).To(ContainSubstring(substr)) | ||
| } | ||
| } else { | ||
| g.Expect(err).NotTo(HaveOccurred()) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Inside of the DestroyWorkerInstanceProfile function, we're still returning early instead of use the error aggregation pattern. Make sense to move it same strategy or keep in return early approach?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm unsure. I would say here we have to return immediately.