Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-4.9] Bug 2015977: Stop putting CCO in degraded state when stale credentials are found #404

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,11 @@ spec:
for: 5m
labels:
severity: warning
- alert: CloudCredentialOperatorStaleCredentials
annotations:
message: 1 or more credentials requests are stale and should be deleted. Check the status.conditions on CredentialsRequest CRs to identify the stale one(s).
expr: cco_credentials_requests_conditions{condition="StaleCredentials"}
> 0
for: 5m
labels:
severity: warning
25 changes: 4 additions & 21 deletions pkg/operator/credentialsrequest/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,11 @@ func computeStatusConditions(
conditions = append(conditions, *upgradeableCondition)
}

if operatorIsDisabled {
return conditions
}

failingCredRequests := 0
staleCredRequests := 0

validCredRequests := []minterv1.CredentialsRequest{}
// Filter out credRequests that are for different clouds
Expand Down Expand Up @@ -136,26 +139,6 @@ func computeStatusConditions(
if foundFailure {
failingCredRequests = failingCredRequests + 1
}

// Check for stale credential request condition
staleCond := utils.FindCredentialsRequestCondition(cr.Status.Conditions, minterv1.StaleCredentials)
if staleCond != nil && staleCond.Status == corev1.ConditionTrue {
staleCredRequests = staleCredRequests + 1
}
}

if operatorIsDisabled {
if staleCredRequests > 0 {
var degradedCondition configv1.ClusterOperatorStatusCondition
degradedCondition.Type = configv1.OperatorDegraded
degradedCondition.Status = configv1.ConditionTrue
degradedCondition.Reason = reasonStaleCredentials
degradedCondition.Message = fmt.Sprintf(
"%d of %d credentials requests are stale and should be deleted.",
staleCredRequests, len(validCredRequests))
conditions = append(conditions, degradedCondition)
}
return conditions
}

if failingCredRequests > 0 {
Expand Down
15 changes: 0 additions & 15 deletions pkg/operator/credentialsrequest/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,21 +156,6 @@ func TestClusterOperatorStatus(t *testing.T) {
testCondition(configv1.OperatorDegraded, configv1.ConditionTrue, reasonCredentialsFailing),
},
},
{
name: "stale credentials request that is no longer required",
credRequests: []minterv1.CredentialsRequest{
testCredentialsRequestWithStatus("cred1", true, []minterv1.CredentialsRequestCondition{}, nil),
testCredentialsRequestWithStatus("cred2", true, []minterv1.CredentialsRequestCondition{
testCRCondition(minterv1.StaleCredentials, corev1.ConditionTrue),
}, nil),
testCredentialsRequestWithStatus("cred3", true, []minterv1.CredentialsRequestCondition{}, nil),
},
cloudPlatform: configv1.AWSPlatformType,
operatorDisabled: true,
expectedConditions: []configv1.ClusterOperatorStatusCondition{
testCondition(configv1.OperatorDegraded, configv1.ConditionTrue, reasonStaleCredentials),
},
},
{
name: "ignore nonAWS credreqs",
credRequests: []minterv1.CredentialsRequest{
Expand Down
14 changes: 12 additions & 2 deletions pkg/operator/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ func newAccumulator(client client.Client, logger log.FieldLogger) *credRequestAc
for _, c := range credreqv1.FailureConditionTypes {
acc.crConditions[c] = 0
}
acc.crConditions[credreqv1.StaleCredentials] = 0

return acc
}
Expand All @@ -252,10 +253,19 @@ func (a *credRequestAccumulator) processCR(cr *credreqv1.CredentialsRequest, cco
a.podIdentityCredentials++
}

// Skip reporting conditions if CCO is disabled, as we shouldn't be alerting in that case.
// Skip reporting conditions if CCO is disabled, as we shouldn't be alerting in that case, except for stale credentials.
// condition. The stale credentials are removed by cleanup controller. But when CCO is disabled the only way to inform
// users to remove these credentials is through alerts.
if !ccoDisabled {
for _, cond := range cr.Status.Conditions {
if cond.Status == corev1.ConditionTrue {
// do not report stale credentials when CCO is enabled as it will be removed by cleanup controller.
if cond.Status == corev1.ConditionTrue && cond.Type != credreqv1.StaleCredentials {
a.crConditions[cond.Type]++
}
}
} else {
for _, cond := range cr.Status.Conditions {
if cond.Status == corev1.ConditionTrue && cond.Type == credreqv1.StaleCredentials {
a.crConditions[cond.Type]++
}
}
Expand Down
16 changes: 14 additions & 2 deletions pkg/operator/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ var (
Type: credreqv1.InsufficientCloudCredentials,
Status: corev1.ConditionTrue,
}

staleCredsCond = credreqv1.CredentialsRequestCondition{
Type: credreqv1.StaleCredentials,
Status: corev1.ConditionTrue,
}
)

func TestSecretGetter(t *testing.T) {
Expand Down Expand Up @@ -165,6 +170,8 @@ func TestCredentialsRequests(t *testing.T) {
}(),
// insufficient cloud creds condition
testCredReqWithConditions(testAWSCredRequest("ainsufficientcreds"), []credreqv1.CredentialsRequestCondition{insufficientCredsCond}),
// stale creds condition
testCredReqWithConditions(testAWSCredRequest("astalecreds"), []credreqv1.CredentialsRequestCondition{staleCredsCond}),

// regular GCP credreq
testGCPCredRequest("gregular"),
Expand All @@ -174,14 +181,15 @@ func TestCredentialsRequests(t *testing.T) {
},
validate: func(t *testing.T, accumulator *credRequestAccumulator) {
// total cred requests
assert.Equal(t, 5, accumulator.crTotals["aws"])
assert.Equal(t, 6, accumulator.crTotals["aws"])
assert.Equal(t, 2, accumulator.crTotals["gcp"])

// conditions
assert.Equal(t, 1, accumulator.crConditions[credreqv1.MissingTargetNamespace])
assert.Equal(t, 1, accumulator.crConditions[credreqv1.CredentialsProvisionFailure])
assert.Equal(t, 1, accumulator.crConditions[credreqv1.Ignored])
assert.Equal(t, 1, accumulator.crConditions[credreqv1.InsufficientCloudCredentials])
assert.Equal(t, 0, accumulator.crConditions[credreqv1.StaleCredentials])
},
},
{
Expand All @@ -194,19 +202,23 @@ func TestCredentialsRequests(t *testing.T) {
testCredReqWithConditions(testAWSCredRequest("aprovisionfailed"), []credreqv1.CredentialsRequestCondition{provisionFailedCond}),
// insufficient cloud creds condition
testCredReqWithConditions(testAWSCredRequest("ainsufficientcreds"), []credreqv1.CredentialsRequestCondition{insufficientCredsCond}),
// stale creds condition
testCredReqWithConditions(testAWSCredRequest("astalecreds"), []credreqv1.CredentialsRequestCondition{staleCredsCond}),

// GCP credreq with condition set
testCredReqWithConditions(testGCPCredRequest("gignored"), []credreqv1.CredentialsRequestCondition{ignoredCond}),
},
validate: func(t *testing.T, accumulator *credRequestAccumulator) {
// total cred requests
assert.Equal(t, 3, accumulator.crTotals["aws"])
assert.Equal(t, 4, accumulator.crTotals["aws"])
assert.Equal(t, 1, accumulator.crTotals["gcp"])

// failure conditions should all be zero as CCO is disabled
for _, cond := range credreqv1.FailureConditionTypes {
assert.Equal(t, 0, accumulator.crConditions[cond])
}
// stale conditions should be reported when cco is disabled
assert.Equal(t, 1, accumulator.crConditions[credreqv1.StaleCredentials])
},
},
{
Expand Down