Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1879176: Alerting on failed image prune job #612

Merged
merged 1 commit into from Sep 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions manifests/09-prometheus-rules.yaml
Expand Up @@ -17,3 +17,12 @@ spec:
message: |
Image Registry Storage configuration has changed in the last 30
minutes. This change may have caused data loss.
- name: ImagePruner
rules:
- alert: ImagePrunerIsFailing
expr: image_registry_operator_image_pruner_job_status == 1
labels:
severity: warning
annotations:
message: |
Image Pruner job is failing, please check job output log.
5 changes: 5 additions & 0 deletions pkg/metrics/metrics.go
Expand Up @@ -12,11 +12,16 @@ var (
Name: "image_registry_operator_image_pruner_install_status",
Help: "Installation status code related to the automatic image pruning feature. 0 = not installed, 1 = suspended, 2 = enabled",
})
imagePrunerJobStatus = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "image_registry_operator_image_pruner_job_status",
Help: "This metric reports the image pruner job current status. 0 = working, 1 = failing",
})
)

// init registers this package's metric collectors with registry when the
// package is loaded: storageReconfigured, imagePrunerInstallStatus and
// imagePrunerJobStatus.
// NOTE(review): the Must prefix suggests registration failures panic rather
// than return an error — confirm against the type of registry.
func init() {
registry.MustRegister(
storageReconfigured,
imagePrunerInstallStatus,
imagePrunerJobStatus,
)
}
9 changes: 9 additions & 0 deletions pkg/metrics/server.go
Expand Up @@ -73,3 +73,12 @@ func ImagePrunerInstallStatus(installed bool, enabled bool) {
}
imagePrunerInstallStatus.Set(2)
}

// ImagePrunerJobStatus publishes the outcome of the image pruning job through
// the imagePrunerJobStatus gauge. The gauge is set to 1 when failed is true
// and to 0 otherwise.
func ImagePrunerJobStatus(failed bool) {
	// The zero value doubles as the "working" state; only a failure bumps it.
	var status float64
	if failed {
		status = 1
	}
	imagePrunerJobStatus.Set(status)
}
10 changes: 2 additions & 8 deletions pkg/operator/status.go
Expand Up @@ -149,11 +149,9 @@ func (c *ImagePrunerController) syncPrunerStatus(cr *imageregistryv1.ImagePruner
}

var foundFailed bool
var failedMessage string
for _, condition := range lastJobConditions {
if condition.Type == batchv1.JobFailed {
foundFailed = true
failedMessage = condition.Message
prunerLastJobStatus := operatorapiv1.OperatorCondition{
Status: operatorapiv1.ConditionTrue,
Message: condition.Message,
Expand Down Expand Up @@ -199,18 +197,14 @@ func (c *ImagePrunerController) syncPrunerStatus(cr *imageregistryv1.ImagePruner
Reason: "SyncError",
Message: fmt.Sprintf("Error: %v", applyError),
})
} else if foundFailed {
updatePrunerCondition(cr, "Degraded", operatorapiv1.OperatorCondition{
Status: operatorapiv1.ConditionTrue,
Reason: "JobFailed",
Message: failedMessage,
})
} else {
updatePrunerCondition(cr, "Degraded", operatorapiv1.OperatorCondition{
Status: operatorapiv1.ConditionFalse,
Reason: "AsExpected",
})
}

metrics.ImagePrunerJobStatus(foundFailed)
}

func (c *Controller) syncStatus(cr *imageregistryv1.Config, deploy *appsapi.Deployment, applyError error) {
Expand Down