create event intervals for alerts #26508
@@ -0,0 +1,184 @@
package monitor

import (
	"context"
	"fmt"
	"math"
	"strings"
	"time"

	"k8s.io/kube-openapi/pkg/util/sets"

	routeclient "github.com/openshift/client-go/route/clientset/versioned"
	"github.com/openshift/library-go/test/library/metrics"
	"github.com/openshift/origin/pkg/monitor/monitorapi"
	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
	prometheustypes "github.com/prometheus/common/model"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func CreateEventIntervalsForAlerts(ctx context.Context, restConfig *rest.Config, startTime time.Time) ([]monitorapi.EventInterval, error) {
	kubeClient, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		return nil, err
	}
	routeClient, err := routeclient.NewForConfig(restConfig)
	if err != nil {
		return nil, err
	}
	prometheusClient, err := metrics.NewPrometheusClient(ctx, kubeClient, routeClient)
	if err != nil {
		return nil, err
	}

	timeRange := prometheusv1.Range{
		Start: startTime,
		End:   time.Now(),
		Step:  1 * time.Second,
	}

Review comment (on Step): By default, alerting rules are evaluated every 30 seconds. That can be overridden by PrometheusRule resources, but AFAIK no operator does that. It should be OK to use a step of 10 seconds, which would reduce the amount of data returned by Prometheus.
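A minimal sketch of that suggestion (hypothetical; the PR as merged keeps the 1-second step): a 10-second step still samples faster than the default 30-second rule evaluation interval, so no alert-state transitions are missed, while Prometheus returns roughly a tenth of the data:

```go
// hypothetical variant of the Range above, per the review suggestion
timeRange := prometheusv1.Range{
	Start: startTime,
	End:   time.Now(),
	Step:  10 * time.Second, // still finer than the 30s default rule evaluation interval
}
```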
	alerts, warningsForQuery, err := prometheusClient.QueryRange(ctx, `ALERTS{alertstate="firing"}`, timeRange)
	if err != nil {
		return nil, err
	}
	if len(warningsForQuery) > 0 {
		fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
	}

	firingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
	if err != nil {
		return nil, err
	}

	alerts, warningsForQuery, err = prometheusClient.QueryRange(ctx, `ALERTS{alertstate="pending"}`, timeRange)
	if err != nil {
		return nil, err
	}
	if len(warningsForQuery) > 0 {
		fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
	}
	pendingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
	if err != nil {
		return nil, err
	}
Review comment: What does AlertPending mean?

Reply: https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html - pending is "the metric value matches, but it hasn't lasted long enough yet".
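As background (mine, not from the thread): both queries return a prometheustypes.Matrix, one series per active alert, labeled with the rule's labels plus alertname and alertstate. An illustrative sketch of inspecting that result, reusing the names from the code above:

```go
// illustrative only: walk the matrix returned by QueryRange
if matrix, ok := alerts.(prometheustypes.Matrix); ok {
	for _, series := range matrix {
		fmt.Printf("alert %s is %s with %d samples\n",
			series.Metric[prometheustypes.AlertNameLabel],
			series.Metric["alertstate"],
			len(series.Values))
	}
}
```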

	ret := []monitorapi.EventInterval{}
	ret = append(ret, firingAlerts...)
	ret = append(ret, pendingAlerts...)

	return ret, nil
}

// Be careful placing things in this list. In many cases, knowing a condition has gone bad is noteworthy when looking
// for related errors in CI runs.
var pendingAlertsToIgnoreForIntervals = sets.NewString(
	//"KubeContainerWaiting",
	//"AlertmanagerReceiversNotConfigured",
	//"KubeJobCompletion",
	//"KubeDeploymentReplicasMismatch",
)

func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.Value, startTime time.Time) ([]monitorapi.EventInterval, error) {
	ret := []monitorapi.EventInterval{}

	switch {
	case alerts.Type() == prometheustypes.ValMatrix:
		matrixAlert := alerts.(prometheustypes.Matrix)
		for _, alert := range matrixAlert {
			alertName := alert.Metric[prometheustypes.AlertNameLabel]
			// don't skip Watchdog because gaps in watchdog are noteworthy, unexpected, and they do happen.
			//if alertName == "Watchdog" {
			//	continue
			//}
			// many pending alerts we just don't care about
			if alert.Metric["alertstate"] == "pending" {
				if pendingAlertsToIgnoreForIntervals.Has(string(alertName)) {
					continue
				}
			}

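			// the resulting locator is space-delimited, e.g. (hypothetical values):
			//   alert/KubePodNotReady node/ip-10-0-1-1 ns/openshift-monitoring pod/prometheus-k8s-0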
			locator := "alert/" + alertName
			if node := alert.Metric["instance"]; len(node) > 0 {
				locator += " node/" + node
			}
			if namespace := alert.Metric["namespace"]; len(namespace) > 0 {
				locator += " ns/" + namespace
			}
			if pod := alert.Metric["pod"]; len(pod) > 0 {
				locator += " pod/" + pod
			}
			if container := alert.Metric["container"]; len(container) > 0 {
				locator += " container/" + container
			}

			alertIntervalTemplate := monitorapi.EventInterval{
				Condition: monitorapi.Condition{
					Locator: string(locator),
					Message: alert.Metric.String(),
				},
			}
			switch {
			// as I understand it, pending alerts are cases where the conditions except for "how long has this been
			// happening" are all met. Pending alerts include what level the eventual alert will be, but they are not
			// errors in and of themselves. They are useful to show in time to find patterns of "X fails concurrent with Y"
			case alert.Metric["alertstate"] == "pending":
				alertIntervalTemplate.Level = monitorapi.Info

case alert.Metric["severity"] == "warning": | ||
alertIntervalTemplate.Level = monitorapi.Warning | ||
case alert.Metric["severity"] == "critical": | ||
alertIntervalTemplate.Level = monitorapi.Error | ||
case alert.Metric["severity"] == "info": | ||
alertIntervalTemplate.Level = monitorapi.Info | ||
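			// note added per the review thread at the bottom of this page (wording
			// is mine, not the PR author's): unrecognized severities intentionally
			// surface as Error so that misconfigured alerts stand out.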
			default:
				alertIntervalTemplate.Level = monitorapi.Error
			}

			var alertStartTime *time.Time
			var lastTime *time.Time
			for _, currValue := range alert.Values {
				currTime := currValue.Timestamp.Time()
				if alertStartTime == nil {
					alertStartTime = &currTime
				}
				if lastTime == nil {
					lastTime = &currTime
				}
				// if it has been less than five seconds since we saw this, consider it the same interval and check
				// the next time.
				if math.Abs(currTime.Sub(*lastTime).Seconds()) < (5 * time.Second).Seconds() {
					lastTime = &currTime
					continue
				}

				// if it has been more than five seconds, consider this the start of a new occurrence and add the interval
				currAlertInterval := alertIntervalTemplate // shallow copy
				currAlertInterval.From = *alertStartTime
				currAlertInterval.To = *lastTime
				ret = append(ret, currAlertInterval)

				// now reset the tracking
				alertStartTime = &currTime
				lastTime = nil
			}

			// close out the final interval; guard against an empty series, and if a
			// gap reset on the last sample left lastTime nil, treat the trailing
			// sample as a zero-length interval rather than dereferencing nil
			if alertStartTime != nil {
				if lastTime == nil {
					lastTime = alertStartTime
				}
				currAlertInterval := alertIntervalTemplate // shallow copy
				currAlertInterval.From = *alertStartTime
				currAlertInterval.To = *lastTime
				ret = append(ret, currAlertInterval)
			}
		}

Review comment (on the values loop above): This probably can be offloaded to Prometheus, IIUC.

Reply: If someone wants to refine it later, I won't stop them. This PR needs to merge in the current state though to get some data.

	default:
		ret = append(ret, monitorapi.EventInterval{
			Condition: monitorapi.Condition{
				Level:   monitorapi.Error,
				Locator: "alert/all",
				Message: fmt.Sprintf("unhandled type: %v", alerts.Type()),
			},
			From: startTime,
			To:   time.Now(),
		})
	}

	return ret, nil
}
@@ -367,6 +367,15 @@ func (opt *Options) Run(suite *TestSuite) error {
			sort.Sort(events)
		}
	}

	// add events from alerts so we can create the intervals
	alertEventIntervals, err := monitor.CreateEventIntervalsForAlerts(ctx, restConfig, start)
	if err != nil {
		fmt.Printf("\n\n\n#### alertErr=%v\n", err)
	}
	events = append(events, alertEventIntervals...)
	sort.Sort(events)

Review comment: does this new …

	events.Clamp(start, end)

	if len(opt.JUnitDir) > 0 {
Review comment (on the default severity case in createEventIntervalsForAlerts): Intentional to default to critical? AlertUnknown perhaps? Should have a comment either way.

Reply: Showing it as Critical here would have us find misconfigured alerts.