
create event intervals for alerts #26508

Merged 4 commits on Oct 8, 2021
75 changes: 64 additions & 11 deletions e2echart/e2e-chart-template.html
@@ -125,16 +125,23 @@ <h5 class="modal-title">Resource</h5>
return false
}

function isAlert(eventInterval) {
if (eventInterval.locator.startsWith("alert/")) {
return true
}
return false
}

const rePhase = new RegExp("(^| )phase/([^ ]+)")
function nodeStateValue(item) {
let roles = ""
let i = item.message.indexOf("roles/")
if (i != -1) {
roles = item.message.substring(i+"roles/".length)
let j = roles.indexOf(" ")
if (j != -1) {
roles = roles.substring(0, j)
}
}

if (item.message.includes("node is not ready")) {
@@ -147,6 +154,30 @@ <h5 class="modal-title">Resource</h5>
return [item.locator, ` (${roles},updates)`, "Update"];
}

function alertSeverity(item) {
// the other types can be pending, so check pending first
let pendingIndex = item.message.indexOf("pending")
if (pendingIndex != -1) {
return [item.locator, "", "AlertPending"]
}

let infoIndex = item.message.indexOf("info")
if (infoIndex != -1) {
return [item.locator, "", "AlertInfo"]
}
let warningIndex = item.message.indexOf("warning")
if (warningIndex != -1) {
return [item.locator, "", "AlertWarning"]
}
let criticalIndex = item.message.indexOf("critical")
if (criticalIndex != -1) {
return [item.locator, "", "AlertCritical"]
}

// color as critical if nothing matches so that we notice that something has gone wrong
return [item.locator, "", "AlertCritical"]
Contributor: Intentional to default to critical? AlertUnknown perhaps? Should have a comment either way.

Member: Showing it as Critical here would have us find misconfigured alerts.
}

function createTimelineData(timelineVal, timelineData, rawEventIntervals, preconditionFunc) {
const data = {}
var now = new Date();
@@ -211,13 +242,25 @@ <h5 class="modal-title">Resource</h5>
timelineGroups.push({group: "operator-progressing", data: []})
createTimelineData("OperatorProgressing", timelineGroups[timelineGroups.length - 1].data, eventIntervals, isOperatorProgressing)

timelineGroups.push({group: "alerts", data: []})
createTimelineData(alertSeverity, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isAlert)
// leaving this for posterity so future me (or someone else) can try it, but I think ordering by name hides the
// patterns shown by timing, and timing appears more relevant to my eyes.
// sort alerts alphabetically for display purposes, but keep the json itself ordered by time.
// timelineGroups[timelineGroups.length - 1].data.sort(function (e1 ,e2){
// if (e1.label.includes("alert") && e2.label.includes("alert")) {
// return e1.label < e2.label ? -1 : e1.label > e2.label;
// }
// return 0
// })

timelineGroups.push({group: "node-state", data: []})
createTimelineData(nodeStateValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isNodeState)
timelineGroups[timelineGroups.length - 1].data.sort(function (e1 ,e2){
if (e1.label.includes("master") && e2.label.includes("worker")) {
return -1
}
return 0
})

timelineGroups.push({group: "apiserver-availability", data: []})
@@ -260,8 +303,18 @@ <h5 class="modal-title">Resource</h5>
const el = document.querySelector('#chart');
const myChart = TimelinesChart();
var ordinalScale = d3.scaleOrdinal()
.domain([
'AlertInfo', 'AlertPending', 'AlertWarning', 'AlertCritical', // alerts
'OperatorUnavailable', 'OperatorDegraded', 'OperatorProgressing', // operators
'Update', 'Drain', 'Reboot', 'OperatingSystemUpdate', 'NodeNotReady', // nodes
'Passed', 'Skipped', 'Flaked', 'Failed', // tests
'Degraded', 'Upgradeable', 'False', 'Unknown'])
.range([
'#fada5e','#fada5e','#ffa500','#d0312d', // alerts
'#d0312d', '#ffa500', '#fada5e', // operators
'#1e7bd9', '#4294e6', '#6aaef2', '#96cbff', '#fada5e', // nodes
'#3cb043', '#ceba76', '#ffa500', '#d0312d', // tests
'#b65049', '#32b8b6', '#ffffff', '#bbbbbb']);
myChart.data(timelineGroups).zQualitative(true).enableAnimations(false).leftMargin(240).rightMargin(550).maxLineHeight(20).maxHeight(10000).zColorScale(ordinalScale).onSegmentClick(segmentFunc)
(el);

184 changes: 184 additions & 0 deletions pkg/monitor/alerts.go
@@ -0,0 +1,184 @@
package monitor

import (
"context"
"fmt"
"math"
"strings"
"time"

"k8s.io/kube-openapi/pkg/util/sets"

routeclient "github.com/openshift/client-go/route/clientset/versioned"
"github.com/openshift/library-go/test/library/metrics"
"github.com/openshift/origin/pkg/monitor/monitorapi"
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
prometheustypes "github.com/prometheus/common/model"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)

func CreateEventIntervalsForAlerts(ctx context.Context, restConfig *rest.Config, startTime time.Time) ([]monitorapi.EventInterval, error) {
kubeClient, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return nil, err
}
routeClient, err := routeclient.NewForConfig(restConfig)
if err != nil {
return nil, err
}
prometheusClient, err := metrics.NewPrometheusClient(ctx, kubeClient, routeClient)
if err != nil {
return nil, err
}

timeRange := prometheusv1.Range{
Start: startTime,
End: time.Now(),
Step: 1 * time.Second,
Contributor: by default alerting rules are evaluated every 30 seconds. It can be overridden by the PrometheusRules resources but AFAIK no operator does that. It should be ok to use a step of 10 seconds, which would reduce the amount of data returned by Prometheus.

}
alerts, warningsForQuery, err := prometheusClient.QueryRange(ctx, `ALERTS{alertstate="firing"}`, timeRange)
if err != nil {
return nil, err
}
if len(warningsForQuery) > 0 {
fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
}

firingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
if err != nil {
return nil, err
}

alerts, warningsForQuery, err = prometheusClient.QueryRange(ctx, `ALERTS{alertstate="pending"}`, timeRange)
if err != nil {
return nil, err
}
if len(warningsForQuery) > 0 {
fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
}
pendingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
if err != nil {
return nil, err
}
Contributor: What does AlertPending mean?

Member: https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html - pending is "metric value matches, but it didn't last long enough yet"
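Put differently, a rule with a "for:" duration reports alertstate="pending" while its expression is true but the duration has not yet elapsed, and a separate alertstate="firing" series once it has. A rough sketch of reading that label off a queried matrix, reusing this file's prometheustypes and time imports; the pendingToFiring helper is illustrative only and not part of this PR:

// sketch: report when a given alert first appeared as pending and as firing;
// pending and firing show up as separate ALERTS series distinguished by the
// alertstate label
func pendingToFiring(series prometheustypes.Matrix, alertName string) (pending, firing *time.Time) {
for _, s := range series {
if string(s.Metric[prometheustypes.AlertNameLabel]) != alertName || len(s.Values) == 0 {
continue
}
first := s.Values[0].Timestamp.Time()
switch s.Metric["alertstate"] {
case "pending":
pending = &first
case "firing":
firing = &first
}
}
return pending, firing
}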


ret := []monitorapi.EventInterval{}
ret = append(ret, firingAlerts...)
ret = append(ret, pendingAlerts...)

return ret, nil
}

// Be careful placing things in this list. In many cases, knowing a condition has gone bad is noteworthy when looking
// for related errors in CI runs.
var pendingAlertsToIgnoreForIntervals = sets.NewString(
//"KubeContainerWaiting",
//"AlertmanagerReceiversNotConfigured",
//"KubeJobCompletion",
//"KubeDeploymentReplicasMismatch",
)

func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.Value, startTime time.Time) ([]monitorapi.EventInterval, error) {
ret := []monitorapi.EventInterval{}

switch {
case alerts.Type() == prometheustypes.ValMatrix:
matrixAlert := alerts.(prometheustypes.Matrix)
for _, alert := range matrixAlert {
alertName := alert.Metric[prometheustypes.AlertNameLabel]
// don't skip Watchdog because gaps in watchdog are noteworthy, unexpected, and they do happen.
//if alertName == "Watchdog" {
// continue
//}
// many pending alerts we just don't care about
if alert.Metric["alertstate"] == "pending" {
if pendingAlertsToIgnoreForIntervals.Has(string(alertName)) {
continue
}
}

locator := "alert/" + alertName
if node := alert.Metric["instance"]; len(node) > 0 {
locator += " node/" + node
}
if namespace := alert.Metric["namespace"]; len(namespace) > 0 {
locator += " ns/" + namespace
}
if pod := alert.Metric["pod"]; len(pod) > 0 {
locator += " pod/" + pod
}
if container := alert.Metric["container"]; len(container) > 0 {
locator += " container/" + container
}

alertIntervalTemplate := monitorapi.EventInterval{
Condition: monitorapi.Condition{
Locator: string(locator),
Message: alert.Metric.String(),
},
}
switch {
// as I understand it, pending alerts are cases where all the conditions except "how long has this been happening"
// are met. Pending alerts include the level the eventual alert will have, but they are not errors in and
// of themselves. They are useful to show in time to find patterns of "X fails concurrent with Y"
case alert.Metric["alertstate"] == "pending":
alertIntervalTemplate.Level = monitorapi.Info

case alert.Metric["severity"] == "warning":
alertIntervalTemplate.Level = monitorapi.Warning
case alert.Metric["severity"] == "critical":
alertIntervalTemplate.Level = monitorapi.Error
case alert.Metric["severity"] == "info":
alertIntervalTemplate.Level = monitorapi.Info
default:
alertIntervalTemplate.Level = monitorapi.Error
}

var alertStartTime *time.Time
var lastTime *time.Time
for _, currValue := range alert.Values {
Member: This probably can be offloaded to Prometheus - iiuc count_over_time(ALERTS[<test_duration>:1s]) would return a number of seconds the alert has been found

Contributor Author (@deads2k, Oct 8, 2021), replying to the above: if someone wants to refine later, I won't stop them. This PR needs to merge in the current state though to get some data.
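For reference, a rough sketch of what that count_over_time idea might look like against the same prometheusv1.API client; the query shape and the alertSecondsSeen helper are assumptions for illustration, not code from this PR, and it reuses this file's existing imports:

// sketch: ask Prometheus how many 1s-resolution samples (~seconds) each firing
// alert series was present during the run, instead of walking every sample
// client-side
func alertSecondsSeen(ctx context.Context, client prometheusv1.API, runDuration time.Duration) (prometheustypes.Value, error) {
query := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing"}[%ds:1s])`, int(runDuration.Seconds()))
result, warnings, err := client.Query(ctx, query, time.Now())
if len(warnings) > 0 {
fmt.Printf("#### warnings \n\t%v\n", strings.Join(warnings, "\n\t"))
}
return result, err
}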

currTime := currValue.Timestamp.Time()
if alertStartTime == nil {
alertStartTime = &currTime
}
if lastTime == nil {
lastTime = &currTime
}
// if it has been less than five seconds since we saw this, consider it the same interval and check
// the next time.
if math.Abs(currTime.Sub(*lastTime).Seconds()) < (5 * time.Second).Seconds() {
lastTime = &currTime
continue
}

// if it has been more than five seconds, consider this the start of a new occurrence and add the interval
currAlertInterval := alertIntervalTemplate // shallow copy
currAlertInterval.From = *alertStartTime
currAlertInterval.To = *lastTime
ret = append(ret, currAlertInterval)

// now reset the tracking
alertStartTime = &currTime
lastTime = nil
}

currAlertInterval := alertIntervalTemplate // shallow copy
currAlertInterval.From = *alertStartTime
currAlertInterval.To = *lastTime
ret = append(ret, currAlertInterval)
}

default:
ret = append(ret, monitorapi.EventInterval{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: "alert/all",
Message: fmt.Sprintf("unhandled type: %v", alerts.Type()),
},
From: startTime,
To: time.Now(),
})
}

return ret, nil
}
9 changes: 9 additions & 0 deletions pkg/test/ginkgo/cmd_runsuite.go
@@ -367,6 +367,15 @@ func (opt *Options) Run(suite *TestSuite) error {
sort.Sort(events)
}
}

// add events from alerts so we can create the intervals
alertEventIntervals, err := monitor.CreateEventIntervalsForAlerts(ctx, restConfig, start)
if err != nil {
fmt.Printf("\n\n\n#### alertErr=%v\n", err)
}
events = append(events, alertEventIntervals...)
sort.Sort(events)
Member: does this new sort.Sort call allow us to drop some from earlier in the function (e.g. the one from above after loading from AdditionalEvents_*)?


events.Clamp(start, end)

if len(opt.JUnitDir) > 0 {