
create event intervals for alerts #26508

Merged 4 commits on Oct 8, 2021
75 changes: 64 additions & 11 deletions e2echart/e2e-chart-template.html
@@ -125,16 +125,23 @@ <h5 class="modal-title">Resource</h5>
return false
}

function isAlert(eventInterval) {
if (eventInterval.locator.startsWith("alert/")) {
return true
}
return false
}

const rePhase = new RegExp("(^| )phase/([^ ]+)")
function nodeStateValue(item) {
let roles = ""
let i = item.message.indexOf("roles/")
if (i != -1) {
roles = item.message.substring(i+"roles/".length)
let j = roles.indexOf(" ")
if (j != -1) {
roles = roles.substring(0, j)
}
}

if (item.message.includes("node is not ready")) {
@@ -147,6 +154,30 @@ <h5 class="modal-title">Resource</h5>
return [item.locator, ` (${roles},updates)`, "Update"];
}

function alertSeverity(item) {
// the other types can be pending, so check pending first
let pendingIndex = item.message.indexOf("pending")
if (pendingIndex != -1) {
return [item.locator, "", "AlertPending"]
}

let infoIndex = item.message.indexOf("info")
if (infoIndex != -1) {
return [item.locator, "", "AlertInfo"]
}
let warningIndex = item.message.indexOf("warning")
if (warningIndex != -1) {
return [item.locator, "", "AlertWarning"]
}
let criticalIndex = item.message.indexOf("critical")
if (criticalIndex != -1) {
return [item.locator, "", "AlertCritical"]
}

// color as critical if nothing matches so that we notice that something has gone wrong
return [item.locator, "", "AlertCritical"]
Contributor: Intentional to default to critical? AlertUnknown perhaps? Should have a comment either way.

Member: Showing it as Critical here would have us find misconfigured alerts.
}

function createTimelineData(timelineVal, timelineData, rawEventIntervals, preconditionFunc) {
const data = {}
var now = new Date();
@@ -211,13 +242,25 @@ <h5 class="modal-title">Resource</h5>
timelineGroups.push({group: "operator-progressing", data: []})
createTimelineData("OperatorProgressing", timelineGroups[timelineGroups.length - 1].data, eventIntervals, isOperatorProgressing)

timelineGroups.push({group: "alerts", data: []})
createTimelineData(alertSeverity, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isAlert)
// leaving this for posterity so future me (or someone else) can try it, but I think ordering by name hides the
// patterns shown by timing, and timing appears more relevant to my eyes.
// sort alerts alphabetically for display purposes, but keep the json itself ordered by time.
// timelineGroups[timelineGroups.length - 1].data.sort(function (e1 ,e2){
// if (e1.label.includes("alert") && e2.label.includes("alert")) {
// return e1.label < e2.label ? -1 : e1.label > e2.label;
// }
// return 0
// })

timelineGroups.push({group: "node-state", data: []})
createTimelineData(nodeStateValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isNodeState)
timelineGroups[timelineGroups.length - 1].data.sort(function (e1 ,e2){
if (e1.label.includes("master") && e2.label.includes("worker")) {
return -1
}
return 0
})

timelineGroups.push({group: "apiserver-availability", data: []})
@@ -260,8 +303,18 @@ <h5 class="modal-title">Resource</h5>
const el = document.querySelector('#chart');
const myChart = TimelinesChart();
var ordinalScale = d3.scaleOrdinal()
.domain([
'AlertInfo', 'AlertPending', 'AlertWarning', 'AlertCritical', // alerts
'OperatorUnavailable', 'OperatorDegraded', 'OperatorProgressing', // operators
'Update', 'Drain', 'Reboot', 'OperatingSystemUpdate', 'NodeNotReady', // nodes
'Passed', 'Skipped', 'Flaked', 'Failed', // tests
'Degraded', 'Upgradeable', 'False', 'Unknown'])
.range([
'#fada5e','#fada5e','#ffa500','#d0312d', // alerts
'#d0312d', '#ffa500', '#fada5e', // operators
'#1e7bd9', '#4294e6', '#6aaef2', '#96cbff', '#fada5e', // nodes
'#3cb043', '#ceba76', '#ffa500', '#d0312d', // tests
'#b65049', '#32b8b6', '#ffffff', '#bbbbbb']);
myChart.data(timelineGroups).zQualitative(true).enableAnimations(false).leftMargin(240).rightMargin(550).maxLineHeight(20).maxHeight(10000).zColorScale(ordinalScale).onSegmentClick(segmentFunc)
(el);

184 changes: 184 additions & 0 deletions pkg/monitor/alerts.go
@@ -0,0 +1,184 @@
package monitor

import (
"context"
"fmt"
"math"
"strings"
"time"

"k8s.io/kube-openapi/pkg/util/sets"

routeclient "github.com/openshift/client-go/route/clientset/versioned"
"github.com/openshift/library-go/test/library/metrics"
"github.com/openshift/origin/pkg/monitor/monitorapi"
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
prometheustypes "github.com/prometheus/common/model"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)

func CreateEventIntervalsForAlerts(ctx context.Context, restConfig *rest.Config, startTime time.Time) ([]monitorapi.EventInterval, error) {
kubeClient, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return nil, err
}
routeClient, err := routeclient.NewForConfig(restConfig)
if err != nil {
return nil, err
}
prometheusClient, err := metrics.NewPrometheusClient(ctx, kubeClient, routeClient)
if err != nil {
return nil, err
}

timeRange := prometheusv1.Range{
Start: startTime,
End: time.Now(),
Step: 1 * time.Second,
Contributor: by default alerting rules are evaluated every 30 seconds. It can be overridden by the PrometheusRules resources but AFAIK no operator does that. It should be ok to use a step of 10 seconds, which would reduce the amount of data returned by Prometheus.

}
alerts, warningsForQuery, err := prometheusClient.QueryRange(ctx, `ALERTS{alertstate="firing"}`, timeRange)
if err != nil {
return nil, err
}
if len(warningsForQuery) > 0 {
fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
}

firingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
if err != nil {
return nil, err
}

alerts, warningsForQuery, err = prometheusClient.QueryRange(ctx, `ALERTS{alertstate="pending"}`, timeRange)
if err != nil {
return nil, err
}
if len(warningsForQuery) > 0 {
fmt.Printf("#### warnings \n\t%v\n", strings.Join(warningsForQuery, "\n\t"))
}
pendingAlerts, err := createEventIntervalsForAlerts(ctx, alerts, startTime)
if err != nil {
return nil, err
}
Contributor: What does AlertPending mean?

Member: https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html - pending is "metric value matches, but it didn't last long enough yet"
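Put differently, a rule with a "for:" duration reports alertstate="pending" while its expression is true but the duration has not yet elapsed, and a separate alertstate="firing" series once it has. A rough sketch of reading that label off a queried matrix, reusing this file's prometheustypes and time imports; the pendingToFiring helper is illustrative only and not part of this PR:

// sketch: report when a given alert first appeared as pending and as firing;
// pending and firing show up as separate ALERTS series distinguished by the
// alertstate label
func pendingToFiring(series prometheustypes.Matrix, alertName string) (pending, firing *time.Time) {
for _, s := range series {
if string(s.Metric[prometheustypes.AlertNameLabel]) != alertName || len(s.Values) == 0 {
continue
}
first := s.Values[0].Timestamp.Time()
switch s.Metric["alertstate"] {
case "pending":
pending = &first
case "firing":
firing = &first
}
}
return pending, firing
}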


ret := []monitorapi.EventInterval{}
ret = append(ret, firingAlerts...)
ret = append(ret, pendingAlerts...)

return ret, nil
}

// Be careful placing things in this list. In many cases, knowing a condition has gone bad is noteworthy when looking
// for related errors in CI runs.
var pendingAlertsToIgnoreForIntervals = sets.NewString(
//"KubeContainerWaiting",
//"AlertmanagerReceiversNotConfigured",
//"KubeJobCompletion",
//"KubeDeploymentReplicasMismatch",
)

func createEventIntervalsForAlerts(ctx context.Context, alerts prometheustypes.Value, startTime time.Time) ([]monitorapi.EventInterval, error) {
ret := []monitorapi.EventInterval{}

switch {
case alerts.Type() == prometheustypes.ValMatrix:
matrixAlert := alerts.(prometheustypes.Matrix)
for _, alert := range matrixAlert {
alertName := alert.Metric[prometheustypes.AlertNameLabel]
// don't skip Watchdog because gaps in watchdog are noteworthy, unexpected, and they do happen.
//if alertName == "Watchdog" {
// continue
//}
// many pending alerts we just don't care about
if alert.Metric["alertstate"] == "pending" {
if pendingAlertsToIgnoreForIntervals.Has(string(alertName)) {
continue
}
}

locator := "alert/" + alertName
if node := alert.Metric["instance"]; len(node) > 0 {
locator += " node/" + node
}
if namespace := alert.Metric["namespace"]; len(namespace) > 0 {
locator += " ns/" + namespace
}
if pod := alert.Metric["pod"]; len(pod) > 0 {
locator += " pod/" + pod
}
if container := alert.Metric["container"]; len(container) > 0 {
locator += " container/" + container
}

alertIntervalTemplate := monitorapi.EventInterval{
Condition: monitorapi.Condition{
Locator: string(locator),
Message: alert.Metric.String(),
},
}
switch {
// as I understand it, pending alerts are cases where all the conditions except "how long has this been happening"
// are met. Pending alerts include the level the eventual alert will have, but they are not errors in and
// of themselves. They are useful to show in time to find patterns of "X fails concurrent with Y"
case alert.Metric["alertstate"] == "pending":
alertIntervalTemplate.Level = monitorapi.Info

case alert.Metric["severity"] == "warning":
alertIntervalTemplate.Level = monitorapi.Warning
case alert.Metric["severity"] == "critical":
alertIntervalTemplate.Level = monitorapi.Error
case alert.Metric["severity"] == "info":
alertIntervalTemplate.Level = monitorapi.Info
default:
alertIntervalTemplate.Level = monitorapi.Error
}

var alertStartTime *time.Time
var lastTime *time.Time
for _, currValue := range alert.Values {
Member: This probably can be offloaded to Prometheus - iiuc count_over_time(ALERTS[<test_duration>:1s]) would return a number of seconds the alert has been found

Contributor Author (@deads2k, Oct 8, 2021), replying to the above: if someone wants to refine later, I won't stop them. This PR needs to merge in the current state though to get some data.
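For reference, a rough sketch of what that count_over_time idea might look like against the same prometheusv1.API client; the query shape and the alertSecondsSeen helper are assumptions for illustration, not code from this PR, and it reuses this file's existing imports:

// sketch: ask Prometheus how many 1s-resolution samples (~seconds) each firing
// alert series was present during the run, instead of walking every sample
// client-side
func alertSecondsSeen(ctx context.Context, client prometheusv1.API, runDuration time.Duration) (prometheustypes.Value, error) {
query := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing"}[%ds:1s])`, int(runDuration.Seconds()))
result, warnings, err := client.Query(ctx, query, time.Now())
if len(warnings) > 0 {
fmt.Printf("#### warnings \n\t%v\n", strings.Join(warnings, "\n\t"))
}
return result, err
}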

currTime := currValue.Timestamp.Time()
if alertStartTime == nil {
alertStartTime = &currTime
}
if lastTime == nil {
lastTime = &currTime
}
// if it has been less than five seconds since we saw this, consider it the same interval and check
// the next time.
if math.Abs(currTime.Sub(*lastTime).Seconds()) < (5 * time.Second).Seconds() {
lastTime = &currTime
continue
}

// if it has been more than five seconds, consider this the start of a new occurrence and add the interval
currAlertInterval := alertIntervalTemplate // shallow copy
currAlertInterval.From = *alertStartTime
currAlertInterval.To = *lastTime
ret = append(ret, currAlertInterval)

// now reset the tracking
alertStartTime = &currTime
lastTime = nil
}

currAlertInterval := alertIntervalTemplate // shallow copy
currAlertInterval.From = *alertStartTime
currAlertInterval.To = *lastTime
ret = append(ret, currAlertInterval)
}

default:
ret = append(ret, monitorapi.EventInterval{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: "alert/all",
Message: fmt.Sprintf("unhandled type: %v", alerts.Type()),
},
From: startTime,
To: time.Now(),
})
}

return ret, nil
}
9 changes: 9 additions & 0 deletions pkg/test/ginkgo/cmd_runsuite.go
@@ -367,6 +367,15 @@ func (opt *Options) Run(suite *TestSuite) error {
sort.Sort(events)
}
}

// add events from alerts so we can create the intervals
alertEventIntervals, err := monitor.CreateEventIntervalsForAlerts(ctx, restConfig, start)
if err != nil {
fmt.Printf("\n\n\n#### alertErr=%v\n", err)
}
events = append(events, alertEventIntervals...)
sort.Sort(events)
Member: does this new sort.Sort call allow us to drop some from earlier in the function (e.g. the one from above after loading from AdditionalEvents_*)?


events.Clamp(start, end)

if len(opt.JUnitDir) > 0 {