Skip to content

Commit

Permalink
Merge pull request #27946 from deads2k/tightent-types
Browse files Browse the repository at this point in the history
hard monitor.Event message creation
  • Loading branch information
openshift-merge-robot committed Jun 1, 2023
2 parents 4224a03 + ae5ffc5 commit dd0b05a
Show file tree
Hide file tree
Showing 21 changed files with 269 additions and 136 deletions.
3 changes: 1 addition & 2 deletions pkg/monitor/apiserveravailability/summarizer.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package apiserveravailability

import (
"fmt"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -37,7 +36,7 @@ func (s *APIServerClientAccessFailureSummary) SummarizeLine(locator, line string
Condition: monitorapi.Condition{
Level: monitorapi.Warning,
Locator: locator,
Message: fmt.Sprintf("reason/iptables-operation-not-permitted %v", line),
Message: monitorapi.Message().Reason(monitorapi.IPTablesNotPermitted).Message(line),
},
From: timeOfLog,
To: timeOfLog.Add(1 * time.Second),
Expand Down
6 changes: 3 additions & 3 deletions pkg/monitor/backenddisruption/disruption_backend_sampler.go
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ func (b *disruptionSampler) consumeSamples(ctx context.Context, interval time.Du
framework.Logf(message)
eventRecorder.Eventf(
&v1.ObjectReference{Kind: "OpenShiftTest", Namespace: "kube-system", Name: b.backendSampler.GetDisruptionBackendName()}, nil,
v1.EventTypeWarning, eventReason, "detected", message)
v1.EventTypeWarning, string(eventReason), "detected", message)
currCondition := monitorapi.Condition{
Level: level,
Locator: b.backendSampler.GetLocator(),
Expand All @@ -598,7 +598,7 @@ func (b *disruptionSampler) consumeSamples(ctx context.Context, interval time.Du
framework.Logf(message)
eventRecorder.Eventf(
&v1.ObjectReference{Kind: "OpenShiftTest", Namespace: "kube-system", Name: b.backendSampler.GetDisruptionBackendName()}, nil,
v1.EventTypeNormal, DisruptionEndedEventReason, "detected", message)
v1.EventTypeNormal, string(monitorapi.DisruptionEndedEventReason), "detected", message)
currCondition := monitorapi.Condition{
Level: monitorapi.Info,
Locator: b.backendSampler.GetLocator(),
Expand All @@ -616,7 +616,7 @@ func (b *disruptionSampler) consumeSamples(ctx context.Context, interval time.Du
framework.Logf(message)
eventRecorder.Eventf(
&v1.ObjectReference{Kind: "OpenShiftTest", Namespace: "kube-system", Name: b.backendSampler.GetDisruptionBackendName()}, nil,
v1.EventTypeWarning, eventReason, "detected", message)
v1.EventTypeWarning, string(eventReason), "detected", message)
currCondition := monitorapi.Condition{
Level: level,
Locator: b.backendSampler.GetLocator(),
Expand Down
44 changes: 25 additions & 19 deletions pkg/monitor/backenddisruption/disruption_locator.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,37 +28,43 @@ func DisruptionEndedMessage(locator string, connectionType monitorapi.BackendCon
// Used to downgrade to a warning instead of an error, and omitted from final disruption numbers and testing.
var DnsLookupRegex = regexp.MustCompile(`dial tcp: lookup.*: i/o timeout`)

const (
DisruptionBeganEventReason = "DisruptionBegan"
DisruptionEndedEventReason = "DisruptionEnded"
DisruptionSamplerOutageBeganEventReason = "DisruptionSamplerOutageBegan"
)

// DisruptionBegan examines the error received and attempts to determine whether it reflects real disruption to the cluster under test
// or a problem on the system running the tests/monitor (e.g. DNS lookup timeouts). It returns an appropriate user message, event reason, and monitoring level.
func DisruptionBegan(locator string, connectionType monitorapi.BackendConnectionType, err error) (string, string, monitorapi.EventLevel) {
func DisruptionBegan(locator string, connectionType monitorapi.BackendConnectionType, err error) (string, monitorapi.IntervalReason, monitorapi.EventLevel) {
if DnsLookupRegex.MatchString(err.Error()) {
switch connectionType {
case monitorapi.NewConnectionType:
return fmt.Sprintf("reason/%s DNS lookup timeouts began for %s GET requests over new connections: %v (likely a problem in cluster running tests, not the cluster under test)",
DisruptionSamplerOutageBeganEventReason, locator, err), DisruptionSamplerOutageBeganEventReason, monitorapi.Warning
return monitorapi.Message().
Reason(monitorapi.DisruptionSamplerOutageBeganEventReason).
Messagef("DNS lookup timeouts began for %s GET requests over new connections: %v (likely a problem in cluster running tests, not the cluster under test)", locator, err),
monitorapi.DisruptionSamplerOutageBeganEventReason, monitorapi.Warning
case monitorapi.ReusedConnectionType:
return fmt.Sprintf("reason/%s DNS lookup timeouts began for %s GET requests over reused connections: %v (likely a problem in cluster running tests, not the cluster under test)",
DisruptionSamplerOutageBeganEventReason, locator, err), DisruptionSamplerOutageBeganEventReason, monitorapi.Warning
return monitorapi.Message().
Reason(monitorapi.DisruptionSamplerOutageBeganEventReason).
Messagef("DNS lookup timeouts began for %s GET requests over reused connections: %v (likely a problem in cluster running tests, not the cluster under test)", locator, err),
monitorapi.DisruptionSamplerOutageBeganEventReason, monitorapi.Warning
default:
return fmt.Sprintf("reason/%s DNS lookup timeouts began for %s GET requests over %v connections: %v (likely a problem in cluster running tests, not the cluster under test)",
DisruptionSamplerOutageBeganEventReason, locator, "Unknown", err), DisruptionSamplerOutageBeganEventReason, monitorapi.Warning
return monitorapi.Message().
Reason(monitorapi.DisruptionSamplerOutageBeganEventReason).
Messagef("DNS lookup timeouts began for %s GET requests over %v connections: %v (likely a problem in cluster running tests, not the cluster under test)", locator, "Unknown", err),
monitorapi.DisruptionSamplerOutageBeganEventReason, monitorapi.Warning
}
}
switch connectionType {
case monitorapi.NewConnectionType:
return fmt.Sprintf("reason/%s %s stopped responding to GET requests over new connections: %v",
DisruptionBeganEventReason, locator, err), DisruptionBeganEventReason, monitorapi.Error
return monitorapi.Message().
Reason(monitorapi.DisruptionBeganEventReason).
Messagef("%s stopped responding to GET requests over new connections: %v", locator, err),
monitorapi.DisruptionBeganEventReason, monitorapi.Error
case monitorapi.ReusedConnectionType:
return fmt.Sprintf("reason/%s %s stopped responding to GET requests over reused connections: %v",
DisruptionBeganEventReason, locator, err), DisruptionBeganEventReason, monitorapi.Error
return monitorapi.Message().
Reason(monitorapi.DisruptionBeganEventReason).
Messagef("%s stopped responding to GET requests over reused connections: %v", locator, err),
monitorapi.DisruptionBeganEventReason, monitorapi.Error
default:
return fmt.Sprintf("reason/%s %s stopped responding to GET requests over %v connections: %v",
DisruptionBeganEventReason, locator, "Unknown", err), DisruptionBeganEventReason, monitorapi.Error
return monitorapi.Message().
Reason(monitorapi.DisruptionBeganEventReason).
Messagef("%s stopped responding to GET requests over %v connections: %v", locator, "Unknown", err),
monitorapi.DisruptionBeganEventReason, monitorapi.Error
}
}
18 changes: 9 additions & 9 deletions pkg/monitor/intervalcreation/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ func readinessFailure(logLine string) monitorapi.Intervals {
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: containerRef.ToLocator(),
Message: monitorapi.ReasonedMessage(monitorapi.ContainerReasonReadinessFailed, message),
Message: monitorapi.Message().Reason(monitorapi.ContainerReasonReadinessFailed).Message(message),
},
From: failureTime,
To: failureTime,
Expand Down Expand Up @@ -319,7 +319,7 @@ func readinessError(logLine string) monitorapi.Intervals {
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: containerRef.ToLocator(),
Message: monitorapi.ReasonedMessage(monitorapi.ContainerReasonReadinessErrored, message),
Message: monitorapi.Message().Reason(monitorapi.ContainerReasonReadinessErrored).Message(message),
},
From: failureTime,
To: failureTime,
Expand All @@ -345,7 +345,7 @@ func errParsingSignature(logLine string) monitorapi.Intervals {
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: containerRef.ToLocator(),
Message: monitorapi.ReasonedMessage(monitorapi.ContainerErrImagePull, monitorapi.ContainerUnrecognizedSignatureFormat),
Message: monitorapi.Message().Reason(monitorapi.ContainerErrImagePull).Cause(monitorapi.ContainerUnrecognizedSignatureFormat).NoDetails(),
},
From: failureTime,
To: failureTime,
Expand Down Expand Up @@ -390,7 +390,7 @@ func startupProbeError(logLine string) monitorapi.Intervals {
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: containerRef.ToLocator(),
Message: monitorapi.ReasonedMessage(monitorapi.ContainerReasonStartupProbeFailed, message),
Message: monitorapi.Message().Reason(monitorapi.ContainerReasonStartupProbeFailed).Message(message),
},
From: failureTime,
To: failureTime,
Expand Down Expand Up @@ -492,7 +492,7 @@ func failedToDeleteCGroupsPath(nodeLocator, logLine string) monitorapi.Intervals
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: nodeLocator,
Message: monitorapi.ReasonedMessage("FailedToDeleteCGroupsPath", logLine),
Message: monitorapi.Message().Reason("FailedToDeleteCGroupsPath").Message(logLine),
},
From: failureTime,
To: failureTime.Add(1 * time.Second),
Expand All @@ -512,7 +512,7 @@ func anonymousCertConnectionError(nodeLocator, logLine string) monitorapi.Interv
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: nodeLocator,
Message: monitorapi.ReasonedMessage("FailedToAuthenticateWithOpenShiftUser", logLine),
Message: monitorapi.Message().Reason("FailedToAuthenticateWithOpenShiftUser").Message(logLine),
},
From: failureTime,
To: failureTime.Add(1 * time.Second),
Expand All @@ -532,7 +532,7 @@ func kubeletNodeHttpClientConnectionLostError(logLine string) monitorapi.Interva
return nil
}

return commonErrorInterval(logLine, statusOutputRegex, "HttpClientConnectionLost", func() string {
return commonErrorInterval(logLine, statusOutputRegex, monitorapi.HttpClientConnectionLost, func() string {
nodeRefRegex.MatchString(logLine)
if !nodeRefRegex.MatchString(logLine) {
return ""
Expand All @@ -542,7 +542,7 @@ func kubeletNodeHttpClientConnectionLostError(logLine string) monitorapi.Interva

}

func commonErrorInterval(logLine string, messageExp *regexp.Regexp, reason string, locator func() string) monitorapi.Intervals {
func commonErrorInterval(logLine string, messageExp *regexp.Regexp, reason monitorapi.IntervalReason, locator func() string) monitorapi.Intervals {
messageExp.MatchString(logLine)
if !messageExp.MatchString(logLine) {
return nil
Expand All @@ -561,7 +561,7 @@ func commonErrorInterval(logLine string, messageExp *regexp.Regexp, reason strin
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: locator(),
Message: monitorapi.ReasonedMessage(reason, message),
Message: monitorapi.Message().Reason(reason).Message(message),
},
From: failureTime,
To: failureTime,
Expand Down
2 changes: 1 addition & 1 deletion pkg/monitor/intervalcreation/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func TestMonitorApiIntervals(t *testing.T) {
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: "ns/openshift-e2e-loki pod/loki-promtail-plm74 uid/59b26cbf-3421-407c-98ee-986b5a091ef4 container/oauth-proxy",
Message: "reason/ErrImagePull UnrecognizedSignatureFormat",
Message: "cause/UnrecognizedSignatureFormat reason/ErrImagePull",
},
From: systemdJournalLogTime("Feb 01 05:37:45.731611"),
To: systemdJournalLogTime("Feb 01 05:37:45.731611"),
Expand Down
11 changes: 6 additions & 5 deletions pkg/monitor/intervalcreation/pod.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package intervalcreation

import (
"fmt"
"sort"
"time"

Expand Down Expand Up @@ -443,7 +442,7 @@ type timeBounder interface {
getEndTime(locator string) time.Time
}

func buildTransitionsForCategory(locatorToConditions map[string][]monitorapi.EventInterval, startReason, endReason string, timeBounder timeBounder) monitorapi.Intervals {
func buildTransitionsForCategory(locatorToConditions map[string][]monitorapi.EventInterval, startReason, endReason monitorapi.IntervalReason, timeBounder timeBounder) monitorapi.Intervals {
ret := monitorapi.Intervals{}
// now step through each category and build the to/from interval
for locator, instantEvents := range locatorToConditions {
Expand All @@ -454,12 +453,14 @@ func buildTransitionsForCategory(locatorToConditions map[string][]monitorapi.Eve
hasPrev := len(prevEvent.Message) > 0
currEvent := instantEvents[i]
currReason := monitorapi.ReasonFrom(currEvent.Message)
prevAnnotations := monitorapi.AnnotationsFromMessage(prevEvent.Message)
prevBareMessage := monitorapi.NonAnnotationMessage(prevEvent.Message)

nextInterval := monitorapi.EventInterval{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: locator,
Message: "constructed/true " + prevEvent.Message,
Message: monitorapi.Message().Constructed().WithAnnotations(prevAnnotations).Message(prevBareMessage),
},
From: prevEvent.From,
To: currEvent.From,
Expand All @@ -478,7 +479,7 @@ func buildTransitionsForCategory(locatorToConditions map[string][]monitorapi.Eve
case !hasPrev && currReason != startReason:
// we missed the startReason (it probably happened before the watch was established).
// adjust the message to indicate that we missed the start event for this locator
nextInterval.Message = "constructed/true " + monitorapi.ReasonedMessage(startReason, fmt.Sprintf("missed real %q", startReason))
nextInterval.Message = monitorapi.Message().Constructed().Reason(startReason).Messagef("missed real %q", startReason)
}

// if the current reason is a logical ending point, reset to an empty previous
Expand All @@ -494,7 +495,7 @@ func buildTransitionsForCategory(locatorToConditions map[string][]monitorapi.Eve
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: locator,
Message: "constructed/true " + prevEvent.Message,
Message: monitorapi.ExpandMessage(prevEvent.Message).Constructed().NoDetails(),
},
From: prevEvent.From,
To: timeBounder.getEndTime(locator),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
{
"level": "Info",
"locator": "ns/e2e-kubectl-3271 pod/without-label uid/e185b70c-ea3e-4600-850a-b2370a729a73",
"message": "constructed/true reason/Created ",
"message": "constructed/true reason/Created",
"from": "2022-03-07T18:41:46Z",
"to": "2022-03-07T18:41:46Z"
},
{
"level": "Info",
"locator": "ns/e2e-kubectl-3271 pod/without-label uid/e185b70c-ea3e-4600-850a-b2370a729a73",
"message": "constructed/true reason/Scheduled node/ip-10-0-141-9.us-west-2.compute.internal",
"message": "constructed/true node/ip-10-0-141-9.us-west-2.compute.internal reason/Scheduled",
"from": "2022-03-07T18:41:46Z",
"to": "2022-03-07T18:41:54Z"
},
Expand All @@ -31,14 +31,14 @@
{
"level": "Info",
"locator": "ns/e2e-kubectl-3271 pod/without-label uid/e185b70c-ea3e-4600-850a-b2370a729a73 container/without-label",
"message": "constructed/true reason/ContainerStart cause/ duration/6.00s",
"message": "cause/ constructed/true duration/6.00s reason/ContainerStart",
"from": "2022-03-07T18:41:52Z",
"to": "2022-03-07T18:41:54Z"
},
{
"level": "Info",
"locator": "ns/e2e-kubectl-3271 pod/without-label uid/e185b70c-ea3e-4600-850a-b2370a729a73 container/without-label",
"message": "constructed/true reason/Ready ",
"message": "constructed/true reason/Ready",
"from": "2022-03-07T18:41:52Z",
"to": "2022-03-07T18:41:54Z"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
{
"level": "Info",
"locator": "ns/openshift-kube-apiserver pod/revision-pruner-7-ip-10-0-214-214.us-west-1.compute.internal uid/6a81964d-169c-47e0-a986-551429370ae9",
"message": "constructed/true reason/Created ",
"message": "constructed/true node/ip-10-0-214-214.us-west-1.compute.internal reason/Scheduled",
"from": "2022-03-21T16:43:14Z",
"to": "2022-03-21T16:43:14Z"
},
{
"level": "Info",
"locator": "ns/openshift-kube-apiserver pod/revision-pruner-7-ip-10-0-214-214.us-west-1.compute.internal uid/6a81964d-169c-47e0-a986-551429370ae9",
"message": "constructed/true reason/Scheduled node/ip-10-0-214-214.us-west-1.compute.internal",
"message": "constructed/true reason/Created",
"from": "2022-03-21T16:43:14Z",
"to": "2022-03-21T16:43:14Z"
},
{
"level": "Info",
"locator": "ns/openshift-kube-apiserver pod/revision-pruner-7-ip-10-0-214-214.us-west-1.compute.internal uid/6a81964d-169c-47e0-a986-551429370ae9 container/pruner",
"message": "constructed/true reason/NotReady ",
"message": "constructed/true reason/NotReady",
"from": "2022-03-21T16:43:14Z",
"to": "2022-03-21T16:43:14Z"
}
Expand Down
12 changes: 6 additions & 6 deletions pkg/monitor/intervalcreation/podTest/installer-pod/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
{
"level": "Info",
"locator": "ns/openshift-etcd pod/installer-9-ci-op-97t906zm-db044-bwrrn-master-0 uid/6fb10c53-7ed9-4f51-88db-f8a689050f21",
"message": "constructed/true reason/Created ",
"message": "constructed/true reason/Created",
"from": "2022-03-21T21:37:20Z",
"to": "2022-03-21T21:37:20Z"
},
{
"level": "Info",
"locator": "ns/openshift-etcd pod/installer-9-ci-op-97t906zm-db044-bwrrn-master-0 uid/6fb10c53-7ed9-4f51-88db-f8a689050f21",
"message": "constructed/true reason/Scheduled node/ci-op-97t906zm-db044-bwrrn-master-0",
"message": "constructed/true node/ci-op-97t906zm-db044-bwrrn-master-0 reason/Scheduled",
"from": "2022-03-21T21:37:20Z",
"to": "2022-03-21T21:37:56Z"
},
Expand All @@ -24,28 +24,28 @@
{
"level": "Info",
"locator": "ns/openshift-etcd pod/installer-9-ci-op-97t906zm-db044-bwrrn-master-0 uid/6fb10c53-7ed9-4f51-88db-f8a689050f21 container/installer",
"message": "constructed/true reason/NotReady ",
"message": "constructed/true reason/NotReady",
"from": "2022-03-21T21:37:23Z",
"to": "2022-03-21T21:37:23Z"
},
{
"level": "Info",
"locator": "ns/openshift-etcd pod/installer-9-ci-op-97t906zm-db044-bwrrn-master-0 uid/6fb10c53-7ed9-4f51-88db-f8a689050f21 container/installer",
"message": "constructed/true reason/ContainerStart cause/ duration/3.00s",
"message": "cause/ constructed/true duration/3.00s reason/ContainerStart",
"from": "2022-03-21T21:37:23Z",
"to": "2022-03-21T21:37:56Z"
},
{
"level": "Info",
"locator": "ns/openshift-etcd pod/installer-9-ci-op-97t906zm-db044-bwrrn-master-0 uid/6fb10c53-7ed9-4f51-88db-f8a689050f21 container/installer",
"message": "constructed/true reason/Ready ",
"message": "constructed/true reason/Ready",
"from": "2022-03-21T21:37:23Z",
"to": "2022-03-21T21:37:56Z"
},
{
"level": "Info",
"locator": "ns/openshift-etcd pod/installer-9-ci-op-97t906zm-db044-bwrrn-master-0 uid/6fb10c53-7ed9-4f51-88db-f8a689050f21 container/installer",
"message": "constructed/true reason/NotReady ",
"message": "constructed/true reason/NotReady",
"from": "2022-03-21T21:37:56Z",
"to": "2022-03-21T21:37:56Z"
}
Expand Down

0 comments on commit dd0b05a

Please sign in to comment.