Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions cmd/release-controller/splay_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package main

import (
	"hash/fnv"
	"strconv"
	"testing"
	"time"
)

// splayDelay mirrors the delay computation in ensureProwJobForReleaseTag
func splayDelay(tagName, prowJobName string, splay time.Duration) time.Duration {
h := fnv.New32a()
h.Write([]byte(tagName + "/" + prowJobName))
return time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second
}

// TestSplayDelay verifies the deterministic splay-delay computation used to
// stagger prow job launches for a release payload: same inputs give the same
// delay, all delays fall in [0, splay), and distinct jobs/payloads spread out.
func TestSplayDelay(t *testing.T) {
	splay := 15 * time.Minute

	// The controller re-computes the delay on every sync loop, so identical
	// inputs must always map to an identical delay.
	t.Run("deterministic", func(t *testing.T) {
		d1 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "job-a", splay)
		d2 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "job-a", splay)
		if d1 != d2 {
			t.Errorf("same inputs produced different delays: %v vs %v", d1, d2)
		}
	})

	// Every computed delay must stay within the half-open interval [0, splay).
	t.Run("within bounds", func(t *testing.T) {
		for i := range 100 {
			tag := "4.22.0-0.nightly-2026-03-11-124441"
			// strconv.Itoa keeps job names readable and distinct; the prior
			// string(rune('a'+i)) produced punctuation/non-ASCII for i > 25.
			job := "job-" + strconv.Itoa(i)
			d := splayDelay(tag, job, splay)
			if d < 0 || d >= splay {
				t.Errorf("delay %v out of range [0, %v) for job %s", d, splay, job)
			}
		}
	})

	t.Run("different jobs get different delays", func(t *testing.T) {
		tag := "4.22.0-0.nightly-2026-03-11-124441"
		delays := make(map[time.Duration]int)
		for i := range 30 {
			job := "aggregated-e2e-aws-analysis-" + strconv.Itoa(i)
			d := splayDelay(tag, job, splay)
			delays[d]++
		}
		// With 30 jobs across 900 seconds, we should get reasonable distribution.
		// At minimum, not all delays should be identical.
		if len(delays) < 2 {
			t.Errorf("expected varied delays across 30 jobs, got %d distinct values", len(delays))
		}
	})

	t.Run("different payloads get different delays for same job", func(t *testing.T) {
		d1 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "e2e-aws", splay)
		d2 := splayDelay("4.22.0-0.nightly-2026-03-12-060000", "e2e-aws", splay)
		// A collision is possible (900 buckets) but unlikely, so only log it.
		if d1 == d2 {
			t.Logf("note: same delay for different payloads (possible but unlikely): %v", d1)
		}
	})

	// Note: when splay is 0, ensureProwJobForReleaseTag skips the splay gate
	// entirely (guarded by `if splay > 0`), so splayDelay is never called
	// with a zero duration.
}
15 changes: 11 additions & 4 deletions cmd/release-controller/sync_analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ const (
defaultAggregateProwJobName = "release-openshift-release-analysis-aggregator"
)

func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, verifyName string, verifyType releasecontroller.ReleaseVerification, releaseTag *imagev1.TagReference, previousTag, previousReleasePullSpec string) error {
// launchAnalysisJobs creates analysis jobs for an aggregated verification step.
// It returns true if all analysis jobs have been created (or already exist), and
// false if some jobs were deferred by splay and still need to be created later.
func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, verifyName string, verifyType releasecontroller.ReleaseVerification, releaseTag *imagev1.TagReference, previousTag, previousReleasePullSpec string) (bool, error) {
jobLabels := map[string]string{
releasecontroller.ProwJobLabelCapability: "rce",
"release.openshift.io/analysis": releaseTag.Name,
Expand All @@ -25,15 +28,19 @@ func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, veri
copied := verifyType.DeepCopy()
copied.AggregatedProwJob.AnalysisJobCount = 0

allCreated := true
for i := range verifyType.AggregatedProwJob.AnalysisJobCount {
// Postfix the name to differentiate it from the aggregator job
jobNameSuffix := fmt.Sprintf("analysis-%d", i)
_, err := c.ensureProwJobForReleaseTag(release, verifyName, jobNameSuffix, *copied, releaseTag, previousTag, previousReleasePullSpec, jobLabels)
job, err := c.ensureProwJobForReleaseTag(release, verifyName, jobNameSuffix, *copied, releaseTag, previousTag, previousReleasePullSpec, jobLabels)
if err != nil {
return err
return false, err
}
if job == nil {
allCreated = false
}
}
return nil
return allCreated, nil
}

func addAnalysisEnvToProwJobSpec(spec *prowjobv1.ProwJobSpec, payloadTag, verificationJobName string) (bool, error) {
Expand Down
2 changes: 2 additions & 0 deletions cmd/release-controller/sync_upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ func (c *Controller) ensureReleaseUpgradeJobs(release *releasecontroller.Release
if err != nil {
return err
}
// nil job with nil error means splay deferred the job; the controller
// will re-process and create it when the delay elapses.
}
return nil
}
Expand Down
18 changes: 17 additions & 1 deletion cmd/release-controller/sync_verify.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,32 @@ func (c *Controller) ensureVerificationJobs(release *releasecontroller.Release,
releasecontroller.ReleaseLabelPayload: releaseTag.Name,
}
if verifyType.AggregatedProwJob != nil {
err := c.launchAnalysisJobs(release, name, verifyType, releaseTag, previousTag, previousReleasePullSpec)
allAnalysisCreated, err := c.launchAnalysisJobs(release, name, verifyType, releaseTag, previousTag, previousReleasePullSpec)
if err != nil {
return nil, err
}
if !allAnalysisCreated {
// Some analysis jobs are still deferred by splay; don't
// launch the aggregator yet. Re-queue to check again.
klog.V(4).Infof("Splay: deferring aggregator for %s/%s until all analysis jobs are created", releaseTag.Name, name)
if retryQueueDelay == 0 || 30*time.Second < retryQueueDelay {
retryQueueDelay = 30 * time.Second
}
continue
}
jobLabels["release.openshift.io/aggregator"] = releaseTag.Name
}
job, err := c.ensureProwJobForReleaseTag(release, name, jobNameSuffix, verifyType, releaseTag, previousTag, previousReleasePullSpec, jobLabels)
if err != nil {
return nil, err
}
if job == nil {
// Job creation was deferred by splay; re-queue to check again
if retryQueueDelay == 0 || 30*time.Second < retryQueueDelay {
retryQueueDelay = 30 * time.Second
}
continue
}
status, ok := releasecontroller.ProwJobVerificationStatus(job)
if !ok {
return nil, fmt.Errorf("unexpected error accessing prow job definition")
Expand Down
20 changes: 20 additions & 0 deletions cmd/release-controller/sync_verify_prow.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"bytes"
"context"
"fmt"
"hash/fnv"
"strconv"
"strings"
"time"

"github.com/openshift/release-controller/pkg/prow"

Expand Down Expand Up @@ -55,6 +57,24 @@ func (c *Controller) ensureProwJobForReleaseTag(release *releasecontroller.Relea
return obj.(*unstructured.Unstructured), nil
}

// If splay is configured, defer job creation by a deterministic random delay
// derived from the payload tag and job name. This spreads out job launches to
// reduce the blast radius of transient infrastructure issues.
splay := time.Duration(release.Config.Splay)
if splay > 0 {
tagCreated, parseErr := time.Parse(time.RFC3339, releaseTag.Annotations[releasecontroller.ReleaseAnnotationCreationTimestamp])
if parseErr == nil {
h := fnv.New32a()
h.Write([]byte(releaseTag.Name + "/" + prowJobName))
delay := time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second
remaining := delay - time.Since(tagCreated)
if remaining > 0 {
klog.V(4).Infof("Splay: deferring job %s for %s (delay %s, tag created %s)", prowJobName, remaining.Truncate(time.Second), delay.Truncate(time.Second), tagCreated.Format(time.RFC3339))
return nil, nil
}
}
}

// It is very important that you do not modify anything returned from c.prowConfigLoader.Config():
// https://github.com/kubernetes/test-infra/blob/a68e557705be5e4b7d3cf45bb4fe1e93b82c5682/prow/config/agent.go#L389
// These are in-memory copies of the mounted prow configurations and changing them results in subsequent lookups that
Expand Down
7 changes: 7 additions & 0 deletions pkg/release-controller/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ type ReleaseConfig struct {
// should be expired and removed. If unset, tags are not expired.
Expires utils.Duration `json:"expires"`

// Splay is the maximum random offset as a golang duration applied to verification
// job start times. Each job is assigned a deterministic random delay in [0, splay)
// based on the payload tag and job name. This spreads out job launches to reduce the
// blast radius of transient infrastructure issues like registry outages. If unset,
// all jobs start immediately.
Splay utils.Duration `json:"splay,omitempty"`

// Verify is a map of short names to verification steps that must succeed before the
// release is Accepted. Failures for some job types will cause the release to be
// rejected.
Expand Down