Merge pull request #290 from gabemontero/retry-on-failed-import-metric
[BUILD-92] add import retry count into metrics
openshift-merge-robot committed Jun 29, 2020
2 parents 4177965 + 6c5a87d commit d43c691
Showing 4 changed files with 152 additions and 77 deletions.
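At a high level, the commit defines a new Prometheus counter, openshift_samples_retry_imagestream_import_total, registers it alongside the operator's existing metrics, and increments it each time an imagestream import retry is initiated. A condensed sketch of that pattern, pulled together from the diff below (names match the diff; surrounding code is omitted):

// Condensed sketch of what this commit adds, using the same client_golang API as the repo.
var importRetryStat = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "openshift_samples_retry_imagestream_import_total",
	Help: "Indicates the number of retries on imagestream import.",
})

func init() {
	prometheus.MustRegister(importRetryStat)
}

// ImageStreamImportRetry is called from the import-retry code path in pkg/stub/imagestreams.go.
func ImageStreamImportRetry() {
	importRetryStat.Inc()
}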
14 changes: 13 additions & 1 deletion pkg/metrics/metrics.go
@@ -24,6 +24,9 @@ const (
// we went with the _info suffix vs. the _total suffix
failedImportsQuery = "openshift_samples_failed_imagestream_import_info"

// the one current exception to the _info classification
importRetryQuery = "openshift_samples_retry_imagestream_import_total"

degradedQuery = "openshift_samples_degraded_info"

invalidConfigQuery = "openshift_samples_invalidconfig_info"
@@ -61,6 +64,15 @@ var (
Name: tbrInaccessibleOnBootstrapQuery,
Help: "Indicates that during its initial installation the samples operator could not access registry.redhat.io and it boostrapped as removed.",
})
// we looked into an imagestream name label on this metric, but various prometheus guidance dissuaded that approach;
// we also looked into a constant metric like importsFailedDesc, but that requires more invasive changes to track the
// retry in our CRD;
// for now, with the failedImport metric having the name label and this sanity check, we get sufficient validation
// that we are retrying - we can re-evaluate based on need and adoption in telemetry / OTA analysis / insights etc.
importRetryStat = prometheus.NewCounter(prometheus.CounterOpts{
Name: importRetryQuery,
Help: "Indicates the number of retries on imagestream import.",
})

sc = samplesCollector{}
registered = false
@@ -184,7 +196,7 @@ func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, name stri
}

func init() {
prometheus.MustRegister(degradedStat, configInvalidStat, tbrInaccessibleOnBootStat)
prometheus.MustRegister(degradedStat, configInvalidStat, tbrInaccessibleOnBootStat, importRetryStat)
}

func InitializeMetricsCollector(listers *client.Listers) {
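The comment above importRetryStat weighs, and rejects, a per-imagestream label on this counter. For contrast only, that labeled alternative would look roughly like the CounterVec below - a hypothetical sketch, not what the commit does, and recordRetry is an invented helper name:

// Hypothetical labeled variant the comment dissuades: one series per imagestream name.
var importRetryByName = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "openshift_samples_retry_imagestream_import_total",
	Help: "Number of imagestream import retries, labeled by imagestream.",
}, []string{"name"})

// Each distinct label value creates its own time series, which is the
// cardinality concern the prometheus guidance warns about.
func recordRetry(imagestream string) {
	importRetryByName.WithLabelValues(imagestream).Inc()
}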
4 changes: 4 additions & 0 deletions pkg/metrics/server.go
@@ -81,3 +81,7 @@ func TBRInaccessibleOnBoot(badTBR bool) {
}
tbrInaccessibleOnBootStat.Set(0)
}

func ImageStreamImportRetry() {
importRetryStat.Inc()
}
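ImageStreamImportRetry is the only public entry point for the new counter; the actual call site is in pkg/stub/imagestreams.go further down. As an aside, a counter like this can also be checked in-process with client_golang's testutil package rather than by scraping the HTTPS endpoint as the tests below do - a hypothetical sketch, not part of this commit, assuming the prometheus/client_golang/prometheus/testutil dependency is available to this package:

// Hypothetical in-process check of the counter; the commit itself exercises
// the metric through the metrics server instead.
package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestImportRetryCounterDirect(t *testing.T) {
	before := testutil.ToFloat64(importRetryStat)
	ImageStreamImportRetry()
	if after := testutil.ToFloat64(importRetryStat); after != before+1 {
		t.Fatalf("expected counter to increase by 1, got %v -> %v", before, after)
	}
}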
209 changes: 133 additions & 76 deletions pkg/metrics/server_test.go
@@ -13,16 +13,21 @@ import (
mr "math/rand"
"net/http"
"os"
"sync"
"testing"
"time"

io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
)

var portMap = sync.Map{}

func TestMain(m *testing.M) {
var err error

mr.Seed(time.Now().UnixNano())

tlsKey, tlsCRT, err = generateTempCertificates()
if err != nil {
panic(err)
@@ -39,9 +44,22 @@ func TestMain(m *testing.M) {
os.Exit(code)
}

func generatePort() int {
mr.Seed(time.Now().UnixNano())
return mr.Intn(4000) + 6000
func generatePort(t *testing.T) int {
port := 0
for i := 0; i < 200; i++ {
port = mr.Intn(4000) + 6000
_, alreadyLoaded := portMap.LoadOrStore(port, port)
if alreadyLoaded {
t.Logf("port %v in use, trying again", port)
time.Sleep(1 * time.Second)
continue
}
break
}
if port == 0 {
t.Fatal("could not get unique metrics port")
}
return port
}

func generateTempCertificates() (string, string, error) {
@@ -84,7 +102,7 @@ func generateTempCertificates() (string, string, error) {
func TestRun(t *testing.T) {
ch := make(chan struct{})
defer close(ch)
port := generatePort()
port := generatePort(t)
srv := BuildServer(port)
go RunServer(srv, ch)

@@ -98,81 +116,45 @@ func TestRun(t *testing.T) {
}
}

func TestBinaryMetrics(t *testing.T) {
func runServer(t *testing.T) (chan struct{}, int) {
ch := make(chan struct{})
defer close(ch)
port := generatePort()
port := generatePort(t)
srv := BuildServer(port)
go RunServer(srv, ch)
for _, t1 := range []struct {
method func(bool)
query string
}{
{
method: Degraded,
query: degradedQuery,
},
{
method: ConfigInvalid,
query: invalidConfigQuery,
},
{
method: TBRInaccessibleOnBoot,
query: tbrInaccessibleOnBootstrapQuery,
},
} {
for _, tt := range []struct {
name string
iter int
val bool
expt float64
}{
{
name: "single set to true",
iter: 1,
val: true,
expt: 1,
},
{
name: "single set to false",
iter: 1,
val: false,
expt: 0,
},
{
name: "multiple set to true",
iter: 2,
val: true,
expt: 1,
},
{
name: "multiple set to false",
iter: 2,
val: false,
expt: 0,
},
} {
t.Run(tt.name, func(t *testing.T) {
for i := 0; i < tt.iter; i++ {
t1.method(tt.val)
}

resp, err := http.Get(fmt.Sprintf("https://localhost:%d/metrics", port))
if err != nil {
t.Fatalf("error requesting metrics server: %v", err)
}

metrics := findMetricsByCounter(resp.Body, t1.query)
if len(metrics) == 0 {
t.Fatal("unable to locate metric", t1.query)
}

val := *metrics[0].Gauge.Value
if val != tt.expt {
t.Errorf("expected %.0f, found %.0f", tt.expt, val)
}
})
}
return ch, port
}

func testQueryGaugeMetric(t *testing.T, port, value int, query string) {
resp, err := http.Get(fmt.Sprintf("https://localhost:%d/metrics", port))
if err != nil {
t.Fatalf("error requesting metrics server: %v", err)
}
metrics := findMetricsByCounter(resp.Body, query)
if len(metrics) == 0 {
t.Fatalf("unable to locate metric %s", query)
}
if metrics[0].Gauge.Value == nil {
t.Fatalf("metric did not have value %s", query)
}
if *metrics[0].Gauge.Value != float64(value) {
t.Fatalf("incorrect metric value %v for query %s", *metrics[0].Gauge.Value, query)
}
}

func testQueryCounterMetric(t *testing.T, port, value int, query string) {
resp, err := http.Get(fmt.Sprintf("https://localhost:%d/metrics", port))
if err != nil {
t.Fatalf("error requesting metrics server: %v", err)
}
metrics := findMetricsByCounter(resp.Body, query)
if len(metrics) == 0 {
t.Fatalf("unable to locate metric %s", query)
}
if metrics[0].Counter.Value == nil {
t.Fatalf("metric did not have value %s", query)
}
if *metrics[0].Counter.Value != float64(value) {
t.Fatalf("incorrect metric value %v for query %s", *metrics[0].Counter.Value, query)
}
}

Expand All @@ -187,3 +169,78 @@ func findMetricsByCounter(buf io.ReadCloser, name string) []*io_prometheus_clien
}
return nil
}
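The body of findMetricsByCounter is collapsed in this view; only its closing braces survive above. For orientation, a helper with that shape can be built on the expfmt text parser already imported in this file - the following is a hypothetical reconstruction under that assumption, not the repository's actual implementation:

// Hypothetical sketch of a findMetricsByCounter-style helper: parse the text
// exposition format and return the metrics belonging to the named family.
// Relies on the io, expfmt, and io_prometheus_client imports in this file.
func findMetricFamilyByName(buf io.ReadCloser, name string) []*io_prometheus_client.Metric {
	defer buf.Close()
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(buf)
	if err != nil {
		return nil
	}
	if family, ok := families[name]; ok {
		return family.GetMetric()
	}
	return nil
}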

func TestDegradedOn(t *testing.T) {
ch, port := runServer(t)
defer close(ch)

Degraded(true)
testQueryGaugeMetric(t, port, 1, degradedQuery)
// makes sure it does not go greater than 1
Degraded(true)
testQueryGaugeMetric(t, port, 1, degradedQuery)
}

func TestDegradedOff(t *testing.T) {
ch, port := runServer(t)
defer close(ch)

Degraded(false)
testQueryGaugeMetric(t, port, 0, degradedQuery)
// makes sure it does not go less than 0
Degraded(false)
testQueryGaugeMetric(t, port, 0, degradedQuery)
}

func TestConfigInvalidOn(t *testing.T) {
ch, port := runServer(t)
defer close(ch)

ConfigInvalid(true)
testQueryGaugeMetric(t, port, 1, invalidConfigQuery)
// makes sure it does not go greater than 1
ConfigInvalid(true)
testQueryGaugeMetric(t, port, 1, invalidConfigQuery)
}

func TestConfigInvalidOff(t *testing.T) {
ch, port := runServer(t)
defer close(ch)

ConfigInvalid(false)
testQueryGaugeMetric(t, port, 0, invalidConfigQuery)
// makes sure it does not go less than 0
ConfigInvalid(false)
testQueryGaugeMetric(t, port, 0, invalidConfigQuery)
}

func TestTBRInaccessibleOnBootOn(t *testing.T) {
ch, port := runServer(t)
defer close(ch)

TBRInaccessibleOnBoot(true)
testQueryGaugeMetric(t, port, 1, tbrInaccessibleOnBootstrapQuery)
// makes sure it does not go greater than 1
TBRInaccessibleOnBoot(true)
testQueryGaugeMetric(t, port, 1, tbrInaccessibleOnBootstrapQuery)
}

func TestTBRInaccessibleOnBootOff(t *testing.T) {
ch, port := runServer(t)
defer close(ch)

TBRInaccessibleOnBoot(false)
testQueryGaugeMetric(t, port, 0, tbrInaccessibleOnBootstrapQuery)
// makes sure it does not go less than 0
TBRInaccessibleOnBoot(false)
testQueryGaugeMetric(t, port, 0, tbrInaccessibleOnBootstrapQuery)
}

func TestImageStreamImportRetry(t *testing.T) {
ch, port := runServer(t)
defer close(ch)
for i := 1; i < 3; i++ {
ImageStreamImportRetry()
testQueryCounterMetric(t, port, i, importRetryQuery)
}
}
2 changes: 2 additions & 0 deletions pkg/stub/imagestreams.go
@@ -16,6 +16,7 @@ import (
v1 "github.com/openshift/api/samples/v1"

"github.com/openshift/cluster-samples-operator/pkg/cache"
"github.com/openshift/cluster-samples-operator/pkg/metrics"
"github.com/openshift/cluster-samples-operator/pkg/util"
)

@@ -467,6 +468,7 @@ func (h *Handler) processImportStatus(is *imagev1.ImageStream, cfg *v1.Config) (
logrus.Warningf("attempted to initiate an imagestreamimport retry for imagestream/tag %s/%s but got err %v; simply moving on", is.Name, statusTag.Tag, err)
break
}
metrics.ImageStreamImportRetry()
logrus.Printf("initiated an imagestreamimport retry for imagestream/tag %s/%s", is.Name, statusTag.Tag)

}
