Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: Verify platform metrics are available #24117

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 27 additions & 2 deletions test/extended/prometheus/prometheus.go
Expand Up @@ -33,7 +33,9 @@ import (
exutil "github.com/openshift/origin/test/extended/util"
)

const waitForPrometheusStartSeconds = 240
const (
maxPrometheusQueryRetries = 5
)

var _ = g.Describe("[Feature:Prometheus][Conformance] Prometheus", func() {
defer g.GinkgoRecover()
Expand Down Expand Up @@ -81,7 +83,7 @@ var _ = g.Describe("[Feature:Prometheus][Conformance] Prometheus", func() {

g.By("checking the unsecured metrics path")
var metrics map[string]*dto.MetricFamily
o.Expect(wait.PollImmediate(10*time.Second, waitForPrometheusStartSeconds*time.Second, func() (bool, error) {
o.Expect(wait.PollImmediate(10*time.Second, 2*time.Minute, func() (bool, error) {
results, err := getInsecureURLViaPod(ns, execPod.Name, fmt.Sprintf("%s/metrics", url))
if err != nil {
e2e.Logf("unable to get unsecured metrics: %v", err)
Expand Down Expand Up @@ -173,6 +175,29 @@ var _ = g.Describe("[Feature:Prometheus][Conformance] Prometheus", func() {

e2e.Logf("Watchdog alert is firing")
})
g.It("should have important platform topology metrics", func() {
oc.SetupProject()
ns := oc.Namespace()
execPod := exutil.CreateCentosExecPodOrFail(oc.AdminKubeClient(), ns, "execpod", nil)
defer func() { oc.AdminKubeClient().CoreV1().Pods(ns).Delete(execPod.Name, metav1.NewDeleteOptions(1)) }()

tests := map[string]bool{
// track infrastructure type
`cluster_infrastructure_provider{type!=""}`: true,
`cluster_feature_set`: true,

// track installer type
`cluster_installer{type!="",invoker!=""}`: true,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How come we do not check that this value is > 0 as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point


// track sum of etcd
`instance:etcd_object_counts:sum > 0`: true,

// track cores and sockets across node types
`sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_kubernetes_io_arch!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
`sum(node_role_os_version_machine:cpu_capacity_sockets:sum{label_kubernetes_io_arch!="",label_node_hyperthread_enabled!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should have non-Pod host cAdvisor metrics", func() {
oc.SetupProject()
ns := oc.Namespace()
Expand Down
50 changes: 36 additions & 14 deletions test/extended/prometheus/prometheus_builds.go
Expand Up @@ -52,7 +52,7 @@ var _ = g.Describe("[Feature:Prometheus][Feature:Builds] Prometheus", func() {
// allow for some retry, a la prometheus.go and its initial hitting of the metrics endpoint after
// instantiating prometheus template
var err error
for i := 0; i < waitForPrometheusStartSeconds; i++ {
for i := 0; i < maxPrometheusQueryRetries; i++ {
err = expectURLStatusCodeExec(ns, execPod.Name, url, 403)
if err == nil {
break
Expand Down Expand Up @@ -100,38 +100,60 @@ type prometheusResponseData struct {

func runQueries(promQueries map[string]bool, oc *exutil.CLI, ns, execPodName, baseURL, bearerToken string) {
// expect all correct metrics within a reasonable time period
errsMap := map[string]error{}
for i := 0; i < waitForPrometheusStartSeconds; i++ {
queryErrors := make(map[string]error)
passed := make(map[string]struct{})
for i := 0; i < maxPrometheusQueryRetries; i++ {
for query, expected := range promQueries {
if _, ok := passed[query]; ok {
continue
}
//TODO when the http/query apis discussed at https://github.com/prometheus/client_golang#client-for-the-prometheus-http-api
// and introduced at https://github.com/prometheus/client_golang/blob/master/api/prometheus/v1/api.go are vendored into
// openshift/origin, look to replace this homegrown http request / query param with that API
g.By("perform prometheus metric query " + query)
contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode()), bearerToken)
url := fmt.Sprintf("%s/api/v1/query?%s", baseURL, (url.Values{"query": []string{query}}).Encode())
contents, err := getBearerTokenURLViaPod(ns, execPodName, url, bearerToken)
o.Expect(err).NotTo(o.HaveOccurred())
result := prometheusResponse{}
json.Unmarshal([]byte(contents), &result)
metrics := result.Data.Result

delete(errsMap, query) // clear out any prior failures
// check query result; if this is a new error, log it, otherwise remain silent
var result prometheusResponse
if err := json.Unmarshal([]byte(contents), &result); err != nil {
e2e.Logf("unable to parse query response for %s: %v", query, err)
continue
}
metrics := result.Data.Result
if result.Status != "success" {
msg := fmt.Sprintf("promQL query: %s had reported incorrect status: %#v", query, metrics)
if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
e2e.Logf("%s", msg)
}
queryErrors[query] = fmt.Errorf(msg)
continue
}
if (len(metrics) > 0 && !expected) || (len(metrics) == 0 && expected) {
dbg := fmt.Sprintf("promQL query: %s had reported incorrect results: %v", query, metrics)
fmt.Fprintf(g.GinkgoWriter, dbg)
errsMap[query] = fmt.Errorf(dbg)
msg := fmt.Sprintf("promQL query: %s had reported incorrect results: %#v", query, metrics)
if prev, ok := queryErrors[query]; !ok || prev.Error() != msg {
e2e.Logf("%s", msg)
}
queryErrors[query] = fmt.Errorf(msg)
continue
}

// query successful
passed[query] = struct{}{}
delete(queryErrors, query)
}

if len(errsMap) == 0 {
if len(queryErrors) == 0 {
break
}
time.Sleep(time.Second)
}

if len(errsMap) != 0 {
if len(queryErrors) != 0 {
exutil.DumpPodLogsStartingWith("prometheus-0", oc)
}
o.Expect(errsMap).To(o.BeEmpty())
o.Expect(queryErrors).To(o.BeEmpty())
}

func startOpenShiftBuild(oc *exutil.CLI, appTemplate string) *exutil.BuildResult {
Expand Down