From ab482618aabb1a85c8607525ef70fc1eb221ddbd Mon Sep 17 00:00:00 2001 From: Kui Wang Date: Fri, 21 Nov 2025 10:11:42 +0800 Subject: [PATCH] Migrate OLM v0 stress test cases --- .../openshift_payload_olmv0.json | 32 ++ tests-extension/test/qe/specs/olmv0_allns.go | 4 +- tests-extension/test/qe/specs/olmv0_common.go | 3 +- .../test/qe/specs/olmv0_defaultoption.go | 3 +- .../test/qe/specs/olmv0_hypershiftmgmt.go | 3 +- .../test/qe/specs/olmv0_multins.go | 3 +- .../test/qe/specs/olmv0_nonallns.go | 22 +- tests-extension/test/qe/specs/olmv0_opm.go | 5 +- tests-extension/test/qe/specs/olmv0_stress.go | 282 ++++++++++++++++++ tests-extension/test/qe/util/client.go | 164 +++++++--- tests-extension/test/qe/util/nodes.go | 20 +- .../test/qe/util/olmv0util/catalog_source.go | 17 +- .../test/qe/util/opmcli/opm_bin.go | 22 +- .../config/pkg-ins/metrics-endpoint.yml | 9 + .../metrics-profiles/metrics-aggregated.yml | 4 + .../manifests/config/pkg-ins/pkg-ins.yml | 72 +++++ .../manifests/config/pkg-ins/templates/og.yml | 7 + .../config/pkg-ins/templates/sub.yml | 10 + .../test/qe/util/stress/util/ma.py | 6 + .../test/qe/util/stress/util/ma/__init__.py | 10 + .../qe/util/stress/util/ma/cli/__init__.py | 0 .../qe/util/stress/util/ma/cli/__main__.py | 8 + .../util/stress/util/ma/cli/cmd_check_ccpu.py | 98 ++++++ .../qe/util/stress/util/ma/cli/cmd_group.py | 47 +++ .../qe/util/stress/util/ma/helper/__init__.py | 0 .../qe/util/stress/util/ma/helper/algo.py | 44 +++ .../qe/util/stress/util/ma/helper/const.py | 6 + .../stress/util/ma/helper/containercpu.py | 173 +++++++++++ .../util/stress/util/ma/helper/exceptions.py | 2 + .../qe/util/stress/util/ma/helper/util.py | 47 +++ 30 files changed, 1064 insertions(+), 59 deletions(-) create mode 100644 tests-extension/test/qe/specs/olmv0_stress.go create mode 100644 tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-endpoint.yml create mode 100644 tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-profiles/metrics-aggregated.yml create mode 100644 tests-extension/test/qe/util/stress/manifests/config/pkg-ins/pkg-ins.yml create mode 100644 tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/og.yml create mode 100644 tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/sub.yml create mode 100755 tests-extension/test/qe/util/stress/util/ma.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/__init__.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/cli/__init__.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/cli/__main__.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/cli/cmd_check_ccpu.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/cli/cmd_group.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/helper/__init__.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/helper/algo.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/helper/const.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/helper/containercpu.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/helper/exceptions.py create mode 100644 tests-extension/test/qe/util/stress/util/ma/helper/util.py diff --git a/tests-extension/.openshift-tests-extension/openshift_payload_olmv0.json b/tests-extension/.openshift-tests-extension/openshift_payload_olmv0.json index b0dfa4e022..771841bb89 100644 --- a/tests-extension/.openshift-tests-extension/openshift_payload_olmv0.json +++ b/tests-extension/.openshift-tests-extension/openshift_payload_olmv0.json @@ -1741,5 +1741,37 @@ "environmentSelector": { "exclude": "topology==\"External\"" } + }, + { + "name": "[sig-operator][Jira:OLM] OLM v0 for stress PolarionID:80299-[OTP][Skipped:Disconnected][OlmStress]create mass operator to see if they all are installed successfully with different ns [Slow][Timeout:180m]", + "labels": { + "Extended": {}, + "NonHyperShiftHOST": {}, + "StressTest": {} + }, + "resources": { + "isolation": {} + }, + "source": "openshift:payload:olmv0", + "lifecycle": "blocking", + "environmentSelector": { + "exclude": "topology==\"External\"" + } + }, + { + "name": "[sig-operator][Jira:OLM] OLM v0 for stress PolarionID:80413-[OTP][Skipped:Disconnected][OlmStress]install operator repeatedly serially with same ns [Slow][Timeout:180m]", + "labels": { + "Extended": {}, + "NonHyperShiftHOST": {}, + "StressTest": {} + }, + "resources": { + "isolation": {} + }, + "source": "openshift:payload:olmv0", + "lifecycle": "blocking", + "environmentSelector": { + "exclude": "topology==\"External\"" + } } ] diff --git a/tests-extension/test/qe/specs/olmv0_allns.go b/tests-extension/test/qe/specs/olmv0_allns.go index 161ef5b9e6..490ca045dd 100644 --- a/tests-extension/test/qe/specs/olmv0_allns.go +++ b/tests-extension/test/qe/specs/olmv0_allns.go @@ -20,13 +20,13 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 within all namespace", func() defer g.GinkgoRecover() var ( - oc = exutil.NewCLI("olm-all-"+exutil.GetRandomString(), exutil.KubeConfigPath()) - + oc = exutil.NewCLIWithoutNamespace("default") dr = make(olmv0util.DescriberResrouce) ) g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() exutil.SkipNoOLMCore(oc) itName := g.CurrentSpecReport().FullText() dr.AddIr(itName) diff --git a/tests-extension/test/qe/specs/olmv0_common.go b/tests-extension/test/qe/specs/olmv0_common.go index a8e90927e6..fb9d715333 100644 --- a/tests-extension/test/qe/specs/olmv0_common.go +++ b/tests-extension/test/qe/specs/olmv0_common.go @@ -28,13 +28,14 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 should", func() { defer g.GinkgoRecover() var ( - oc = exutil.NewCLI("olm-common-"+exutil.GetRandomString(), exutil.KubeConfigPath()) + oc = exutil.NewCLIWithoutNamespace("default") dr = make(olmv0util.DescriberResrouce) ) g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() exutil.SkipNoOLMCore(oc) itName := g.CurrentSpecReport().FullText() dr.AddIr(itName) diff --git a/tests-extension/test/qe/specs/olmv0_defaultoption.go b/tests-extension/test/qe/specs/olmv0_defaultoption.go index 7f9a033121..28394e4fa0 100644 --- a/tests-extension/test/qe/specs/olmv0_defaultoption.go +++ b/tests-extension/test/qe/specs/olmv0_defaultoption.go @@ -33,11 +33,12 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 optional should", func() { defer g.GinkgoRecover() var ( - oc = exutil.NewCLI("default-"+exutil.GetRandomString(), exutil.KubeConfigPath()) + oc = exutil.NewCLIWithoutNamespace("default") ) g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() exutil.SkipNoOLMCore(oc) }) diff --git a/tests-extension/test/qe/specs/olmv0_hypershiftmgmt.go b/tests-extension/test/qe/specs/olmv0_hypershiftmgmt.go index 3eba3b3c99..19d45d2ce6 100644 --- a/tests-extension/test/qe/specs/olmv0_hypershiftmgmt.go +++ b/tests-extension/test/qe/specs/olmv0_hypershiftmgmt.go @@ -18,7 +18,7 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 on hypershift mgmt", g.Label( defer g.GinkgoRecover() var ( - oc = exutil.NewCLIForKubeOpenShift("hypershiftmgmt-" + exutil.GetRandomString()) + oc = exutil.NewCLIWithoutNamespace("default") guestClusterName, guestClusterKube, hostedClusterNS string isAKS bool errIsAKS error @@ -26,6 +26,7 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 on hypershift mgmt", g.Label( g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() if !exutil.IsHypershiftMgmtCluster(oc) { g.Skip("this is not a hypershift management cluster, skip test run") } diff --git a/tests-extension/test/qe/specs/olmv0_multins.go b/tests-extension/test/qe/specs/olmv0_multins.go index 2c055703b6..6af212161a 100644 --- a/tests-extension/test/qe/specs/olmv0_multins.go +++ b/tests-extension/test/qe/specs/olmv0_multins.go @@ -19,13 +19,14 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 with multi ns", func() { defer g.GinkgoRecover() var ( - oc = exutil.NewCLI("olm-multi-"+exutil.GetRandomString(), exutil.KubeConfigPath()) + oc = exutil.NewCLIWithoutNamespace("default") dr = make(olmv0util.DescriberResrouce) ) g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() exutil.SkipNoOLMCore(oc) itName := g.CurrentSpecReport().FullText() diff --git a/tests-extension/test/qe/specs/olmv0_nonallns.go b/tests-extension/test/qe/specs/olmv0_nonallns.go index 310b320ffb..7c3ffd83e1 100644 --- a/tests-extension/test/qe/specs/olmv0_nonallns.go +++ b/tests-extension/test/qe/specs/olmv0_nonallns.go @@ -26,13 +26,14 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 within a namespace", func() { defer g.GinkgoRecover() var ( - oc = exutil.NewCLI("olm-a-"+exutil.GetRandomString(), exutil.KubeConfigPath()) + oc = exutil.NewCLIWithoutNamespace("default") dr = make(olmv0util.DescriberResrouce) ) g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() exutil.SkipNoOLMCore(oc) itName := g.CurrentSpecReport().FullText() dr.AddIr(itName) @@ -1108,7 +1109,22 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 within a namespace", func() { g.By("Delete sa of csv") sa.GetDefinition(oc) sa.Delete(oc) - olmv0util.NewCheck("expect", exutil.AsUser, exutil.WithNamespace, exutil.Compare, "RequirementsNotMet", exutil.Ok, []string{"csv", sub.InstalledCSV, "-o=jsonpath={.status.reason}"}).Check(oc) + var output string + var err error + errCsv := wait.PollUntilContextTimeout(context.TODO(), 10*time.Second, 300*time.Second, false, func(ctx context.Context) (bool, error) { + output, err = oc.WithoutNamespace().Run("get").Args("csv", sub.InstalledCSV, "-n", sub.Namespace, "-o=jsonpath={.status.reason}").Output() + if err != nil { + return false, err + } + if strings.Contains(output, "RequirementsNotMet") { + return true, nil + } + return false, nil + }) + if strings.Contains(output, "InstallWaiting") { + g.Skip("skip because of slow installation") + } + exutil.AssertWaitPollNoErr(errCsv, fmt.Sprintf("csv status %v is not expected", output)) g.By("Recovery sa of csv") sa.Reapply(oc) @@ -1835,6 +1851,8 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 within a namespace", func() { g.By("install operator") sub.Create(oc, itName, dr) + defer sub.DeleteCSV(itName, dr) + defer sub.Delete(itName, dr) g.By("check if dependent operator is installed") olmv0util.NewCheck("expect", exutil.AsAdmin, exutil.WithoutNamespace, exutil.Compare, "Succeeded", exutil.Ok, []string{"csv", sub.InstalledCSV, "-n", sub.Namespace, "-o=jsonpath={.status.phase}"}).Check(oc) diff --git a/tests-extension/test/qe/specs/olmv0_opm.go b/tests-extension/test/qe/specs/olmv0_opm.go index c845460596..e5bf2a2234 100644 --- a/tests-extension/test/qe/specs/olmv0_opm.go +++ b/tests-extension/test/qe/specs/olmv0_opm.go @@ -19,13 +19,14 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 opm should", g.Label("NonHype defer g.GinkgoRecover() var ( - oc = exutil.NewCLIForKubeOpenShift("opm-" + exutil.GetRandomString()) + oc = exutil.NewCLIWithoutNamespace("default") opmCLI = opmcli.NewOpmCLI() ) g.BeforeEach(func() { exutil.SkipMicroshift(oc) + oc.SetupProject() err := opmcli.EnsureOPMBinary() if err != nil { g.Skip("Failed to setup opm binary: " + err.Error()) @@ -500,6 +501,8 @@ var _ = g.Describe("[sig-operator][Jira:OLM] OLMv0 opm should", g.Label("NonHype if os.Getenv("HTTP_PROXY") != "" || os.Getenv("http_proxy") != "" { g.Skip("HTTP_PROXY is not empty - skipping test ...") } + errPolicy := opmcli.EnsureContainerPolicy() + o.Expect(errPolicy).NotTo(o.HaveOccurred()) opmBaseDir := exutil.FixturePath("testdata", "opm", "53871") opmCLI.ExecCommandPath = opmBaseDir diff --git a/tests-extension/test/qe/specs/olmv0_stress.go b/tests-extension/test/qe/specs/olmv0_stress.go new file mode 100644 index 0000000000..3b7ae52283 --- /dev/null +++ b/tests-extension/test/qe/specs/olmv0_stress.go @@ -0,0 +1,282 @@ +package specs + +import ( + "context" + "fmt" + "math/rand" + "path/filepath" + "strings" + "time" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/util/wait" + e2e "k8s.io/kubernetes/test/e2e/framework" + + exutil "github.com/openshift/operator-framework-olm/tests-extension/test/qe/util" + olmv0util "github.com/openshift/operator-framework-olm/tests-extension/test/qe/util/olmv0util" +) + +var _ = g.Describe("[sig-operator][Jira:OLM] OLM v0 for stress", func() { + defer g.GinkgoRecover() + + var ( + oc = exutil.NewCLIWithoutNamespace("default") + dr = make(olmv0util.DescriberResrouce) + ) + + g.BeforeEach(func() { + exutil.SkipMicroshift(oc) + oc.SetupProject() + exutil.SkipNoOLMCore(oc) + itName := g.CurrentSpecReport().FullText() + dr.AddIr(itName) + }) + + g.It("PolarionID:80299-[OTP][Skipped:Disconnected][OlmStress]create mass operator to see if they all are installed successfully with different ns [Slow][Timeout:180m]", g.Label("StressTest"), g.Label("NonHyperShiftHOST"), func() { + var ( + itName = g.CurrentSpecReport().FullText() + buildPruningBaseDir = exutil.FixturePath("testdata", "olm") + ogSingleTemplate = filepath.Join(buildPruningBaseDir, "operatorgroup.yaml") + subTemplate = filepath.Join(buildPruningBaseDir, "olm-subscription.yaml") + caseID = "80299" + ns = "openshift-operator-lifecycle-manager" + catalogLabel = "app=catalog-operator" + olmLabel = "app=olm-operator" + + og = olmv0util.OperatorGroupDescription{ + Name: "og-singlenamespace", + Namespace: "", + Template: ogSingleTemplate, + } + sub = olmv0util.SubscriptionDescription{ + SubName: "local-storage-operator", + Namespace: "", + Channel: "stable", + IpApproval: "Automatic", + OperatorPackage: "local-storage-operator", + CatalogSourceName: "redhat-operators", + CatalogSourceNamespace: "openshift-marketplace", + StartingCSV: "", + CurrentCSV: "", + InstalledCSV: "", + Template: subTemplate, + SingleNamespace: true, + } + ) + catsrcName := olmv0util.GetCatsrc(oc, "redhat-operators", "local-storage-operator") + if len(catsrcName) == 0 { + g.Skip("there is no package local-storage-operator") + } + e2e.Logf("the catsrc is %v", catsrcName) + sub.CatalogSourceName = catsrcName + startTime := time.Now().UTC() + e2e.Logf("Start time: %s", startTime.Format(time.RFC3339)) + + for i := 0; i < 45; i++ { + e2e.Logf("=================it is round %v=================", i) + func() { + seed := time.Now().UnixNano() + r := rand.New(rand.NewSource(seed)) + randomNum := r.Intn(5) + 5 + e2e.Logf("=================round %v has %v namespaces =================", i, randomNum) + namespaces := []string{} + for j := 0; j < randomNum; j++ { + namespaces = append(namespaces, "olm-stress-"+exutil.GetRandomString()) + } + + for _, nsName := range namespaces { + g.By(fmt.Sprintf("create ns %s, and then install og and sub", nsName)) + err := oc.AsAdmin().WithoutNamespace().Run("create").Args("ns", nsName).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + defer func(ns string) { + _ = oc.AsAdmin().WithoutNamespace().Run("delete").Args("ns", ns, "--force", "--grace-period=0", "--wait=false").Execute() + }(nsName) + og.Namespace = nsName + og.Create(oc, itName, dr) + sub.Namespace = nsName + sub.CreateWithoutCheckNoPrint(oc, itName, dr) + } + for _, nsName := range namespaces { + g.By(fmt.Sprintf("find the installed csv ns %s", nsName)) + sub.Namespace = nsName + sub.FindInstalledCSV(oc, itName, dr) + } + for _, nsName := range namespaces { + g.By(fmt.Sprintf("check the installed csv is ok in %s", nsName)) + sub.Namespace = nsName + + errWait := wait.PollUntilContextTimeout(context.TODO(), 3*time.Second, 150*time.Second, false, func(ctx context.Context) (bool, error) { + phase, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("csv", sub.InstalledCSV, "-n", sub.Namespace, "-o=jsonpath={.status.phase}").Output() + if err != nil { + if strings.Contains(phase, "NotFound") || strings.Contains(phase, "No resources found") { + e2e.Logf("the existing csv does not exist, and try to get new csv") + sub.FindInstalledCSV(oc, itName, dr) + } else { + e2e.Logf("the error: %v, and try next", err) + } + return false, nil + } + + e2e.Logf("---> we expect value: %s, in returned value: %s", "Succeeded+2+InstallSucceeded", phase) + if strings.Compare(phase, "Succeeded") == 0 || strings.Compare(phase, "InstallSucceeded") == 0 { + e2e.Logf("the output %s matches one of the content %s, expected", phase, "Succeeded+2+InstallSucceeded") + return true, nil + } + e2e.Logf("the output %s does not match one of the content %s, unexpected", phase, "Succeeded+2+InstallSucceeded") + return false, nil + }) + if errWait != nil { + olmv0util.GetResource(oc, true, true, "pod", "-n", "openshift-marketplace") + olmv0util.GetResource(oc, true, true, "operatorgroup", "-n", sub.Namespace, "-o", "yaml") + olmv0util.GetResource(oc, true, true, "subscription", "-n", sub.Namespace, "-o", "yaml") + olmv0util.GetResource(oc, true, true, "installplan", "-n", sub.Namespace) + olmv0util.GetResource(oc, true, true, "csv", "-n", sub.Namespace) + olmv0util.GetResource(oc, true, true, "pods", "-n", sub.Namespace) + } + exutil.AssertWaitPollNoErr(errWait, fmt.Sprintf("expected content %s not found by %v", "Succeeded+2+InstallSucceeded", strings.Join([]string{"csv", sub.InstalledCSV, "-n", sub.Namespace, "-o=jsonpath={.status.phase}"}, " "))) + + } + + }() + } + endTime := time.Now().UTC() + e2e.Logf("End time: %v", endTime.Format(time.RFC3339)) + + duration := endTime.Sub(startTime) + minutes := int(duration.Minutes()) + if minutes < 1 { + minutes = 1 + } + + podName, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", catalogLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", ns).Output() + if err == nil { + if !exutil.WriteErrToArtifactDir(oc, ns, podName, "error", "Unhandled|Reconciler error|try again|level=info|warning", caseID, minutes) { + e2e.Logf("no error log into artifact for pod %s in %s", podName, ns) + } + } + podName, err = oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", olmLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", ns).Output() + if err == nil { + if !exutil.WriteErrToArtifactDir(oc, ns, podName, "error", "Unhandled|Reconciler error|level=info|warning|update Operator status|no errors|warning", caseID, minutes) { + e2e.Logf("no error log into artifact for pod %s in %s", podName, ns) + } + } + + if !exutil.IsPodReady(oc, ns, catalogLabel) { + olmv0util.GetResource(oc, true, true, "pod", "-n", ns, "-l", catalogLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", catalogLabel), "the pod with app=catalog-operator is not correct") + } + if !exutil.IsPodReady(oc, ns, olmLabel) { + olmv0util.GetResource(oc, true, true, "pod", "-n", ns, "-l", olmLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", olmLabel), "the pod with app=olm-operator is not correct") + } + }) + + g.It("PolarionID:80413-[OTP][Skipped:Disconnected][OlmStress]install operator repeatedly serially with same ns [Slow][Timeout:180m]", g.Label("StressTest"), g.Label("NonHyperShiftHOST"), func() { + var ( + itName = g.CurrentSpecReport().FullText() + buildPruningBaseDir = exutil.FixturePath("testdata", "olm") + ogSingleTemplate = filepath.Join(buildPruningBaseDir, "operatorgroup.yaml") + catsrcImageTemplate = filepath.Join(buildPruningBaseDir, "catalogsource-image.yaml") + subTemplate = filepath.Join(buildPruningBaseDir, "olm-subscription.yaml") + caseID = "80413" + ns = "openshift-must-gather-operator" + nsOlm = "openshift-operator-lifecycle-manager" + catalogLabel = "app=catalog-operator" + olmLabel = "app=olm-operator" + + catsrc = olmv0util.CatalogSourceDescription{ + Name: "catsrc-80413", + Namespace: ns, + DisplayName: "Test 80413", + Publisher: "OLM QE", + SourceType: "grpc", + Address: "quay.io/app-sre/must-gather-operator-registry@sha256:0a0610e37a016fb4eed1b000308d840795838c2306f305a151c64cf3b4fd6bb4", + Template: catsrcImageTemplate, + } + og = olmv0util.OperatorGroupDescription{ + Name: "og", + Namespace: ns, + Template: ogSingleTemplate, + } + sub = olmv0util.SubscriptionDescription{ + SubName: "must-gather-operator", + Namespace: ns, + Channel: "stable", + IpApproval: "Automatic", + OperatorPackage: "must-gather-operator", + CatalogSourceName: "catsrc-80413", + CatalogSourceNamespace: ns, + StartingCSV: "", + CurrentCSV: "", + InstalledCSV: "", + Template: subTemplate, + SingleNamespace: true, + } + ) + + startTime := time.Now().UTC() + e2e.Logf("Start time: %s", startTime.Format(time.RFC3339)) + + for i := 0; i < 150; i++ { + e2e.Logf("=================it is round %v=================", i) + func() { + g.By(fmt.Sprintf("create ns %s", ns)) + err := oc.AsAdmin().WithoutNamespace().Run("create").Args("ns", ns).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + defer func() { + _ = oc.AsAdmin().WithoutNamespace().Run("delete").Args("ns", ns).Execute() + }() + + g.By(fmt.Sprintf("install catsrc in %s", ns)) + defer catsrc.Delete(itName, dr) + catsrc.Create(oc, itName, dr) + + g.By(fmt.Sprintf("install og in %s", ns)) + og.Create(oc, itName, dr) + + g.By(fmt.Sprintf("install sub in %s", ns)) + sub.CreateWithoutCheckNoPrint(oc, itName, dr) + + g.By(fmt.Sprintf("find the installed csv ns %s", ns)) + sub.FindInstalledCSV(oc, itName, dr) + + g.By(fmt.Sprintf("check the installed csv is ok in %s", ns)) + olmv0util.NewCheck("expect", true, true, true, "Succeeded+2+InstallSucceeded", true, []string{"csv", sub.InstalledCSV, "-n", sub.Namespace, "-o=jsonpath={.status.phase}"}).Check(oc) + + }() + } + + endTime := time.Now().UTC() + e2e.Logf("End time: %v", endTime.Format(time.RFC3339)) + + duration := endTime.Sub(startTime) + minutes := int(duration.Minutes()) + if minutes < 1 { + minutes = 1 + } + + podName, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", catalogLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", nsOlm).Output() + if err == nil { + if !exutil.WriteErrToArtifactDir(oc, nsOlm, podName, "error", "Unhandled|Reconciler error|try again|level=info|warning", caseID, minutes) { + e2e.Logf("no error log into artifact for pod %s in %s", podName, nsOlm) + } + } + podName, err = oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", olmLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", nsOlm).Output() + if err == nil { + if !exutil.WriteErrToArtifactDir(oc, nsOlm, podName, "error", "Unhandled|Reconciler error|level=info|warning|update Operator status|no errors|warning", caseID, minutes) { + e2e.Logf("no error log into artifact for pod %s in %s", podName, nsOlm) + } + } + + if !exutil.IsPodReady(oc, nsOlm, catalogLabel) { + olmv0util.GetResource(oc, true, true, "pod", "-n", nsOlm, "-l", catalogLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", catalogLabel), "the pod with app=catalog-operator is not correct") + } + if !exutil.IsPodReady(oc, nsOlm, olmLabel) { + olmv0util.GetResource(oc, true, true, "pod", "-n", nsOlm, "-l", olmLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", olmLabel), "the pod with app=olm-operator is not correct") + } + }) + +}) diff --git a/tests-extension/test/qe/util/client.go b/tests-extension/test/qe/util/client.go index 3a3b4a6ffa..3204b68f07 100644 --- a/tests-extension/test/qe/util/client.go +++ b/tests-extension/test/qe/util/client.go @@ -720,63 +720,153 @@ type ExitError struct { *exec.ExitError } +// isTransientKubeconfigError checks if the error is a transient kubeconfig-related error +// that can be retried (typically caused by race conditions in parallel test execution) +func isTransientKubeconfigError(output string) bool { + transientErrors := []string{ + "context was not found", + "Error in configuration", + "cluster has no server defined", + "error loading config file", + "unable to read client-cert", + "unable to read client-key", + "unable to read certificate-authority", + } + for _, errMsg := range transientErrors { + if strings.Contains(output, errMsg) { + return true + } + } + return false +} + // Output executes the command and returns stdout/stderr combined into one string +// Implements retry logic for transient kubeconfig errors caused by parallel test execution func (c *CLI) Output() (string, error) { if c.verbose { fmt.Printf("DEBUG: oc %s\n", c.printCmd()) } - cmd := exec.Command(c.execPath, c.finalArgs...) // nolint:gosec // G204: execPath is hardcoded to "oc" and finalArgs come from test code - cmd.Stdin = c.stdin - if c.showInfo { - e2e.Logf("Running '%s %s'", c.execPath, strings.Join(c.finalArgs, " ")) - } - out, err := cmd.CombinedOutput() - trimmed := strings.TrimSpace(string(out)) - if err == nil { - c.stdout = bytes.NewBuffer(out) - return trimmed, nil + + // Retry configuration for handling race conditions in parallel tests + const maxRetries = 3 + const retryDelay = 500 * time.Millisecond + + var lastErr error + var lastOutput string + + for attempt := 1; attempt <= maxRetries; attempt++ { + cmd := exec.Command(c.execPath, c.finalArgs...) // nolint:gosec // G204: execPath is hardcoded to "oc" and finalArgs come from test code + cmd.Stdin = c.stdin + if c.showInfo && attempt == 1 { + e2e.Logf("Running '%s %s'", c.execPath, strings.Join(c.finalArgs, " ")) + } + out, err := cmd.CombinedOutput() + trimmed := strings.TrimSpace(string(out)) + + if err == nil { + c.stdout = bytes.NewBuffer(out) + if attempt > 1 { + e2e.Logf("Command succeeded on retry attempt %d/%d", attempt, maxRetries) + } + return trimmed, nil + } + + lastOutput = trimmed + lastErr = err + + // Check if this is a transient kubeconfig error that we should retry + if isTransientKubeconfigError(trimmed) { + if attempt < maxRetries { + e2e.Logf("Transient kubeconfig error detected (attempt %d/%d), retrying after %v: %s", + attempt, maxRetries, retryDelay, trimmed) + time.Sleep(retryDelay) + continue + } + e2e.Logf("Transient kubeconfig error persisted after %d attempts: %s", maxRetries, trimmed) + } + + // Not a transient error or max retries reached, break and handle normally + break } + + // Handle the final error var err1 *exec.ExitError - if errors.As(err, &err1) { - e2e.Logf("Error running %v:\n%s", cmd, trimmed) - return trimmed, &ExitError{ExitError: err1, Cmd: c.execPath + " " + strings.Join(c.finalArgs, " "), StdErr: trimmed} + if errors.As(lastErr, &err1) { + e2e.Logf("Error running %v:\n%s", c.execPath+" "+strings.Join(c.finalArgs, " "), lastOutput) + return lastOutput, &ExitError{ExitError: err1, Cmd: c.execPath + " " + strings.Join(c.finalArgs, " "), StdErr: lastOutput} } - FatalErr(fmt.Errorf("unable to execute %q: %v", c.execPath, err)) + FatalErr(fmt.Errorf("unable to execute %q: %v", c.execPath, lastErr)) return "", nil } // Outputs executes the command and returns the stdout/stderr output as separate strings +// Implements retry logic for transient kubeconfig errors caused by parallel test execution func (c *CLI) Outputs() (string, string, error) { if c.verbose { fmt.Printf("DEBUG: oc %s\n", c.printCmd()) } - cmd := exec.Command(c.execPath, c.finalArgs...) // nolint:gosec // G204: execPath is hardcoded to "oc" and finalArgs come from test code - cmd.Stdin = c.stdin - e2e.Logf("showInfo is %v", c.showInfo) - if c.showInfo { - e2e.Logf("Running '%s %s'", c.execPath, strings.Join(c.finalArgs, " ")) - } - //out, err := cmd.CombinedOutput() - var stdErrBuff, stdOutBuff bytes.Buffer - cmd.Stdout = &stdOutBuff - cmd.Stderr = &stdErrBuff - err := cmd.Run() - stdOutBytes := stdOutBuff.Bytes() - stdErrBytes := stdErrBuff.Bytes() - stdOut := strings.TrimSpace(string(stdOutBytes)) - stdErr := strings.TrimSpace(string(stdErrBytes)) - if err == nil { - c.stdout = bytes.NewBuffer(stdOutBytes) - c.stderr = bytes.NewBuffer(stdErrBytes) - return stdOut, stdErr, nil + // Retry configuration for handling race conditions in parallel tests + const maxRetries = 3 + const retryDelay = 500 * time.Millisecond + + var lastErr error + var lastStdOut, lastStdErr string + + for attempt := 1; attempt <= maxRetries; attempt++ { + cmd := exec.Command(c.execPath, c.finalArgs...) // nolint:gosec // G204: execPath is hardcoded to "oc" and finalArgs come from test code + cmd.Stdin = c.stdin + if c.showInfo && attempt == 1 { + e2e.Logf("showInfo is %v", c.showInfo) + e2e.Logf("Running '%s %s'", c.execPath, strings.Join(c.finalArgs, " ")) + } + + var stdErrBuff, stdOutBuff bytes.Buffer + cmd.Stdout = &stdOutBuff + cmd.Stderr = &stdErrBuff + err := cmd.Run() + + stdOutBytes := stdOutBuff.Bytes() + stdErrBytes := stdErrBuff.Bytes() + stdOut := strings.TrimSpace(string(stdOutBytes)) + stdErr := strings.TrimSpace(string(stdErrBytes)) + + if err == nil { + c.stdout = bytes.NewBuffer(stdOutBytes) + c.stderr = bytes.NewBuffer(stdErrBytes) + if attempt > 1 { + e2e.Logf("Command succeeded on retry attempt %d/%d", attempt, maxRetries) + } + return stdOut, stdErr, nil + } + + lastStdOut = stdOut + lastStdErr = stdErr + lastErr = err + + // Check if this is a transient kubeconfig error in either stdout or stderr + combinedOutput := stdOut + stdErr + if isTransientKubeconfigError(combinedOutput) { + if attempt < maxRetries { + e2e.Logf("Transient kubeconfig error detected (attempt %d/%d), retrying after %v: %s", + attempt, maxRetries, retryDelay, combinedOutput) + time.Sleep(retryDelay) + continue + } + e2e.Logf("Transient kubeconfig error persisted after %d attempts: %s", maxRetries, combinedOutput) + } + + // Not a transient error or max retries reached, break and handle normally + break } + + // Handle the final error var err1 *exec.ExitError - if errors.As(err, &err1) { - e2e.Logf("Error running %v:\nStdOut>\n%s\nStdErr>\n%s\n", cmd, stdOut, stdErr) - return stdOut, stdErr, &ExitError{ExitError: err1, Cmd: c.execPath + " " + strings.Join(c.finalArgs, " "), StdErr: stdErr} + if errors.As(lastErr, &err1) { + e2e.Logf("Error running %v:\nStdOut>\n%s\nStdErr>\n%s\n", c.execPath+" "+strings.Join(c.finalArgs, " "), lastStdOut, lastStdErr) + return lastStdOut, lastStdErr, &ExitError{ExitError: err1, Cmd: c.execPath + " " + strings.Join(c.finalArgs, " "), StdErr: lastStdErr} } - FatalErr(fmt.Errorf("unable to execute %q: %v", c.execPath, err)) + FatalErr(fmt.Errorf("unable to execute %q: %v", c.execPath, lastErr)) return "", "", nil } diff --git a/tests-extension/test/qe/util/nodes.go b/tests-extension/test/qe/util/nodes.go index 0ee466eab4..0353a887e7 100644 --- a/tests-extension/test/qe/util/nodes.go +++ b/tests-extension/test/qe/util/nodes.go @@ -737,11 +737,23 @@ func GetNodeListByLabel(oc *CLI, labelKey string) []string { // - bool: true if default node selector is configured, false otherwise func IsDefaultNodeSelectorEnabled(oc *CLI) bool { defaultNodeSelector, getNodeSelectorErr := oc.AsAdmin().WithoutNamespace().Run("get").Args("scheduler", "cluster", "-o=jsonpath={.spec.defaultNodeSelector}").Output() - if getNodeSelectorErr != nil && strings.Contains(defaultNodeSelector, `the server doesn't have a resource type`) { - e2e.Logf("WARNING: The scheduler API is not supported on the test cluster") - return false + if getNodeSelectorErr != nil { + // Check if the scheduler API is not supported (expected on some cluster types) + if strings.Contains(defaultNodeSelector, `the server doesn't have a resource type`) { + e2e.Logf("WARNING: The scheduler API is not supported on the test cluster") + return false + } + // Handle transient kubeconfig errors (race conditions during parallel test execution) + // These errors can occur when temp kubeconfig files have stale context references + if strings.Contains(defaultNodeSelector, "context was not found") || + strings.Contains(defaultNodeSelector, "Error in configuration") || + strings.Contains(defaultNodeSelector, "cluster has no server defined") { + e2e.Logf("WARNING: Transient kubeconfig error detected (likely race condition), assuming no default node selector: %v", getNodeSelectorErr) + return false + } + // For other unexpected errors, fail the test with detailed information + o.Expect(getNodeSelectorErr).NotTo(o.HaveOccurred(), "Fail to get cluster scheduler defaultNodeSelector got error: %v\n", getNodeSelectorErr) } - o.Expect(getNodeSelectorErr).NotTo(o.HaveOccurred(), "Fail to get cluster scheduler defaultNodeSelector got error: %v\n", getNodeSelectorErr) return !strings.EqualFold(defaultNodeSelector, "") } diff --git a/tests-extension/test/qe/util/olmv0util/catalog_source.go b/tests-extension/test/qe/util/olmv0util/catalog_source.go index 38a9c9ff6f..0d910103ab 100644 --- a/tests-extension/test/qe/util/olmv0util/catalog_source.go +++ b/tests-extension/test/qe/util/olmv0util/catalog_source.go @@ -93,9 +93,20 @@ func (catsrc *CatalogSourceDescription) SetSCCRestricted(oc *exutil.CLI) { } else { // Retrieve PSA enforcement level from cluster configuration output, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("configmaps", "-n", "openshift-kube-apiserver", "config", `-o=jsonpath={.data.config\.yaml}`).Output() - o.Expect(err).NotTo(o.HaveOccurred()) - psa = gjson.Get(output, "admission.pluginConfig.PodSecurity.configuration.defaults.enforce").String() - e2e.Logf("pod-security.kubernetes.io/enforce is %s", string(psa)) + if err != nil { + // Check if this is a Forbidden error (permission denied) + if strings.Contains(err.Error(), "Forbidden") { + // In some CI environments, even with AsAdmin(), the service account may not have permission + // to access openshift-kube-apiserver namespace. Use default value "restricted" in such cases. + e2e.Logf("cannot get default PSA setting from kube-apiserver config (permission denied), use default value restricted: %v", err) + } else { + // For other errors (e.g., network issues, API server down), fail the test + o.Expect(err).NotTo(o.HaveOccurred()) + } + } else { + psa = gjson.Get(output, "admission.pluginConfig.PodSecurity.configuration.defaults.enforce").String() + e2e.Logf("pod-security.kubernetes.io/enforce is %s", string(psa)) + } } // Apply restricted security context if PSA enforcement requires it if strings.Contains(string(psa), "restricted") { diff --git a/tests-extension/test/qe/util/opmcli/opm_bin.go b/tests-extension/test/qe/util/opmcli/opm_bin.go index cbff6f1604..94c82f9fc1 100644 --- a/tests-extension/test/qe/util/opmcli/opm_bin.go +++ b/tests-extension/test/qe/util/opmcli/opm_bin.go @@ -299,14 +299,24 @@ func downloadFile(url, filepath string) error { func setupOPMEnv(opmDir string) error { currentPath := os.Getenv("PATH") - if !strings.Contains(currentPath, opmDir) { - newPath := opmDir + ":" + currentPath - err := os.Setenv("PATH", newPath) - if err != nil { - return err + + // Split PATH and check if opmDir already exists as an exact entry + // This prevents false positives like "/tmp/google-cloud-sdk/bin" matching "/tmp" + paths := strings.Split(currentPath, ":") + for _, p := range paths { + if p == opmDir { + e2e.Logf("%s already in PATH", opmDir) + return nil } - e2e.Logf("Added %s to PATH: %s", opmDir, newPath) } + + // Not in PATH, add it + newPath := opmDir + ":" + currentPath + err := os.Setenv("PATH", newPath) + if err != nil { + return err + } + e2e.Logf("Added %s to PATH: %s", opmDir, newPath) return nil } diff --git a/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-endpoint.yml b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-endpoint.yml new file mode 100644 index 0000000000..b43d6e0021 --- /dev/null +++ b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-endpoint.yml @@ -0,0 +1,9 @@ +- endpoint: {{.PROMETHEUS_URL}} + token: {{.PROMETHEUS_TOKEN}} + step: 10s + skipTLSVerify: true + metrics: + - metrics-profiles/metrics-aggregated.yml + indexer: + type: local + metricsDirectory: collected-metrics-{{.UUID}} diff --git a/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-profiles/metrics-aggregated.yml b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-profiles/metrics-aggregated.yml new file mode 100644 index 0000000000..2827aa04cf --- /dev/null +++ b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/metrics-profiles/metrics-aggregated.yml @@ -0,0 +1,4 @@ +# Containers & pod metrics + +- query: (sum(irate(container_cpu_usage_seconds_total{container="registry-server",namespace="openshift-marketplace"}[2m]) * 100) by (container, pod)) > 0 + metricName: containerCPU-CatSrc diff --git a/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/pkg-ins.yml b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/pkg-ins.yml new file mode 100644 index 0000000000..316190d91d --- /dev/null +++ b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/pkg-ins.yml @@ -0,0 +1,72 @@ +--- +global: + gc: {{.GC}} + gcMetrics: {{.GC_METRICS}} + measurements: + - name: podLatency + + +jobs: + - name: {{.OPERATION}} # will be parameter + jobType: create + jobIterations: {{.JOB_ITERATIONS}} + namespace: {{.OPERATION}} + namespacedIterations: {{.NAMESPACED_ITERATIONS}} + iterationsPerNamespace: {{.ITERATIONS_PER_NAMESPACE}} + cleanup: true + podWait: true + waitWhenFinished: true + maxWaitTimeout: {{.MAX_WAIT_TIMEOUT}} + jobIterationDelay: {{.JOB_ITERATION_DELAY}} + jobPause: {{.JOB_PAUSE}} + qps: {{.QPS}} + burst: {{.BURST}} + executionMode: parallel + verifyObjects: true + errorOnVerify: true + skipIndexing: false + preLoadImages: true + preLoadPeriod: 15s + churn: {{.CHURN}} + churnCycles: {{.CHURN_CYCLES}} + churnDuration: {{.CHURN_DURATION}} + churnPercent: {{.CHURN_PERCENT}} + churnDelay: {{.CHURN_DELAY}} + churnDeletionStrategy: {{.CHURN_DELETION_STRATEGY}} + defaultMissingKeysWithZero: false + namespaceLabels: + security.openshift.io/scc.podSecurityLabelSync: false + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged + objects: + + - objectTemplate: templates/og.yml + replicas: 1 + inputVars: + prefixNamespace: {{.OPERATION}} + + + - objectTemplate: templates/sub.yml + replicas: 1 + inputVars: + pkgName: {{.PKG_NAME}} + channelName: {{.CHANNEL_NAME}} + catsrcName: {{.CATALOGSOURCE_NAME}} + catsrcNamespace: {{.CATALOGSOURCE_NAMESPACE}} + # will add inputVars to make operator as parameters + waitOptions: + # only take one of customStatusPaths and kind, and both can use labelSelector + # 1, take customStatusPaths to check sub's status + customStatusPaths: + - key: ".state" + value: "AtLatestKnown" + # 2, take kind to check pod's status with label + # labelSelector: {app: migration} + # labelSelector: {control-plane: controller-manager} + # kind: Pod + # 3, take other field of sub and it does not work yet + # labelSelector: {kube-burner-job: operator-install-delete} + # customStatusPaths: + # - key: ".installedCSV" + # value: "oadp-operator.v1.4.2" diff --git a/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/og.yml b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/og.yml new file mode 100644 index 0000000000..d9d9ba3d1e --- /dev/null +++ b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/og.yml @@ -0,0 +1,7 @@ +kind: OperatorGroup +apiVersion: operators.coreos.com/v1 +metadata: + name: og +spec: + targetNamespaces: + - "{{.prefixNamespace}}-{{.Iteration}}" diff --git a/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/sub.yml b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/sub.yml new file mode 100644 index 0000000000..4761d85fe7 --- /dev/null +++ b/tests-extension/test/qe/util/stress/manifests/config/pkg-ins/templates/sub.yml @@ -0,0 +1,10 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: {{.pkgName}} +spec: + channel: {{.channelName}} + installPlanApproval: automatic + name: {{.pkgName}} + source: {{.catsrcName}} + sourceNamespace: {{.catsrcNamespace}} diff --git a/tests-extension/test/qe/util/stress/util/ma.py b/tests-extension/test/qe/util/stress/util/ma.py new file mode 100755 index 0000000000..976a9fed83 --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from ma.cli.__main__ import main + +if __name__ == "__main__": + main() diff --git a/tests-extension/test/qe/util/stress/util/ma/__init__.py b/tests-extension/test/qe/util/stress/util/ma/__init__.py new file mode 100644 index 0000000000..1898849ec8 --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/__init__.py @@ -0,0 +1,10 @@ +import os +import sys + +if sys.version_info < (3, 9): + sys.exit("Sorry, Python < 3.9 is no longer supported.") + +sys.dont_write_bytecode = True + +def version(): + return "0.1" diff --git a/tests-extension/test/qe/util/stress/util/ma/cli/__init__.py b/tests-extension/test/qe/util/stress/util/ma/cli/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests-extension/test/qe/util/stress/util/ma/cli/__main__.py b/tests-extension/test/qe/util/stress/util/ma/cli/__main__.py new file mode 100644 index 0000000000..f201dfbedb --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/cli/__main__.py @@ -0,0 +1,8 @@ +from ma.cli.cmd_group import cli + + +def main(): + try: + cli(obj={}) + except Exception as e: + raise SystemExit(e) diff --git a/tests-extension/test/qe/util/stress/util/ma/cli/cmd_check_ccpu.py b/tests-extension/test/qe/util/stress/util/ma/cli/cmd_check_ccpu.py new file mode 100644 index 0000000000..c18a63179d --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/cli/cmd_check_ccpu.py @@ -0,0 +1,98 @@ +import click +import logging +from ma.helper.containercpu import ContainerCPU + +logger = logging.getLogger(__name__) + + +@click.command() +@click.pass_context +@click.option( + "-i", + "--metrics_result_file", + type=click.Path( + exists=True, + file_okay=True, + dir_okay=False, + readable=True, + resolve_path=True + ), + required=True, + help="the result file of the metrics" +) +@click.option( + "-o", + "--output_dir", + default="./", + type=click.Path( + exists=True, + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True + ), + help="the directory of generated metrics figure" +) +@click.option( + "--zscore_threshold", + type=int, + default=3, + required=False, + help="the threshold for z-score" +) +@click.option( + "--window_size", + type=int, + default=18, + required=False, + help="the size of moving window" +) +@click.option( + "--window_threshold", + type=int, + default=3, + required=False, + help="the threshold of moving window" +) +@click.option( + "--watermark", + type=int, + default=20, + required=False, + help="the abnoarm cpu usage calucalted by 100" +) +@click.option( + "--anomalies_threshold", + type=int, + default=2, + required=False, + help="the anomalies threshold to determine if the checking fails or not" +) +def check_ccpu(ctx, + metrics_result_file, + output_dir, + zscore_threshold, + window_size, + window_threshold, + watermark, + anomalies_threshold): + """ + Check if cpu usage is expected + """ + try: + ccpu = ContainerCPU(metrics_result_file, + output_dir, + zscore_threshold, + window_size, + window_threshold, + watermark, + anomalies_threshold) + ccpu.handle() + ccpu.ok_or_not() + # print(ccpu.get_preliminary_screening()) + # print(ccpu.get_refinement_screening()) + # print(ccpu.get_final_screening()) + except Exception as e: + logger.exception("checking container cpu failing") + raise + diff --git a/tests-extension/test/qe/util/stress/util/ma/cli/cmd_group.py b/tests-extension/test/qe/util/stress/util/ma/cli/cmd_group.py new file mode 100644 index 0000000000..94bfc575ea --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/cli/cmd_group.py @@ -0,0 +1,47 @@ +import click +import logging +import sys +from ma.cli.cmd_check_ccpu import check_ccpu +import ma.helper.util as util +from ma import version +from ma.helper.const import CONTEXT_SETTINGS + +logger = logging.getLogger(__name__) + + +def print_version(ctx, param, value): + if not value or ctx.resilient_parsing: + return + click.echo("ma v{}".format(version())) + click.echo("python v{}".format(sys.version)) + ctx.exit() + + +@click.group(context_settings=CONTEXT_SETTINGS) +@click.pass_context +@click.option( + "-V", + "--version", + is_flag=True, + callback=print_version, + expose_value=False, + is_eager=True, +) +@click.option( + "-v", + "--debug", + help="enable debug logging", + is_flag=True, + default=False) +def cli(ctx, debug): + util.init_logging(logging.DEBUG if debug else logging.INFO) + is_help = False + for k in CONTEXT_SETTINGS["help_option_names"]: + if k in sys.argv: + is_help = True + break + if not is_help: + logger.info("start to handle sub command") + pass + +cli.add_command(check_ccpu) diff --git a/tests-extension/test/qe/util/stress/util/ma/helper/__init__.py b/tests-extension/test/qe/util/stress/util/ma/helper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests-extension/test/qe/util/stress/util/ma/helper/algo.py b/tests-extension/test/qe/util/stress/util/ma/helper/algo.py new file mode 100644 index 0000000000..13985f83ae --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/helper/algo.py @@ -0,0 +1,44 @@ +import logging +import numpy as np + +logger = logging.getLogger(__name__) + +def z_score(timestamps, values, threshold): + # (default threshold = 3) + preliminary_anomalies = [] + mean = np.mean(values) + std = np.std(values) + logger.info(f"mean: {mean}, std: {std}") + for i, value in enumerate(values): + if std == 0: + preliminary_anomalies.append((timestamps[i], value)) + continue + if value > mean + threshold * std: + preliminary_anomalies.append((timestamps[i], value)) + return preliminary_anomalies + + +def moving_window_statistics (preliminary_anomalies, timestamps, values, window_size, window_threshold): + refined_anomalies = [] + # Refinement with moving window (default window_size = 30 points, default threshold = 3) + logger.info(f"window_size: {window_size}, window_threshold: {window_threshold}") + for ts, value in preliminary_anomalies: + index = timestamps.index(ts) + if index < window_size: + continue + window = values[index - window_size : index] + window_mean = np.mean(window) + window_std = np.std(window) + if window_std == 0: + refined_anomalies.append((ts, value)) + continue + if value > window_mean + window_threshold * window_std: + refined_anomalies.append((ts, value)) + return refined_anomalies + +def watermark(refined_anomalies, watermark): + anomalies = [] + for ts, value in refined_anomalies: + if value > watermark: + anomalies.append((ts, value)) + return anomalies diff --git a/tests-extension/test/qe/util/stress/util/ma/helper/const.py b/tests-extension/test/qe/util/stress/util/ma/helper/const.py new file mode 100644 index 0000000000..f220707d7a --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/helper/const.py @@ -0,0 +1,6 @@ +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + +# draw figure +FIGURE_WIDTH = 40 +FIGURE_HEIGHT = 10 + diff --git a/tests-extension/test/qe/util/stress/util/ma/helper/containercpu.py b/tests-extension/test/qe/util/stress/util/ma/helper/containercpu.py new file mode 100644 index 0000000000..08c454a1f6 --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/helper/containercpu.py @@ -0,0 +1,173 @@ +import json +import os +import logging +from datetime import datetime +from pathlib import Path +import ma.helper.util as util +import ma.helper.algo as algo +from ma.helper.exceptions import ContainerCPUException + +logger = logging.getLogger(__name__) + + +class ContainerCPU: + """ + Container CPU object is used to check if the Container CPU usage is expected. + + """ + + def __init__(self, metrics_result_file, output_dir, zscore_threshold, window_size, window_threshold, watermark, anomalies_threshold): + self.mrf = metrics_result_file + self.odir = output_dir + self.zscore_threshold = zscore_threshold + self.window_size = window_size + self.window_threshold = window_threshold + self.watermark = watermark + self.anomalies_threshold = anomalies_threshold + self.preliminary_anomalies = [] + self.refined_anomalies = [] + self.final_anomalies = [] + + try: + self.base_name = os.path.basename(self.mrf) + self.base_name_wo_ext, self.base_name_ext = os.path.splitext(self.base_name) + except BaseException as re: + raise ContainerCPUException("parse the file path failed") from re + + try: + with open(self.mrf) as f: + self.data = json.load(f) + self.values = [d['value'] for d in self.data] + self.timestamps = [d['timestamp'] for d in self.data] + self.timestamps_format = [datetime.strptime(d['timestamp'], "%Y-%m-%dT%H:%M:%S.%fZ") for d in self.data] + + except BaseException as re: + raise ContainerCPUException("load ccpu metric data failed") from re + + def handle(self): + self.tendency_chart() + self.preliminary_screening() + self.refinement_screening() + self.final_screening() + self.convert_preliminary_screening() + self.convert_refinement_screening() + self.convert_final_screening() + + def tendency_chart(self): + """ + it draws the cpu usage figure and save it as pdf + """ + try: + util.draw_figure( + self.timestamps_format, + self.values, + self.odir, + self.base_name_wo_ext + ) + except BaseException as re: + raise ContainerCPUException("drawing pdf failed") from re + + + def preliminary_screening(self): + """ + it uses a simple, fast method (e.g., Z-Score) to flag potential anomalies + """ + try: + self.preliminary_anomalies = algo.z_score( + self.timestamps, + self.values, + self.zscore_threshold) + except BaseException as re: + raise ContainerCPUException("preliminary screening failed") from re + + def refinement_screening(self): + """ + it applyes stricter, context-aware rules (e.g., moving window statistics) to + validate whether the candidates flagged in preliminary_screening are true anomalies + """ + try: + self.refined_anomalies = algo.moving_window_statistics( + self.preliminary_anomalies, + self.timestamps, + self.values, + self.window_size, + self.window_threshold) + except BaseException as re: + raise ContainerCPUException("refinement screening failed") from re + + def final_screening(self): + try: + self.final_anomalies = algo.watermark( + self.refined_anomalies, + self.watermark) + except BaseException as re: + raise ContainerCPUException("final screening failed") from re + + def get_preliminary_screening(self): + """ + it gets the result of a simple, fast method (e.g., Z-Score) to flag potential anomalies + """ + return self.preliminary_anomalies + + def get_refinement_screening(self): + """ + it get results of refinement screening + """ + return self.refined_anomalies + + def get_final_screening(self): + """ + it get results of final screening + """ + return self.final_anomalies + + def convert_preliminary_screening(self): + """ + it converts the result of a simple, fast method (e.g., Z-Score) to flag potential anomalies + """ + try: + util.convert_screening( + self.preliminary_anomalies, + os.path.join(self.odir, self.base_name_wo_ext) + "_prescr.json" + ) + except BaseException as re: + raise ContainerCPUException("convert preliminary screening failed") from re + + + def convert_refinement_screening(self): + """ + it converts results of refinement screening + """ + try: + util.convert_screening( + self.refined_anomalies, + os.path.join(self.odir, self.base_name_wo_ext) + "_refscr.json" + ) + except BaseException as re: + raise ContainerCPUException("convert refinement screening failed") from re + + def convert_final_screening(self): + """ + it converts results of final screening + """ + try: + util.convert_screening( + self.final_anomalies, + os.path.join(self.odir, self.base_name_wo_ext) + "_finscr.json" + ) + except BaseException as re: + raise ContainerCPUException("convert final screening failed") from re + + def ok_or_not(self): + """ + it reports if the result is ok + """ + try: + base_path = os.path.join(self.odir, self.base_name_wo_ext) + result = "pass" + if len(self.final_anomalies) > self.anomalies_threshold: + result = "fail" + output_path = Path(base_path+"_result-"+result) + output_path.write_text(result) + except BaseException as re: + raise ContainerCPUException("check result failed") from re \ No newline at end of file diff --git a/tests-extension/test/qe/util/stress/util/ma/helper/exceptions.py b/tests-extension/test/qe/util/stress/util/ma/helper/exceptions.py new file mode 100644 index 0000000000..17feb001fd --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/helper/exceptions.py @@ -0,0 +1,2 @@ +class ContainerCPUException(BaseException): + """Exception class to raise error in ContainerCPU""" diff --git a/tests-extension/test/qe/util/stress/util/ma/helper/util.py b/tests-extension/test/qe/util/stress/util/ma/helper/util.py new file mode 100644 index 0000000000..29fc4bc560 --- /dev/null +++ b/tests-extension/test/qe/util/stress/util/ma/helper/util.py @@ -0,0 +1,47 @@ +import logging +import json +import os +import matplotlib.pyplot as plt +from ma.helper.const import * +from pathlib import Path + + + +def init_logging(log_level=logging.INFO): + logging.basicConfig( + # format="%(module)s: %(asctime)s: %(levelname)s: %(message)s", + format="%(asctime)s: %(levelname)s: %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=log_level, + ) + + loggers = logging.Logger.manager.loggerDict + for k in loggers.keys(): + if "requests" in k or "urllib3" in k or "gssapi" in k: + logger = logging.getLogger(k) + logger.setLevel(logging.WARNING) + if "requests_kerberos" in k: + logger = logging.getLogger(k) + logger.setLevel(logging.CRITICAL) + +def draw_figure(timestamps_format, values, odir, base_name_wo_ext): + plt.figure(figsize=(FIGURE_WIDTH, FIGURE_HEIGHT)) + plt.plot(timestamps_format, values, marker='o') + # plt.xticks(rotation=45) + plt.ylabel('CPU Usage') + plt.title('CPU Usage Over Time') + plt.tight_layout() + saved_file_wo_ext = os.path.join(odir, base_name_wo_ext) + plt.savefig(saved_file_wo_ext+"_figure.pdf") + plt.close() + +def convert_screening(anomalies, file): + formatted_data = [ + {"timestamp": ts, "value": round(val, 15)} + for ts, val in anomalies + ] + + output_path = Path(file) + output_path.write_text( + json.dumps(formatted_data, indent=2, ensure_ascii=False) + )