diff --git a/openshift/tests-extension/.openshift-tests-extension/openshift_payload_olmv1.json b/openshift/tests-extension/.openshift-tests-extension/openshift_payload_olmv1.json index 06ef8f782..f587662af 100644 --- a/openshift/tests-extension/.openshift-tests-extension/openshift_payload_olmv1.json +++ b/openshift/tests-extension/.openshift-tests-extension/openshift_payload_olmv1.json @@ -565,6 +565,22 @@ "exclude": "topology==\"External\"" } }, + { + "name": "[sig-olmv1][Jira:OLM] OLM v1 for stress PolarionID:81509-[OTP][Skipped:Disconnected][OlmStress]olmv1 create mass operator to see if they all are installed successfully [Slow][Timeout:330m]", + "labels": { + "Extended": {}, + "NonHyperShiftHOST": {}, + "StressTest": {} + }, + "resources": { + "isolation": {} + }, + "source": "openshift:payload:olmv1", + "lifecycle": "blocking", + "environmentSelector": { + "exclude": "topology==\"External\"" + } + }, { "name": "[sig-olmv1][OCPFeatureGate:NewOLM][Skipped:Disconnected] OLMv1 Catalogs should be installed", "labels": {}, diff --git a/openshift/tests-extension/test/qe/README.md b/openshift/tests-extension/test/qe/README.md index 14ef27291..3870b79e8 100644 --- a/openshift/tests-extension/test/qe/README.md +++ b/openshift/tests-extension/test/qe/README.md @@ -244,7 +244,20 @@ All migrated test case code needs the following changes to run in the new test f **Environment Validation for Disconnected-Supporting Migrated Test Cases:** -If your test case supports disconnected environments, you MUST call `ValidateAccessEnvironment` at the beginning of the test: +**When to use `ValidateAccessEnvironment`:** + +1. 
**Test cases that create ClusterCatalog or ClusterExtension**: + - If your test supports disconnected environments (both connected+disconnected, or disconnected-only) + - AND your test creates ClusterCatalog or ClusterExtension resources + - **MUST** call `ValidateAccessEnvironment(oc)` at the beginning of the test + - This applies to both newly created test cases and migrated test cases + +2. **Test cases that do NOT create both ClusterCatalog or ClusterExtension**: + - Optional to use `ValidateAccessEnvironment(oc)` + - Using it won't cause errors, but it's not required + - The validation is primarily for ensuring catalog images can be mirrored + +**Usage example:** ```go g.It("test case supporting disconnected", func() { diff --git a/openshift/tests-extension/test/qe/specs/olmv1_stress.go b/openshift/tests-extension/test/qe/specs/olmv1_stress.go new file mode 100644 index 000000000..ae6baf7df --- /dev/null +++ b/openshift/tests-extension/test/qe/specs/olmv1_stress.go @@ -0,0 +1,146 @@ +package specs + +import ( + "fmt" + "path/filepath" + "time" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + e2e "k8s.io/kubernetes/test/e2e/framework" + + exutil "github.com/openshift/operator-framework-operator-controller/openshift/tests-extension/test/qe/util" + olmv1util "github.com/openshift/operator-framework-operator-controller/openshift/tests-extension/test/qe/util/olmv1util" +) + +var _ = g.Describe("[sig-olmv1][Jira:OLM] OLM v1 for stress", func() { + + defer g.GinkgoRecover() + var ( + oc = exutil.NewCLIWithoutNamespace("default") + ) + + g.BeforeEach(func() { + exutil.SkipMicroshift(oc) + exutil.SkipNoOLMv1Core(oc) + }) + + // author: kuiwang@redhat.com + g.It("PolarionID:81509-[OTP][Skipped:Disconnected][OlmStress]olmv1 create mass operator to see if they all are installed successfully [Slow][Timeout:330m]", g.Label("StressTest"), g.Label("NonHyperShiftHOST"), func() { + var ( + caseID = "81509" + prefixCatalog = "catalog-" + caseID + prefixSa = "sa-" 
+ caseID + prefixCe = "ce-" + caseID + prefixNs = "ns-" + caseID + prefixPackage = "stress-olmv1-c" + prefixImage = "quay.io/olmqe/stress-index:vokv" + nsOc = "openshift-operator-controller" + nsCatalog = "openshift-catalogd" + catalogLabel = "control-plane=catalogd-controller-manager" + ocLabel = "control-plane=operator-controller-controller-manager" + baseDir = exutil.FixturePath("testdata", "olm") + clustercatalogTemplate = filepath.Join(baseDir, "clustercatalog.yaml") + clusterextensionTemplate = filepath.Join(baseDir, "clusterextension.yaml") + saClusterRoleBindingTemplate = filepath.Join(baseDir, "sa-admin.yaml") + ) + + if !olmv1util.IsPodReady(oc, nsCatalog, catalogLabel) { + _, _ = olmv1util.Get(oc, "pod", "-n", nsCatalog, "-l", catalogLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", catalogLabel), "the pod with app=catalog-operator is not correct") + } + if !olmv1util.IsPodReady(oc, nsOc, ocLabel) { + _, _ = olmv1util.Get(oc, "pod", "-n", nsOc, "-l", ocLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", ocLabel), "the pod with app=olm-operator is not correct") + } + + startTime := time.Now().UTC() + e2e.Logf("Start time: %s", startTime.Format(time.RFC3339)) + + // for i := 0; i < 500; i++ { + for i := 900; i < 969; i++ { + // it is not enough with 330m for one case if we run 100 times + e2e.Logf("=================it is round %v=================", i) + ns := fmt.Sprintf("%s-%d", prefixNs, i) + clustercatalog := olmv1util.ClusterCatalogDescription{ + Name: fmt.Sprintf("%s-%d", prefixCatalog, i), + Imageref: fmt.Sprintf("%s%d", prefixImage, i), + Template: clustercatalogTemplate, + } + saCrb := olmv1util.SaCLusterRolebindingDescription{ + Name: fmt.Sprintf("%s-%d", prefixSa, i), + Namespace: ns, + Template: saClusterRoleBindingTemplate, + } + ce := olmv1util.ClusterExtensionDescription{ + Name: fmt.Sprintf("%s-%d", prefixCe, i), + PackageName: fmt.Sprintf("%s%d", 
prefixPackage, i), + Channel: "alpha", + Version: ">=0.0.1", + InstallNamespace: ns, + SaName: fmt.Sprintf("%s-%d", prefixSa, i), + Template: clusterextensionTemplate, + } + g.By(fmt.Sprintf("Create namespace for %d", i)) + // defer oc.WithoutNamespace().AsAdmin().Run("delete").Args("ns", ns, "--ignore-not-found").Execute() + // it take time delete ns which is not necessary. currently 5.5h is not enough to delete them. + // so I prefer to keep ns to save case duration + err := oc.WithoutNamespace().AsAdmin().Run("create").Args("ns", ns).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + + o.Expect(olmv1util.Appearance(oc, exutil.Appear, "ns", ns)).To(o.BeTrue()) + + g.By(fmt.Sprintf("Create clustercatalog for %d", i)) + e2e.Logf("=========Create clustercatalog %v=========", clustercatalog.Name) + defer clustercatalog.Delete(oc) + err = clustercatalog.CreateWithoutCheck(oc) + o.Expect(err).NotTo(o.HaveOccurred()) + clustercatalog.WaitCatalogStatus(oc, "true", "Serving", 0) + + g.By(fmt.Sprintf("Create SA for clusterextension for %d", i)) + defer saCrb.Delete(oc) + saCrb.Create(oc) + + g.By(fmt.Sprintf("check ce to be installed for %d", i)) + e2e.Logf("=========Create clusterextension %v=========", ce.Name) + defer ce.Delete(oc) + err = ce.CreateWithoutCheck(oc) + o.Expect(err).NotTo(o.HaveOccurred()) + ce.CheckClusterExtensionCondition(oc, "Progressing", "reason", "Succeeded", 10, 600, 0) + ce.WaitClusterExtensionCondition(oc, "Installed", "True", 0) + } + + endTime := time.Now().UTC() + e2e.Logf("End time: %v", endTime.Format(time.RFC3339)) + + duration := endTime.Sub(startTime) + minutes := int(duration.Minutes()) + if minutes < 1 { + minutes = 1 + } + + podName, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", catalogLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", nsCatalog).Output() + if err == nil { + if !olmv1util.WriteErrToArtifactDir(oc, nsCatalog, podName, "error", "Unhandled|Reconciler error|level=info", caseID, minutes) { + 
e2e.Logf("no error log into artifact for pod %s in %s", podName, nsCatalog) + } + } + podName, err = oc.AsAdmin().WithoutNamespace().Run("get").Args("pods", "-l", ocLabel, "-o=jsonpath={.items[0].metadata.name}", "-n", nsOc).Output() + if err == nil { + if !olmv1util.WriteErrToArtifactDir(oc, nsOc, podName, "error", "Unhandled|Reconciler error|level=info", caseID, minutes) { + e2e.Logf("no error log into artifact for pod %s in %s", podName, nsOc) + } + } + + if !olmv1util.IsPodReady(oc, nsCatalog, catalogLabel) { + _, _ = olmv1util.Get(oc, "pod", "-n", nsCatalog, "-l", catalogLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", catalogLabel), "the pod with app=catalog-operator is not correct") + } + if !olmv1util.IsPodReady(oc, nsOc, ocLabel) { + _, _ = olmv1util.Get(oc, "pod", "-n", nsOc, "-l", ocLabel, "-o", "yaml") + exutil.AssertWaitPollNoErr(fmt.Errorf("the pod with %s is not correct", ocLabel), "the pod with app=olm-operator is not correct") + } + + }) + +}) diff --git a/openshift/tests-extension/test/qe/util/architecture/architecture.go b/openshift/tests-extension/test/qe/util/architecture/architecture.go index f5f340aaa..78de861de 100644 --- a/openshift/tests-extension/test/qe/util/architecture/architecture.go +++ b/openshift/tests-extension/test/qe/util/architecture/architecture.go @@ -80,10 +80,10 @@ func SkipNonAmd64SingleArch(oc *exutil.CLI) Architecture { func getNodeArchitectures(oc *exutil.CLI) []string { output, err := oc.WithoutNamespace().AsAdmin().Run("get").Args("nodes", "-o=jsonpath={.items[*].status.nodeInfo.architecture}").Output() if err != nil { - e2e.Failf("unable to get cluster node architectures: %v", err) + g.Skip(fmt.Sprintf("unable to get cluster node architectures: %v", err)) } if output == "" { - e2e.Failf("no nodes found or architecture information missing") + g.Skip("no nodes found or architecture information missing") } return strings.Fields(output) // Use Fields instead of Split to 
handle multiple spaces } @@ -97,7 +97,7 @@ func getNodeArchitectures(oc *exutil.CLI) []string { func GetAvailableArchitecturesSet(oc *exutil.CLI) []Architecture { architectureStrings := getNodeArchitectures(oc) if len(architectureStrings) == 0 { - e2e.Failf("no node architectures found") + g.Skip("no node architectures found") } // Use map for deduplication with Architecture as key @@ -199,7 +199,7 @@ func (a Architecture) String() string { func ClusterArchitecture(oc *exutil.CLI) Architecture { architectureStrings := getNodeArchitectures(oc) if len(architectureStrings) == 0 { - e2e.Failf("no node architectures found") + g.Skip("no node architectures found") } // Filter out empty strings and convert to Architecture @@ -211,7 +211,7 @@ func ClusterArchitecture(oc *exutil.CLI) Architecture { } if len(architectures) == 0 { - e2e.Failf("no valid node architectures found") + g.Skip("no valid node architectures found") } // Check if all architectures are the same @@ -267,7 +267,7 @@ func GetControlPlaneArch(oc *exutil.CLI) Architecture { architectureStr = strings.TrimSpace(architectureStr) if architectureStr == "" { - e2e.Failf("Control plane node %s has no architecture information", masterNode) + g.Skip(fmt.Sprintf("Control plane node %s has no architecture information", masterNode)) } return FromString(architectureStr) diff --git a/openshift/tests-extension/test/qe/util/olmv1util/helper.go b/openshift/tests-extension/test/qe/util/olmv1util/helper.go index babd7b5bf..46ed388b8 100644 --- a/openshift/tests-extension/test/qe/util/olmv1util/helper.go +++ b/openshift/tests-extension/test/qe/util/olmv1util/helper.go @@ -590,7 +590,7 @@ func HasExternalNetworkAccess(oc *exutil.CLI) bool { // Note: In disconnected environments, curl will fail and bash will return non-zero exit code, // causing DebugNodeWithChroot to return an error. We ignore this error and rely on output checking. cmd := `timeout 10 curl -k https://quay.io > /dev/null 2>&1; [ $? 
-eq 0 ] && echo "connected"` - output, _ := exutil.DebugNodeWithChroot(oc, masterNode, "bash", "-c", cmd) + output, _ := exutil.DebugNodeWithOptionsAndChroot(oc, masterNode, []string{"--to-namespace=default"}, "bash", "-c", cmd) // Check if the output contains "connected" // - Connected environment: curl succeeds -> echo "connected" -> output contains "connected" diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/metrics-endpoint.yml b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/metrics-endpoint.yml new file mode 100644 index 000000000..b43d6e002 --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/metrics-endpoint.yml @@ -0,0 +1,9 @@ +- endpoint: {{.PROMETHEUS_URL}} + token: {{.PROMETHEUS_TOKEN}} + step: 10s + skipTLSVerify: true + metrics: + - metrics-profiles/metrics-aggregated.yml + indexer: + type: local + metricsDirectory: collected-metrics-{{.UUID}} diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/metrics-profiles/metrics-aggregated.yml b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/metrics-profiles/metrics-aggregated.yml new file mode 100644 index 000000000..c2e3b45a8 --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/metrics-profiles/metrics-aggregated.yml @@ -0,0 +1,7 @@ +# Containers & pod metrics + +- query: (sum(irate(container_cpu_usage_seconds_total{container="manager",namespace="openshift-catalogd"}[2m]) * 100) by (container, pod)) > 0 + metricName: containerCPU-Catlogd + +- query: (sum(irate(container_cpu_usage_seconds_total{container="manager",namespace="openshift-operator-controller"}[2m]) * 100) by (container, pod)) > 0 + metricName: containerCPU-OpCon diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/pkg-ins-v1.yml 
b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/pkg-ins-v1.yml new file mode 100644 index 000000000..5766abb4a --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/pkg-ins-v1.yml @@ -0,0 +1,67 @@ +--- +global: + gc: {{.GC}} + gcMetrics: {{.GC_METRICS}} + measurements: + - name: podLatency + + +jobs: + - name: {{.OPERATION}} + jobType: create + jobIterations: {{.JOB_ITERATIONS}} + namespace: {{.OPERATION}} + namespacedIterations: {{.NAMESPACED_ITERATIONS}} + iterationsPerNamespace: {{.ITERATIONS_PER_NAMESPACE}} + cleanup: true + podWait: true + waitWhenFinished: true + maxWaitTimeout: {{.MAX_WAIT_TIMEOUT}} + jobIterationDelay: {{.JOB_ITERATION_DELAY}} + jobPause: {{.JOB_PAUSE}} + qps: {{.QPS}} + burst: {{.BURST}} + executionMode: parallel + verifyObjects: true + errorOnVerify: true + skipIndexing: false + preLoadImages: true + preLoadPeriod: 15s + churn: false + defaultMissingKeysWithZero: false + namespaceLabels: + security.openshift.io/scc.podSecurityLabelSync: false + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged + objects: + + - objectTemplate: templates/catalogd.yml + replicas: 1 + inputVars: + prefixImageName: "quay.io/olmqe/stress-index:vokv" + waitOptions: + customStatusPaths: + - key: ".conditions[] | select(.type==\"Serving\") | .status" + value: "True" + + - objectTemplate: templates/clusterrole.yml + replicas: 1 + + - objectTemplate: templates/sa.yml + replicas: 1 + + - objectTemplate: templates/clusterrolebinding.yml + replicas: 1 + inputVars: + prefixNamespace: {{.OPERATION}} + + - objectTemplate: templates/ce.yml + replicas: 1 + inputVars: + prefixNamespace: {{.OPERATION}} + prefixPkgName: {{.PREFIX_PKG_NAME_V1}} + waitOptions: + customStatusPaths: + - key: ".conditions[] | select(.type==\"Installed\") | .status" + value: "True" diff --git 
a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/catalogd.yml b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/catalogd.yml new file mode 100644 index 000000000..b4290395c --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/catalogd.yml @@ -0,0 +1,9 @@ +apiVersion: olm.operatorframework.io/v1 +kind: ClusterCatalog +metadata: + name: "clustercatalog-{{.Iteration}}" +spec: + source: + type: Image + image: + ref: "{{.prefixImageName}}{{.Iteration}}" diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/ce.yml b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/ce.yml new file mode 100644 index 000000000..ccd94ba70 --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/ce.yml @@ -0,0 +1,12 @@ +apiVersion: olm.operatorframework.io/v1 +kind: ClusterExtension +metadata: + name: "ce-{{.Iteration}}" +spec: + namespace: "{{.prefixNamespace}}-{{.Iteration}}" + serviceAccount: + name: "ins-sa-{{.Iteration}}" + source: + sourceType: Catalog + catalog: + packageName: "{{.prefixPkgName}}{{.Iteration}}" diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/clusterrole.yml b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/clusterrole.yml new file mode 100644 index 000000000..bcf19157d --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/clusterrole.yml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "ins-admin-clusterrole-{{.Iteration}}" +rules: + - apiGroups: + - "*" + resources: + - "*" + verbs: + - "*" diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/clusterrolebinding.yml 
b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/clusterrolebinding.yml new file mode 100644 index 000000000..cc5b181fc --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/clusterrolebinding.yml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "ins-admin-clusterrole-binding-{{.Iteration}}" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: "ins-admin-clusterrole-{{.Iteration}}" +subjects: + - kind: ServiceAccount + name: "ins-sa-{{.Iteration}}" + namespace: "{{.prefixNamespace}}-{{.Iteration}}" diff --git a/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/sa.yml b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/sa.yml new file mode 100644 index 000000000..2f194c754 --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/manifests/config/pkg-ins-v1/templates/sa.yml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "ins-sa-{{.Iteration}}" diff --git a/openshift/tests-extension/test/qe/util/stress/util/ma.py b/openshift/tests-extension/test/qe/util/stress/util/ma.py new file mode 100755 index 000000000..976a9fed8 --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/util/ma.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from ma.cli.__main__ import main + +if __name__ == "__main__": + main() diff --git a/openshift/tests-extension/test/qe/util/stress/util/ma/__init__.py b/openshift/tests-extension/test/qe/util/stress/util/ma/__init__.py new file mode 100644 index 000000000..1898849ec --- /dev/null +++ b/openshift/tests-extension/test/qe/util/stress/util/ma/__init__.py @@ -0,0 +1,10 @@ +import os +import sys + +if sys.version_info < (3, 9): + sys.exit("Sorry, Python < 3.9 is no longer supported.") + +sys.dont_write_bytecode = True + +def version(): + return "0.1" diff --git 
import click
import logging
from ma.helper.containercpu import ContainerCPU
from ma.helper.exceptions import ContainerCPUException

logger = logging.getLogger(__name__)


@click.command()
@click.pass_context
@click.option(
    "-i",
    "--metrics_result_file",
    type=click.Path(
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        resolve_path=True
    ),
    required=True,
    help="the result file of the metrics"
)
@click.option(
    "-o",
    "--output_dir",
    default="./",
    type=click.Path(
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        resolve_path=True
    ),
    help="the directory of generated metrics figure"
)
@click.option(
    "--zscore_threshold",
    type=int,
    default=3,
    required=False,
    help="the threshold for z-score"
)
@click.option(
    "--window_size",
    type=int,
    default=18,
    required=False,
    help="the size of moving window"
)
@click.option(
    "--window_threshold",
    type=int,
    default=3,
    required=False,
    help="the threshold of moving window"
)
@click.option(
    "--watermark",
    type=int,
    default=20,
    required=False,
    help="the cpu usage (scaled by 100) above which a value is treated as abnormal"
)
@click.option(
    "--anomalies_threshold",
    type=int,
    default=2,
    required=False,
    help="the anomalies threshold to determine if the checking fails or not"
)
def check_ccpu(ctx,
               metrics_result_file,
               output_dir,
               zscore_threshold,
               window_size,
               window_threshold,
               watermark,
               anomalies_threshold):
    """
    Check if cpu usage is expected

    Builds a ContainerCPU from the command-line options, runs its screening
    pipeline (handle) and writes the pass/fail marker (ok_or_not).
    Any failure is logged with its traceback and then re-raised unchanged.
    """
    try:
        ccpu = ContainerCPU(metrics_result_file,
                            output_dir,
                            zscore_threshold,
                            window_size,
                            window_threshold,
                            watermark,
                            anomalies_threshold)
        ccpu.handle()
        ccpu.ok_or_not()
    except (Exception, ContainerCPUException):
        # ContainerCPUException is listed explicitly because it derives from
        # BaseException: a bare "except Exception" would let every failure
        # raised by ContainerCPU pass through without being logged.
        logger.exception("checking container cpu failing")
        raise
import logging
import numpy as np

logger = logging.getLogger(__name__)


def z_score(timestamps, values, threshold):
    """Flag samples that lie more than `threshold` standard deviations above the mean.

    Args:
        timestamps: sequence of timestamps, parallel to `values`.
        values: sequence of numeric samples.
        threshold: number of standard deviations above the mean.

    Returns:
        list of (timestamp, value) tuples, in input order. When the standard
        deviation is 0, every sample is returned as a candidate, so the later
        screening stages decide. Empty input returns an empty list.
    """
    if len(values) == 0:
        # nothing to screen; also avoids numpy's mean-of-empty RuntimeWarning
        return []
    mean = np.mean(values)
    std = np.std(values)
    logger.info(f"mean: {mean}, std: {std}")
    if std == 0:
        return [(timestamps[i], value) for i, value in enumerate(values)]
    limit = mean + threshold * std
    return [(timestamps[i], value) for i, value in enumerate(values) if value > limit]


def moving_window_statistics(preliminary_anomalies, timestamps, values, window_size, window_threshold):
    """Re-check each candidate against the `window_size` samples that precede it.

    A candidate is kept when its value exceeds
    window_mean + window_threshold * window_std of the preceding window, or
    when that window has zero standard deviation. Candidates with fewer than
    `window_size` preceding samples are dropped.

    Args:
        preliminary_anomalies: iterable of (timestamp, value) candidates.
        timestamps: list of hashable timestamps, parallel to `values`.
        values: sequence of numeric samples.
        window_size: number of preceding samples in the window
            (the check_ccpu command passes 18 by default).
        window_threshold: number of window standard deviations above the window mean.

    Returns:
        list of the (timestamp, value) candidates that passed.

    Raises:
        ValueError: if a candidate timestamp does not occur in `timestamps`.
    """
    logger.info(f"window_size: {window_size}, window_threshold: {window_threshold}")
    # Index of the first occurrence of every timestamp. It gives the same
    # result as timestamps.index(ts) without a linear scan per candidate.
    first_index = {}
    for position, stamp in enumerate(timestamps):
        first_index.setdefault(stamp, position)

    refined_anomalies = []
    for ts, value in preliminary_anomalies:
        index = first_index.get(ts)
        if index is None:
            # keep the exception type list.index() raised
            raise ValueError(f"{ts!r} is not in list")
        if index < window_size:
            continue
        window = values[index - window_size:index]
        window_mean = np.mean(window)
        window_std = np.std(window)
        if window_std == 0 or value > window_mean + window_threshold * window_std:
            refined_anomalies.append((ts, value))
    return refined_anomalies


def watermark(refined_anomalies, watermark):
    """Keep only the (timestamp, value) pairs whose value is strictly above `watermark`."""
    return [(ts, value) for ts, value in refined_anomalies if value > watermark]
+ + """ + + def __init__(self, metrics_result_file, output_dir, zscore_threshold, window_size, window_threshold, watermark, anomalies_threshold): + self.mrf = metrics_result_file + self.odir = output_dir + self.zscore_threshold = zscore_threshold + self.window_size = window_size + self.window_threshold = window_threshold + self.watermark = watermark + self.anomalies_threshold = anomalies_threshold + self.preliminary_anomalies = [] + self.refined_anomalies = [] + self.final_anomalies = [] + + try: + self.base_name = os.path.basename(self.mrf) + self.base_name_wo_ext, self.base_name_ext = os.path.splitext(self.base_name) + except BaseException as re: + raise ContainerCPUException("parse the file path failed") from re + + try: + with open(self.mrf) as f: + self.data = json.load(f) + self.values = [d['value'] for d in self.data] + self.timestamps = [d['timestamp'] for d in self.data] + self.timestamps_format = [datetime.strptime(d['timestamp'], "%Y-%m-%dT%H:%M:%S.%fZ") for d in self.data] + + except BaseException as re: + raise ContainerCPUException("load ccpu metric data failed") from re + + def handle(self): + self.tendency_chart() + self.preliminary_screening() + self.refinement_screening() + self.final_screening() + self.convert_preliminary_screening() + self.convert_refinement_screening() + self.convert_final_screening() + + def tendency_chart(self): + """ + it draws the cpu usage figure and save it as pdf + """ + try: + util.draw_figure( + self.timestamps_format, + self.values, + self.odir, + self.base_name_wo_ext + ) + except BaseException as re: + raise ContainerCPUException("drawing pdf failed") from re + + + def preliminary_screening(self): + """ + it uses a simple, fast method (e.g., Z-Score) to flag potential anomalies + """ + try: + self.preliminary_anomalies = algo.z_score( + self.timestamps, + self.values, + self.zscore_threshold) + except BaseException as re: + raise ContainerCPUException("preliminary screening failed") from re + + def 
refinement_screening(self): + """ + it applyes stricter, context-aware rules (e.g., moving window statistics) to + validate whether the candidates flagged in preliminary_screening are true anomalies + """ + try: + self.refined_anomalies = algo.moving_window_statistics( + self.preliminary_anomalies, + self.timestamps, + self.values, + self.window_size, + self.window_threshold) + except BaseException as re: + raise ContainerCPUException("refinement screening failed") from re + + def final_screening(self): + try: + self.final_anomalies = algo.watermark( + self.refined_anomalies, + self.watermark) + except BaseException as re: + raise ContainerCPUException("final screening failed") from re + + def get_preliminary_screening(self): + """ + it gets the result of a simple, fast method (e.g., Z-Score) to flag potential anomalies + """ + return self.preliminary_anomalies + + def get_refinement_screening(self): + """ + it get results of refinement screening + """ + return self.refined_anomalies + + def get_final_screening(self): + """ + it get results of final screening + """ + return self.final_anomalies + + def convert_preliminary_screening(self): + """ + it converts the result of a simple, fast method (e.g., Z-Score) to flag potential anomalies + """ + try: + util.convert_screening( + self.preliminary_anomalies, + os.path.join(self.odir, self.base_name_wo_ext) + "_prescr.json" + ) + except BaseException as re: + raise ContainerCPUException("convert preliminary screening failed") from re + + + def convert_refinement_screening(self): + """ + it converts results of refinement screening + """ + try: + util.convert_screening( + self.refined_anomalies, + os.path.join(self.odir, self.base_name_wo_ext) + "_refscr.json" + ) + except BaseException as re: + raise ContainerCPUException("convert refinement screening failed") from re + + def convert_final_screening(self): + """ + it converts results of final screening + """ + try: + util.convert_screening( + self.final_anomalies, + 
class ContainerCPUException(Exception):
    """Exception class to raise error in ContainerCPU

    Derives from Exception rather than BaseException so that ordinary
    ``except Exception`` handlers catch it. BaseException is the root that
    also covers SystemExit and KeyboardInterrupt and is not meant to be
    subclassed by application errors.
    """
def draw_figure(timestamps_format, values, odir, base_name_wo_ext):
    """Plot CPU usage over time and save it as `<odir>/<base_name_wo_ext>_figure.pdf`.

    Args:
        timestamps_format: x-axis values (the caller passes parsed datetimes).
        values: y-axis CPU usage samples, parallel to `timestamps_format`.
        odir: output directory.
        base_name_wo_ext: output file name without extension.
    """
    fig = plt.figure(figsize=(FIGURE_WIDTH, FIGURE_HEIGHT))
    try:
        plt.plot(timestamps_format, values, marker='o')
        plt.ylabel('CPU Usage')
        plt.title('CPU Usage Over Time')
        plt.tight_layout()
        saved_file_wo_ext = os.path.join(odir, base_name_wo_ext)
        plt.savefig(saved_file_wo_ext+"_figure.pdf")
    finally:
        # release the figure even when plotting or saving fails
        plt.close(fig)


def convert_screening(anomalies, file):
    """Write anomalies to `file` as a JSON list of {"timestamp", "value"} objects.

    Args:
        anomalies: iterable of (timestamp, value) pairs; each value is rounded
            to 15 decimal places.
        file: path of the JSON file to create or overwrite.
    """
    formatted_data = [
        {"timestamp": ts, "value": round(val, 15)}
        for ts, val in anomalies
    ]

    output_path = Path(file)
    # ensure_ascii=False may emit non-ASCII characters, so the encoding is
    # pinned instead of depending on the platform locale.
    output_path.write_text(
        json.dumps(formatted_data, indent=2, ensure_ascii=False),
        encoding="utf-8"
    )