update code to automate nto bugs
liqcui committed Nov 24, 2023
1 parent ad0c73d commit 6f85557
Showing 3 changed files with 295 additions and 1 deletion.
56 changes: 55 additions & 1 deletion test/extended/node_tuning/node_tuning.go
@@ -1,6 +1,7 @@
package node_tuning

import (
"context"
"fmt"
"path/filepath"
"strings"
@@ -10,7 +11,12 @@ import (
o "github.com/onsi/gomega"

exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
"k8s.io/kubernetes/test/e2e/framework"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

@@ -19,7 +25,7 @@ var _ = g.Describe("[sig-node-tuning] NTO should", func() {

var (
ntoNamespace = "openshift-cluster-node-tuning-operator"
oc = exutil.NewCLIWithoutNamespace("nto")
oc = exutil.NewCLIWithoutNamespace("nto").AsAdmin()
buildPruningBaseDir = exutil.FixturePath("testdata", "node_tuning")
ntoStalldFile = filepath.Join(buildPruningBaseDir, "nto-stalld.yaml")
stalldCurrentPID string
@@ -111,6 +117,54 @@ var _ = g.Describe("[sig-node-tuning] NTO should", func() {

err = fmt.Errorf("case: %v\nexpected error got because of %v", g.CurrentSpecReport().FullText(), fmt.Sprintf("stalld service restarted : %v", errWait))
o.Expect(err).NotTo(o.HaveOccurred())
})

// OCPBUGS-18052
g.It("SNO installation does not finish due to wait for non-existing machine-config [Early]", func() {
isSNO := isSNOCluster(oc)
if !isSNO {
g.Skip("this test only runs on SNO clusters, skipping it ...")
}

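// Test flow: find the first master node, verify that every MachineConfigPool stays in the
// Updated state, verify that the machine-config and node-tuning cluster operators stay
// Available, and finally check that the machine-config-daemon logs on that master do not
// contain the keywords "Marking Degraded due to" or "not found".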
var (
mcpConfigDaemonset *corev1.Pod
)

ctx := context.TODO()
nodeClient := oc.KubeClient().CoreV1().Nodes()
firstMasterNode, err := getFirstMasterNode(ctx, nodeClient)
o.Expect(err).NotTo(o.HaveOccurred())

e2e.Logf("assert if the status of mcp master keep on updated state")
config, err := framework.LoadConfig()
framework.ExpectNoError(err)
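// MachineConfigPools are read through the dynamic client with the
// machineconfiguration.openshift.io/v1 GroupVersionResource, since no typed MCO client is imported here.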
dynamicClient := dynamic.NewForConfigOrDie(config)
mcps := dynamicClient.Resource(schema.GroupVersionResource{
Group: "machineconfiguration.openshift.io",
Version: "v1",
Resource: "machineconfigpools",
})
pools, err := mcps.List(context.Background(), metav1.ListOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
for _, p := range pools.Items {
assertIfSpecifiedMCPSKeepUpdatedStateWithRetry(mcps, p.GetName())
}

e2e.Logf("assert if the status of co machine-config is availabe state")
assertIfSpecifiedCOKeepAvailableStateWithRetry(oc, "machine-config")

e2e.Logf("assert if the status of co node-tuning is availabe state")
assertIfSpecifiedCOKeepAvailableStateWithRetry(oc, "node-tuning")

// Fetch the machine-config-daemon pod running on the first master node so its logs can be inspected.
kf := oc.KubeFramework()
mcpConfigDaemonset, err = exutil.GetMachineConfigDaemonByNode(kf.ClientSet, firstMasterNode)
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("mcpConfigDaemonsetPodName %v", mcpConfigDaemonset.Name)
e2e.Logf("Get pod logs for %v", mcpConfigDaemonset.Name)
podLogsStdout, err := getPodLogsLastLines(context.Background(), oc.KubeClient(), "openshift-machine-config-operator", mcpConfigDaemonset.Name, "machine-config-daemon", 20)
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("Check if the log of %v contains keyword [Marking Degraded due to|not found]", mcpConfigDaemonset.Name)
logAssertResult := assertPodLogsLastLines(mcpConfigDaemonset.Name, podLogsStdout, "Marking Degraded due to|not found")
o.Expect(logAssertResult).To(o.BeFalse())
})
})
238 changes: 238 additions & 0 deletions test/extended/node_tuning/node_tuning_utils.go
@@ -0,0 +1,238 @@
package node_tuning

import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
configv1 "github.com/openshift/api/config/v1"
exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
clientset "k8s.io/client-go/kubernetes"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/kubernetes/test/e2e/framework"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

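// masterNodeRoleLabel is the node label used to select control plane (master) nodes.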
const masterNodeRoleLabel = "node-role.kubernetes.io/master"

// isSNOCluster checks whether the cluster is a single-node OpenShift (SNO) cluster
func isSNOCluster(oc *exutil.CLI) bool {
infrastructureType, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("the cluster type is %v", infrastructureType.Status.ControlPlaneTopology)
return infrastructureType.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode
}

// getFirstMasterNode returns the first node carrying the master role label.
func getFirstMasterNode(ctx context.Context, nodeClient v1.NodeInterface) (*corev1.Node, error) {
masterNodes, err := nodeClient.List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel})
if err != nil {
return nil, fmt.Errorf("failed to list master nodes: %w", err)
}
if len(masterNodes.Items) == 0 {
return nil, fmt.Errorf("no master nodes found with label %q", masterNodeRoleLabel)
}
firstMasterNode := &masterNodes.Items[0]
e2e.Logf("the first master node is %v", firstMasterNode.Name)
return firstMasterNode, nil
}

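// assertPodLogsLastLines reports whether the given pod logs match the regexp filter and logs the first match.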
func assertPodLogsLastLines(podName string, podLogs string, filter string) bool {
regNTOPodLogs, err := regexp.Compile(".*" + filter + ".*")
o.Expect(err).NotTo(o.HaveOccurred())
isMatch := regNTOPodLogs.MatchString(podLogs)
if isMatch {
loglines := regNTOPodLogs.FindAllString(podLogs, -1)
e2e.Logf("the logs of pod %v is [%v]", podName, loglines[0])
return true
}
e2e.Logf("the keywords [%s] of pod isn't found ...", filter)
return false
}

func getPodLogsLastLines(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, lastlines int) (string, error) {
return getPodLogsInternal(ctx, c, namespace, podName, containerName, false, nil, &lastlines)
}

// getPodLogsInternal is a utility helper (suitable for use inside gomega Eventually blocks) that fetches
// container logs through the REST client, optionally limited by sinceTime and tailLines.
func getPodLogsInternal(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, previous bool, sinceTime *metav1.Time, tailLines *int) (string, error) {
request := c.CoreV1().RESTClient().Get().
Resource("pods").
Namespace(namespace).
Name(podName).SubResource("log").
Param("container", containerName).
Param("previous", strconv.FormatBool(previous))
if sinceTime != nil {
request.Param("sinceTime", sinceTime.Format(time.RFC3339))
}
if tailLines != nil {
request.Param("tailLines", strconv.Itoa(*tailLines))
}
logs, err := request.Do(ctx).Raw()
if err != nil {
return "", err
}
if strings.Contains(string(logs), "Internal Error") {
return "", fmt.Errorf("Fetched log contains \"Internal Error\": %q", string(logs))
}
return string(logs), err
}

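// isPoolUpdated reads a MachineConfigPool through the dynamic client and reports whether the pool
// is up to date (Updated=True, Updating=False, Degraded=False) and whether it is still updating.
// A paused pool is treated as up to date.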
func isPoolUpdated(dc dynamic.NamespaceableResourceInterface, name string) (poolUpToDate bool, poolIsUpdating bool) {
pool, err := dc.Get(context.Background(), name, metav1.GetOptions{})
if err != nil {
framework.Logf("error getting pool %s: %v", name, err)
return false, false
}
paused, found, err := unstructured.NestedBool(pool.Object, "spec", "paused")
if err != nil || !found {
return false, false
}
conditions, found, err := unstructured.NestedFieldNoCopy(pool.Object, "status", "conditions")
if err != nil || !found {
return false, false
}
original, ok := conditions.([]interface{})
if !ok {
return false, false
}
var updated, updating, degraded bool
for _, obj := range original {
o, ok := obj.(map[string]interface{})
if !ok {
return false, false
}
t, found, err := unstructured.NestedString(o, "type")
if err != nil || !found {
return false, false
}
s, found, err := unstructured.NestedString(o, "status")
if err != nil || !found {
return false, false
}
if t == "Updated" && s == "True" {
updated = true
}
if t == "Updating" && s == "True" {
updating = true
}
if t == "Degraded" && s == "True" {
degraded = true
}
}
if paused {
framework.Logf("Pool %s is paused, treating as up-to-date (Updated: %v, Updating: %v, Degraded: %v)", name, updated, updating, degraded)
return true, updating
}
if updated && !updating && !degraded {
return true, updating
}
framework.Logf("Pool %s is still reporting (Updated: %v, Updating: %v, Degraded: %v)", name, updated, updating, degraded)
return false, updating
}

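// findCondition returns the condition with the given type from conditions, or nil if it is not present.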
func findCondition(conditions []configv1.ClusterOperatorStatusCondition, name configv1.ClusterStatusConditionType) *configv1.ClusterOperatorStatusCondition {
for i := range conditions {
if name == conditions[i].Type {
return &conditions[i]
}
}
return nil
}

// assertIfSpecifiedCOKeepAvailableStateWithRetry waits up to 5 minutes and checks that the cluster operator
// stays in the Available state, in case an OCP upgrade, node reboot, or scale-out triggered by another test case is in progress.
func assertIfSpecifiedCOKeepAvailableStateWithRetry(oc *exutil.CLI, coName string) {
errWait := wait.Poll(1*time.Minute, 5*time.Minute, func() (bool, error) {
isCOAvailable, err := isCOAvailableState(oc, coName)
o.Expect(err).ToNot(o.HaveOccurred())
if isCOAvailable {
e2e.Logf("the status of cluster operator %v keep on available state", coName)
return true, nil
}
e2e.Logf("the status of co %v doesn't stay on expected state, will check again", coName)
return false, nil
})
AssertWaitPollNoErr(errWait, "the status of co keep unexpected state")
}

// isCOAvailableState checks whether the specified cluster operator is healthy: Available is True, Progressing is False, and Degraded is False
func isCOAvailableState(oc *exutil.CLI, coName string) (bool, error) {
var (
clusterOperators []configv1.ClusterOperator
desiredCO configv1.ClusterOperator
isAvailable bool
)
clusterOperatorsList, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{})
o.Expect(err).ToNot(o.HaveOccurred())
clusterOperators = clusterOperatorsList.Items
for _, clusterOperator := range clusterOperators {
if clusterOperator.Name == coName {
desiredCO = clusterOperator
e2e.Logf("desiredCO Name is %v", desiredCO.Name)
e2e.Logf("desiredCO.status.conditions of %v is %v", desiredCO.Name, desiredCO.Status.Conditions)
break
}
}
available := findCondition(desiredCO.Status.Conditions, configv1.OperatorAvailable)
degraded := findCondition(desiredCO.Status.Conditions, configv1.OperatorDegraded)
progressing := findCondition(desiredCO.Status.Conditions, configv1.OperatorProgressing)
if available == nil || degraded == nil || progressing == nil {
return false, fmt.Errorf("cluster operator %v was not found or is missing status conditions", coName)
}
e2e.Logf("the status of co %v is: available.Status [%v] degraded.Status [%v] and progressing.Status [%v]", coName, available.Status, degraded.Status, progressing.Status)
if available.Status == configv1.ConditionTrue &&
degraded.Status == configv1.ConditionFalse &&
progressing.Status == configv1.ConditionFalse {
e2e.Logf("the status of cluster operator %v is back to the expected Available state", coName)
isAvailable = true
} else {
// print the co status for troubleshooting why the co stays in the Available=false state
e2e.Logf("desiredCO status is %v", desiredCO.Status)
}
return isAvailable, nil
}

// assertIfSpecifiedMCPSKeepUpdatedStateWithRetry waits up to 10 minutes and checks that the given
// machine config pool (master/worker) stays in the Updated state, in case another test case triggers
// a node reboot, scale-out, or update while this test case is running.
func assertIfSpecifiedMCPSKeepUpdatedStateWithRetry(mcps dynamic.NamespaceableResourceInterface, mcpName string) {
errWait := wait.Poll(1*time.Minute, 10*time.Minute, func() (bool, error) {
updated, updating := isPoolUpdated(mcps, mcpName)
if updated && !updating {
e2e.Logf("the mcp %v is in the Updated state and is not updating", mcpName)
return true, nil
}
e2e.Logf("the mcp %v is still updating or degraded, will check again", mcpName)
e2e.Logf("the status of mcp is: updated - [%v] updating - [%v]", updated, updating)
return false, nil
})
AssertWaitPollNoErr(errWait, "the status of mcp keep unexpected state")
}

// AssertWaitPollNoErr asserts that the return value of wait.Poll is nil and expects NO error.
// e is the error returned by wait.Poll; msg is the reason the wait timed out.
// If e is nil, the assertion passes and nothing happens.
// If e is not nil, the default error message "timed out waiting for the condition" is not printed as-is,
// because it prevents RP AA from analyzing the result exactly.
// If e is "timed out waiting for the condition" or "context deadline exceeded", it is replaced by msg.
// Otherwise e itself is printed and the case fails.
func AssertWaitPollNoErr(e error, msg string) {
if e == nil {
return
}
var err error
if strings.Compare(e.Error(), "timed out waiting for the condition") == 0 || strings.Compare(e.Error(), "context deadline exceeded") == 0 {
err = fmt.Errorf("case: %v\nerror: %s", g.CurrentSpecReport().FullText(), msg)
} else {
err = fmt.Errorf("case: %v\nerror: %s", g.CurrentSpecReport().FullText(), e.Error())
}
o.Expect(err).NotTo(o.HaveOccurred())
}

Some generated files are not rendered by default.
