update code to automate nto bugs
liqcui committed Nov 24, 2023
1 parent ad0c73d commit 6f85557
Showing 3 changed files with 295 additions and 1 deletion.
56 changes: 55 additions & 1 deletion test/extended/node_tuning/node_tuning.go
@@ -1,6 +1,7 @@
package node_tuning

import (
"context"
"fmt"
"path/filepath"
"strings"
@@ -10,7 +11,12 @@ import (
o "github.com/onsi/gomega"

exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
"k8s.io/kubernetes/test/e2e/framework"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

@@ -19,7 +25,7 @@ var _ = g.Describe("[sig-node-tuning] NTO should", func() {

var (
ntoNamespace = "openshift-cluster-node-tuning-operator"
oc = exutil.NewCLIWithoutNamespace("nto")
oc = exutil.NewCLIWithoutNamespace("nto").AsAdmin()
buildPruningBaseDir = exutil.FixturePath("testdata", "node_tuning")
ntoStalldFile = filepath.Join(buildPruningBaseDir, "nto-stalld.yaml")
stalldCurrentPID string
@@ -111,6 +117,54 @@ var _ = g.Describe("[sig-node-tuning] NTO should", func() {

err = fmt.Errorf("case: %v\nexpected error got because of %v", g.CurrentSpecReport().FullText(), fmt.Sprintf("stalld service restarted : %v", errWait))
o.Expect(err).NotTo(o.HaveOccurred())
})

// OCPBUGS-18052
g.It("SNO installation does not finish due to wait for non-existing machine-config [Early]", func() {
isSNO := isSNOCluster(oc)
if !isSNO {
g.Skip("this test only runs on SNO clusters, skipping it ...")
}

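// Test flow: find the first master node, verify that every MachineConfigPool stays in the
// Updated state, verify that the machine-config and node-tuning cluster operators stay
// Available, and finally check that the machine-config-daemon logs on that master do not
// contain the keywords "Marking Degraded due to" or "not found".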
var (
mcpConfigDaemonset *corev1.Pod
)

ctx := context.TODO()
nodeClient := oc.KubeClient().CoreV1().Nodes()
firstMasterNode, err := getFirstMasterNode(ctx, nodeClient)
o.Expect(err).NotTo(o.HaveOccurred())

e2e.Logf("assert if the status of mcp master keep on updated state")
config, err := framework.LoadConfig()
framework.ExpectNoError(err)
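// MachineConfigPools are read through the dynamic client with the
// machineconfiguration.openshift.io/v1 GroupVersionResource, since no typed MCO client is imported here.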
dynamicClient := dynamic.NewForConfigOrDie(config)
mcps := dynamicClient.Resource(schema.GroupVersionResource{
Group: "machineconfiguration.openshift.io",
Version: "v1",
Resource: "machineconfigpools",
})
pools, err := mcps.List(context.Background(), metav1.ListOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
for _, p := range pools.Items {
assertIfSpecifiedMCPSKeepUpdatedStateWithRetry(mcps, p.GetName())
}

e2e.Logf("assert if the status of co machine-config is availabe state")
assertIfSpecifiedCOKeepAvailableStateWithRetry(oc, "machine-config")

e2e.Logf("assert if the status of co node-tuning is availabe state")
assertIfSpecifiedCOKeepAvailableStateWithRetry(oc, "node-tuning")

// Fetch the machine-config-daemon pod running on the first master node so its logs can be inspected.
kf := oc.KubeFramework()
mcpConfigDaemonset, err = exutil.GetMachineConfigDaemonByNode(kf.ClientSet, firstMasterNode)
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("mcpConfigDaemonsetPodName %v", mcpConfigDaemonset.Name)
e2e.Logf("Get pod logs for %v", mcpConfigDaemonset.Name)
podLogsStdout, err := getPodLogsLastLines(context.Background(), oc.KubeClient(), "openshift-machine-config-operator", mcpConfigDaemonset.Name, "machine-config-daemon", 20)
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("Check if the log of %v contains keyword [Marking Degraded due to|not found]", mcpConfigDaemonset.Name)
logAssertResult := assertPodLogsLastLines(mcpConfigDaemonset.Name, podLogsStdout, "Marking Degraded due to|not found")
o.Expect(logAssertResult).To(o.BeFalse())
})
})
238 changes: 238 additions & 0 deletions test/extended/node_tuning/node_tuning_utils.go
@@ -0,0 +1,238 @@
package node_tuning

import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
configv1 "github.com/openshift/api/config/v1"
exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
clientset "k8s.io/client-go/kubernetes"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/kubernetes/test/e2e/framework"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

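// masterNodeRoleLabel is the node label used to select control plane (master) nodes.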
const masterNodeRoleLabel = "node-role.kubernetes.io/master"

// isSNOCluster checks whether the cluster is a single-node OpenShift (SNO) cluster
func isSNOCluster(oc *exutil.CLI) bool {
infrastructureType, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("the cluster type is %v", infrastructureType.Status.ControlPlaneTopology)
return infrastructureType.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode
}

// getFirstMasterNode returns the first node carrying the master role label.
func getFirstMasterNode(ctx context.Context, nodeClient v1.NodeInterface) (*corev1.Node, error) {
masterNodes, err := nodeClient.List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel})
if err != nil {
return nil, fmt.Errorf("failed to list master nodes: %w", err)
}
if len(masterNodes.Items) == 0 {
return nil, fmt.Errorf("no master nodes found with label %q", masterNodeRoleLabel)
}
firstMasterNode := &masterNodes.Items[0]
e2e.Logf("the first master node is %v", firstMasterNode.Name)
return firstMasterNode, nil
}

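// assertPodLogsLastLines reports whether the given pod logs match the regexp filter and logs the first match.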
func assertPodLogsLastLines(podName string, podLogs string, filter string) bool {
regNTOPodLogs, err := regexp.Compile(".*" + filter + ".*")
o.Expect(err).NotTo(o.HaveOccurred())
isMatch := regNTOPodLogs.MatchString(podLogs)
if isMatch {
loglines := regNTOPodLogs.FindAllString(podLogs, -1)
e2e.Logf("the logs of pod %v is [%v]", podName, loglines[0])
return true
}
e2e.Logf("the keywords [%s] of pod isn't found ...", filter)
return false
}

func getPodLogsLastLines(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, lastlines int) (string, error) {
return getPodLogsInternal(ctx, c, namespace, podName, containerName, false, nil, &lastlines)
}

// getPodLogsInternal is a utility helper (suitable for use inside gomega Eventually blocks) that fetches
// container logs through the REST client, optionally limited by sinceTime and tailLines.
func getPodLogsInternal(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, previous bool, sinceTime *metav1.Time, tailLines *int) (string, error) {
request := c.CoreV1().RESTClient().Get().
Resource("pods").
Namespace(namespace).
Name(podName).SubResource("log").
Param("container", containerName).
Param("previous", strconv.FormatBool(previous))
if sinceTime != nil {
request.Param("sinceTime", sinceTime.Format(time.RFC3339))
}
if tailLines != nil {
request.Param("tailLines", strconv.Itoa(*tailLines))
}
logs, err := request.Do(ctx).Raw()
if err != nil {
return "", err
}
if strings.Contains(string(logs), "Internal Error") {
return "", fmt.Errorf("Fetched log contains \"Internal Error\": %q", string(logs))
}
return string(logs), err
}

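// isPoolUpdated reads a MachineConfigPool through the dynamic client and reports whether the pool
// is up to date (Updated=True, Updating=False, Degraded=False) and whether it is still updating.
// A paused pool is treated as up to date.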
func isPoolUpdated(dc dynamic.NamespaceableResourceInterface, name string) (poolUpToDate bool, poolIsUpdating bool) {
pool, err := dc.Get(context.Background(), name, metav1.GetOptions{})
if err != nil {
framework.Logf("error getting pool %s: %v", name, err)
return false, false
}
paused, found, err := unstructured.NestedBool(pool.Object, "spec", "paused")
if err != nil || !found {
return false, false
}
conditions, found, err := unstructured.NestedFieldNoCopy(pool.Object, "status", "conditions")
if err != nil || !found {
return false, false
}
original, ok := conditions.([]interface{})
if !ok {
return false, false
}
var updated, updating, degraded bool
for _, obj := range original {
o, ok := obj.(map[string]interface{})
if !ok {
return false, false
}
t, found, err := unstructured.NestedString(o, "type")
if err != nil || !found {
return false, false
}
s, found, err := unstructured.NestedString(o, "status")
if err != nil || !found {
return false, false
}
if t == "Updated" && s == "True" {
updated = true
}
if t == "Updating" && s == "True" {
updating = true
}
if t == "Degraded" && s == "True" {
degraded = true
}
}
if paused {
framework.Logf("Pool %s is paused, treating as up-to-date (Updated: %v, Updating: %v, Degraded: %v)", name, updated, updating, degraded)
return true, updating
}
if updated && !updating && !degraded {
return true, updating
}
framework.Logf("Pool %s is still reporting (Updated: %v, Updating: %v, Degraded: %v)", name, updated, updating, degraded)
return false, updating
}

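// findCondition returns the condition with the given type from conditions, or nil if it is not present.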
func findCondition(conditions []configv1.ClusterOperatorStatusCondition, name configv1.ClusterStatusConditionType) *configv1.ClusterOperatorStatusCondition {
for i := range conditions {
if name == conditions[i].Type {
return &conditions[i]
}
}
return nil
}

// assertIfSpecifiedCOKeepAvailableStateWithRetry waits up to 5 minutes and checks that the cluster operator
// stays in the Available state, in case an OCP upgrade, node reboot, or scale-out triggered by another test case is in progress.
func assertIfSpecifiedCOKeepAvailableStateWithRetry(oc *exutil.CLI, coName string) {
errWait := wait.Poll(1*time.Minute, 5*time.Minute, func() (bool, error) {
isCOAvailable, err := isCOAvailableState(oc, coName)
o.Expect(err).ToNot(o.HaveOccurred())
if isCOAvailable {
e2e.Logf("the status of cluster operator %v keep on available state", coName)
return true, nil
}
e2e.Logf("the status of co %v doesn't stay on expected state, will check again", coName)
return false, nil
})
AssertWaitPollNoErr(errWait, "the status of co keep unexpected state")
}

// isCOAvailableState checks whether the specified cluster operator is healthy: Available is True, Progressing is False, and Degraded is False
func isCOAvailableState(oc *exutil.CLI, coName string) (bool, error) {
var (
clusterOperators []configv1.ClusterOperator
desiredCO configv1.ClusterOperator
isAvailable bool
)
clusterOperatorsList, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{})
o.Expect(err).ToNot(o.HaveOccurred())
clusterOperators = clusterOperatorsList.Items
for _, clusterOperator := range clusterOperators {
if clusterOperator.Name == coName {
desiredCO = clusterOperator
e2e.Logf("desiredCO Name is %v", desiredCO.Name)
e2e.Logf("desiredCO.status.conditions of %v is %v", desiredCO.Name, desiredCO.Status.Conditions)
break
}
}
available := findCondition(desiredCO.Status.Conditions, configv1.OperatorAvailable)
degraded := findCondition(desiredCO.Status.Conditions, configv1.OperatorDegraded)
progressing := findCondition(desiredCO.Status.Conditions, configv1.OperatorProgressing)
if available == nil || degraded == nil || progressing == nil {
return false, fmt.Errorf("cluster operator %v was not found or is missing status conditions", coName)
}
e2e.Logf("the status of co %v is: available.Status [%v] degraded.Status [%v] and progressing.Status [%v]", coName, available.Status, degraded.Status, progressing.Status)
if available.Status == configv1.ConditionTrue &&
degraded.Status == configv1.ConditionFalse &&
progressing.Status == configv1.ConditionFalse {
e2e.Logf("the status of cluster operator %v is back to the expected Available state", coName)
isAvailable = true
} else {
// print the co status for troubleshooting why the co stays in the Available=false state
e2e.Logf("desiredCO status is %v", desiredCO.Status)
}
return isAvailable, nil
}

// assertIfSpecifiedMCPSKeepUpdatedStateWithRetry waits up to 10 minutes and checks that the given
// machine config pool (master/worker) stays in the Updated state, in case another test case triggers
// a node reboot, scale-out, or update while this test case is running.
func assertIfSpecifiedMCPSKeepUpdatedStateWithRetry(mcps dynamic.NamespaceableResourceInterface, mcpName string) {
errWait := wait.Poll(1*time.Minute, 10*time.Minute, func() (bool, error) {
updated, updating := isPoolUpdated(mcps, mcpName)
if updated && !updating {
e2e.Logf("the mcp %v is in the Updated state and is not updating", mcpName)
return true, nil
}
e2e.Logf("the mcp %v is still updating or degraded, will check again", mcpName)
e2e.Logf("the status of mcp is: updated - [%v] updating - [%v]", updated, updating)
return false, nil
})
AssertWaitPollNoErr(errWait, "the status of mcp keep unexpected state")
}

// AssertWaitPollNoErr asserts that the return value of wait.Poll is nil and expects NO error.
// e is the error returned by wait.Poll; msg is the reason the wait timed out.
// If e is nil, the assertion passes and nothing happens.
// If e is not nil, the default error message "timed out waiting for the condition" is not printed as-is,
// because it prevents RP AA from analyzing the result exactly.
// If e is "timed out waiting for the condition" or "context deadline exceeded", it is replaced by msg.
// Otherwise e itself is printed and the case fails.
func AssertWaitPollNoErr(e error, msg string) {
if e == nil {
return
}
var err error
if strings.Compare(e.Error(), "timed out waiting for the condition") == 0 || strings.Compare(e.Error(), "context deadline exceeded") == 0 {
err = fmt.Errorf("case: %v\nerror: %s", g.CurrentSpecReport().FullText(), msg)
} else {
err = fmt.Errorf("case: %v\nerror: %s", g.CurrentSpecReport().FullText(), e.Error())
}
o.Expect(err).NotTo(o.HaveOccurred())
}

Some generated files are not rendered by default.
