PSAP-1210: Update code to automate OCPBUGS-18052 #28382

Merged: 4 commits merged on Dec 21, 2023
Changes from 2 commits
58 changes: 57 additions & 1 deletion test/extended/node_tuning/node_tuning.go
@@ -1,6 +1,7 @@
package node_tuning

import (
"context"
"fmt"
"path/filepath"
"strings"
@@ -10,7 +11,11 @@ import (
o "github.com/onsi/gomega"

exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

@@ -19,7 +24,7 @@ var _ = g.Describe("[sig-node-tuning] NTO should", func() {

var (
ntoNamespace = "openshift-cluster-node-tuning-operator"
oc = exutil.NewCLIWithoutNamespace("nto")
oc = exutil.NewCLIWithoutNamespace("nto").AsAdmin()
buildPruningBaseDir = exutil.FixturePath("testdata", "node_tuning")
ntoStalldFile = filepath.Join(buildPruningBaseDir, "nto-stalld.yaml")
stalldCurrentPID string
@@ -111,6 +116,57 @@ var _ = g.Describe("[sig-node-tuning] NTO should", func() {

err = fmt.Errorf("case: %v\nexpected error got because of %v", g.CurrentSpecReport().FullText(), fmt.Sprintf("stalld service restarted : %v", errWait))
o.Expect(err).NotTo(o.HaveOccurred())
})

// OCPBUGS-18052
g.It("SNO installation does not finish due to wait for non-existing machine-config [Early]", func() {
isSNO, err := isSNOCluster(oc)
o.Expect(err).NotTo(o.HaveOccurred())
if !isSNO {
g.Skip("only test on SNO cluster, skipping it ...")
}

var (
mcpConfigDaemonset *corev1.Pod
)

ctx := context.TODO()
nodeClient := oc.KubeClient().CoreV1().Nodes()
firstMasterNode, err := getFirstMasterNode(ctx, nodeClient)
o.Expect(err).NotTo(o.HaveOccurred())

e2e.Logf("ensure that the status of mcp is on updated state")
config, err := e2e.LoadConfig()
e2e.ExpectNoError(err)
dynamicClient := dynamic.NewForConfigOrDie(config)
mcps := dynamicClient.Resource(schema.GroupVersionResource{
Group: "machineconfiguration.openshift.io",
Version: "v1",
Resource: "machineconfigpools",
})
pools, err := mcps.List(context.Background(), metav1.ListOptions{})
o.Expect(err).NotTo(o.HaveOccurred())
for _, p := range pools.Items {
err = waitForUpdatedMCP(mcps, p.GetName())
o.Expect(err).NotTo(o.HaveOccurred())
}

e2e.Logf("ensure that the status of co machine-config is availabe state")
err = waitForClusterOperatorAvailable(oc, "machine-config")
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("ensure that the status of co node-tuning is availabe state")
err = waitForClusterOperatorAvailable(oc, "node-tuning")
o.Expect(err).NotTo(o.HaveOccurred())

kf := oc.KubeFramework()
mcpConfigDaemonset, err = exutil.GetMachineConfigDaemonByNode(kf.ClientSet, firstMasterNode)
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("get pod logs for %v", mcpConfigDaemonset.Name)
podLogsStdout, err := getPodLogsLastLines(context.Background(), oc.KubeClient(), "openshift-machine-config-operator", mcpConfigDaemonset.Name, "machine-config-daemon", 20)
o.Expect(err).NotTo(o.HaveOccurred())
e2e.Logf("check if the log of %v contains keyword [marking degraded due to|not found]", mcpConfigDaemonset.Name)
logAssertResult, err := podLogsMatch(mcpConfigDaemonset.Name, podLogsStdout, "Marking Degraded due to|not found")
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(logAssertResult).To(o.BeFalse())
})
})
213 changes: 213 additions & 0 deletions test/extended/node_tuning/node_tuning_utils.go
@@ -0,0 +1,213 @@
package node_tuning

import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"

configv1 "github.com/openshift/api/config/v1"
exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/dynamic"
clientset "k8s.io/client-go/kubernetes"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

const masterNodeRoleLabel = "node-role.kubernetes.io/master"

// isSNOCluster will check if OCP is a single node cluster
func isSNOCluster(oc *exutil.CLI) (bool, error) {
infrastructureType, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{})
if err != nil {
return false, err
}
e2e.Logf("the cluster type is %v", infrastructureType.Status.ControlPlaneTopology)
return infrastructureType.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode, nil
}

func getFirstMasterNode(ctx context.Context, nodeClient v1.NodeInterface) (*corev1.Node, error) {
masterNodes, err := nodeClient.List(ctx, metav1.ListOptions{LabelSelector: masterNodeRoleLabel})
if err != nil || len(masterNodes.Items) == 0 {
e2e.Logf("failed to list master nodes %v", err)
return nil, err
}
return &masterNodes.Items[0], err
}

func podLogsMatch(podName string, podLogs string, filter string) (bool, error) {
regNTOPodLogs, err := regexp.Compile(".*" + filter + ".*")
if err != nil {
return false, err
}
isMatch := regNTOPodLogs.MatchString(podLogs)
if isMatch {
loglines := regNTOPodLogs.FindAllString(podLogs, -1)
e2e.Logf("the [%v] matched in the logs of pod %v, full log is [%v]", filter, podName, loglines[0])
return true, nil
}
e2e.Logf("the keywords [%s] of pod isn't found ...", filter)
return false, nil
}

func getPodLogsLastLines(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, lastlines int) (string, error) {
return getPodLogsInternal(ctx, c, namespace, podName, containerName, false, nil, &lastlines)
}

// getPodLogsInternal retrieves container logs via the pods/log subresource; it is intended as a building block for polling helpers such as gomega's Eventually
func getPodLogsInternal(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, previous bool, sinceTime *metav1.Time, tailLines *int) (string, error) {
request := c.CoreV1().RESTClient().Get().
Resource("pods").
Namespace(namespace).
Name(podName).SubResource("log").
Param("container", containerName).
Param("previous", strconv.FormatBool(previous))
if sinceTime != nil {
request.Param("sinceTime", sinceTime.Format(time.RFC3339))
}
if tailLines != nil {
request.Param("tailLines", strconv.Itoa(*tailLines))
}
logs, err := request.Do(ctx).Raw()
if err != nil {
return "", err
}
if strings.Contains(string(logs), "Internal Error") {
return "", fmt.Errorf("fetched log contains \"Internal Error\": %q", string(logs))
}
return string(logs), err
}
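The comment above pitches getPodLogsInternal as a building block for gomega's Eventually; the following is an editorial sketch of that usage, not part of this PR. waitForLogKeyword is a hypothetical wrapper, the 2-minute/10-second poll settings are illustrative, and it assumes the gomega alias o used in node_tuning.go plus the clients already imported in this package.

// Hypothetical illustration only (not in the PR): wrap getPodLogsLastLines in
// gomega's Eventually so the log fetch is retried until the keyword shows up.
// Assumes package node_tuning with o "github.com/onsi/gomega" imported.
func waitForLogKeyword(ctx context.Context, c clientset.Interface, ns, pod, container, keyword string) {
	o.Eventually(func() (string, error) {
		// re-read the last 20 log lines on every poll
		return getPodLogsLastLines(ctx, c, ns, pod, container, 20)
	}).WithTimeout(2 * time.Minute).WithPolling(10 * time.Second).Should(o.ContainSubstring(keyword))
}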

func isPoolUpdated(dc dynamic.NamespaceableResourceInterface, name string) (poolUpToDate bool, poolIsUpdating bool) {
pool, err := dc.Get(context.Background(), name, metav1.GetOptions{})
if err != nil {
e2e.Logf("error getting pool %s: %v", name, err)
return false, false
}
paused, found, err := unstructured.NestedBool(pool.Object, "spec", "paused")
if err != nil || !found {
return false, false
}
conditions, found, err := unstructured.NestedFieldNoCopy(pool.Object, "status", "conditions")
if err != nil || !found {
return false, false
}
original, ok := conditions.([]interface{})
if !ok {
return false, false
}
var updated, updating, degraded bool
for _, obj := range original {
o, ok := obj.(map[string]interface{})
if !ok {
return false, false
}
t, found, err := unstructured.NestedString(o, "type")
if err != nil || !found {
return false, false
}
s, found, err := unstructured.NestedString(o, "status")
if err != nil || !found {
return false, false
}
if t == "Updated" && s == "True" {
updated = true
}
if t == "Updating" && s == "True" {
updating = true
}
if t == "Degraded" && s == "True" {
degraded = true
}
}
if paused {
e2e.Logf("pool %s is paused, treating as up-to-date (updated: %v, updating: %v, degraded: %v)", name, updated, updating, degraded)
return true, updating
}
if updated && !updating && !degraded {
return true, updating
}
e2e.Logf("pool %s is still reporting (updated: %v, updating: %v, degraded: %v)", name, updated, updating, degraded)
return false, updating
}

func findCondition(conditions []configv1.ClusterOperatorStatusCondition, name configv1.ClusterStatusConditionType) *configv1.ClusterOperatorStatusCondition {
for i := range conditions {
if name == conditions[i].Type {
return &conditions[i]
}
}
return nil
}

// Check every 60 seconds whether the co status is available, for a maximum of 5 minutes,
// just in case the co status becomes degraded or progressing during an OCP upgrade or a node reboot/scale-out.

Review comment: Check if the co status is available every 60 seconds, for a maximum of 5 tries.

Author reply: Fixed in new code, thank you for your suggestion!
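To make the interval/timeout discussion above concrete, here is a minimal, self-contained editorial sketch (not part of this PR) of wait.Poll's semantics: the condition runs once per interval until the timeout, so 1 minute / 5 minutes amounts to roughly five tries. The sketch uses 1 second / 5 seconds so it finishes quickly.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	tries := 0
	// interval = 1s, timeout = 5s: the condition runs roughly five times before
	// wait.Poll gives up and returns a timeout error.
	err := wait.Poll(1*time.Second, 5*time.Second, func() (bool, error) {
		tries++
		fmt.Printf("try %d\n", tries)
		return false, nil // never succeed, just to show the bound
	})
	fmt.Printf("stopped after %d tries: %v\n", tries, err)
}

The same reasoning applies to waitForUpdatedMCP further below, where 1 minute / 10 minutes bounds the MCP check to roughly ten tries.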

func waitForClusterOperatorAvailable(oc *exutil.CLI, coName string) error {
return wait.Poll(1*time.Minute, 5*time.Minute, func() (bool, error) {
isCOAvailable, err := isCOAvailableState(oc, coName)
if err != nil {
return false, err
}
if isCOAvailable {
e2e.Logf("the status of cluster operator %v keep on available state", coName)
return true, nil
}
e2e.Logf("the status of co %v doesn't stay on expected state, will check again", coName)
return false, nil
})
}

// isCOAvailableState returns true when the ClusterOperator coName reports Available true, Progressing false, and Degraded false.
func isCOAvailableState(oc *exutil.CLI, coName string) (bool, error) {
var (
clusterOperators []configv1.ClusterOperator
co configv1.ClusterOperator
isAvailable bool
)
clusterOperatorsList, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().List(context.Background(), metav1.ListOptions{})
if err != nil {
e2e.Logf("fail to get cluster operator list with error")
return false, err
}
clusterOperators = clusterOperatorsList.Items
for _, clusterOperator := range clusterOperators {
if clusterOperator.Name == coName {
co = clusterOperator
e2e.Logf("co name is %v", co.Name)
e2e.Logf("co.status.conditions of %v is %v", co.Name, co.Status.Conditions)
break
}
}
if co.Name == "" {
return false, fmt.Errorf("cluster operator %s not found", coName)
}
available := findCondition(co.Status.Conditions, configv1.OperatorAvailable)
degraded := findCondition(co.Status.Conditions, configv1.OperatorDegraded)
progressing := findCondition(co.Status.Conditions, configv1.OperatorProgressing)
e2e.Logf("the status of co %v is available.Status [%v] degraded.Status [%v] and progressing.Status [%v]", coName, available.Status, degraded.Status, progressing.Status)
if available.Status == configv1.ConditionTrue &&
degraded.Status == configv1.ConditionFalse &&
progressing.Status == configv1.ConditionFalse {
isAvailable = true
}
e2e.Logf("co/%v status is %v", coName, co.Status)
return isAvailable, nil
}

// Check every 60 seconds whether the master/worker mcp is in the updated state, for a maximum of 10 minutes,

Review comment: Check the updated state of mcp master/worker every 60 seconds, for a maximum of 10 tries.

Author reply: Fixed in new code, thank you for your suggestion!

// just in case the mcp becomes updating during an OCP upgrade, a node reboot/scale-out, or another test case changing the mcp.
func waitForUpdatedMCP(mcps dynamic.NamespaceableResourceInterface, mcpName string) error {
return wait.Poll(1*time.Minute, 10*time.Minute, func() (bool, error) {
updated, updating := isPoolUpdated(mcps, mcpName)
if updated && !updating {
e2e.Logf("the status of mcp %v is updated state, not updating", mcpName)
return true, nil
}
e2e.Logf("the status of mcp %v is updating or degraded state, will check again")
e2e.Logf("the status of mcp is updated - [%v] updating - [%v]", updated, updating)
return false, nil
})
}
