Skip to content

Commit

Permalink
Mark operator as degraded if any pods are in CrashLoopBackOff state
Browse files Browse the repository at this point in the history
This patch checks for the statuses of pods of those deployments and daemonsets
which are in a "hung" state. If any of the pods are in CrashLoopBackOff state
the operator will be marked as degraded.

Signed-off-by: Surya Seetharaman <suryaseetharaman.9@gmail.com>
  • Loading branch information
tssurya committed Jun 22, 2020
1 parent 2a56600 commit 61d2fb1
Show file tree
Hide file tree
Showing 2 changed files with 295 additions and 2 deletions.
30 changes: 30 additions & 0 deletions pkg/controller/statusmanager/pod_status.go
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/openshift/cluster-network-operator/pkg/names"

appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -91,6 +92,10 @@ func (status *StatusManager) SetFromPods() {
} else if ds.Status.NumberUnavailable > 0 {
progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
dsProgressing = true
// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
if !isNonCritical(ds) {
hung = append(hung, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...)
}
} else if ds.Status.NumberAvailable == 0 { // NOTE: update this if we ever expect empty (unscheduled) daemonsets ~cdc
progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not yet scheduled on any nodes", dsName.String()))
dsProgressing = true
Expand Down Expand Up @@ -138,6 +143,10 @@ func (status *StatusManager) SetFromPods() {
if dep.Status.UnavailableReplicas > 0 {
progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
depProgressing = true
// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
if !isNonCritical(dep) {
hung = append(hung, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...)
}
} else if dep.Status.AvailableReplicas == 0 {
progressing = append(progressing, fmt.Sprintf("Deployment %q is not yet scheduled on any nodes", depName.String()))
depProgressing = true
Expand Down Expand Up @@ -290,6 +299,27 @@ func (status *StatusManager) setLastPodState(
})
}

// CheckCrashLoopBackOffPods lists the pods in dName's namespace that match the
// label selector lb and returns one human-readable message per pod that has at
// least one container waiting with reason "CrashLoopBackOff". name is the kind
// of the owning workload ("DaemonSet" or "Deployment") and is used only to
// build the messages. The check is best-effort: a List failure is logged and
// an empty slice is returned.
func (status *StatusManager) CheckCrashLoopBackOffPods(dName types.NamespacedName, lb map[string]string, name string) []string {
	hung := []string{}
	pods := &v1.PodList{}
	err := status.client.List(context.TODO(), pods, client.InNamespace(dName.Namespace), client.MatchingLabels(lb))
	if err != nil {
		log.Printf("Error getting pods from %s %q: %v", name, dName.String(), err)
		// Nothing to inspect on error; return early instead of iterating an
		// empty (or partially populated) list.
		return hung
	}
	for _, pod := range pods.Items {
		for _, container := range pod.Status.ContainerStatuses {
			if container.State.Waiting != nil && container.State.Waiting.Reason == "CrashLoopBackOff" {
				hung = append(hung, fmt.Sprintf("%s %q rollout is not making progress - pod %s is in CrashLoopBackOff State", name, dName.String(), pod.Name))
				// One crashing container is enough to flag this pod; move on
				// to the next pod.
				break
			}
		}
	}
	return hung
}

func isNonCritical(obj metav1.Object) bool {
_, exists := obj.GetAnnotations()[names.NonCriticalAnnotation]
return exists
Expand Down
267 changes: 265 additions & 2 deletions pkg/controller/statusmanager/status_manager_test.go
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"

appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -337,12 +338,26 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) {
}

// Create minimal DaemonSets
dsA := &appsv1.DaemonSet{ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1}}
dsA := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "alpha"},
},
},
}
err = client.Create(context.TODO(), dsA)
if err != nil {
t.Fatalf("error creating DaemonSet: %v", err)
}
dsB := &appsv1.DaemonSet{ObjectMeta: metav1.ObjectMeta{Namespace: "two", Name: "beta", Generation: 1}}
dsB := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{Namespace: "two", Name: "beta", Generation: 1},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "beta"},
},
},
}
err = client.Create(context.TODO(), dsB)
if err != nil {
t.Fatalf("error creating DaemonSet: %v", err)
Expand Down Expand Up @@ -915,6 +930,11 @@ func TestStatusManagerSetFromDeployments(t *testing.T) {
Status: appsv1.DeploymentStatus{
UnavailableReplicas: 1,
},
Spec: appsv1.DeploymentSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "gamma"},
},
},
}
err = client.Create(context.TODO(), depB)
if err != nil {
Expand Down Expand Up @@ -1188,3 +1208,246 @@ func setLastPodState(t *testing.T, client client.Client, name string, ps podStat
t.Fatal(err)
}
}

// TestStatusManagerCheckCrashLoopBackOffPods covers four scenarios:
//  1. a DaemonSet pod in CrashLoopBackOff is reported as hung,
//  2. a healthy (Running) DaemonSet pod is not reported,
//  3. a crashing pod of a DaemonSet carrying names.NonCriticalAnnotation
//     leaves the operator Progressing but NOT Degraded,
//  4. a crashing pod of a critical Deployment marks the operator Degraded
//     with reason RolloutHung.
func TestStatusManagerCheckCrashLoopBackOffPods(t *testing.T) {
	client := fake.NewFakeClient()
	mapper := &fakeRESTMapper{}
	status := New(client, mapper, "testing", "1.2.3")
	status.SetDaemonSets([]types.NamespacedName{
		{Namespace: "one", Name: "alpha"},
		{Namespace: "two", Name: "beta"},
	})

	// DaemonSet "one/alpha" — its pods are selected via the app=alpha label.
	dsA := &appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "one",
			Name:      "alpha",
		},
		Spec: appsv1.DaemonSetSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "alpha"},
			},
		},
	}
	err := client.Create(context.TODO(), dsA)
	if err != nil {
		t.Fatalf("error creating DaemonSet: %v", err)
	}

	// DaemonSet "two/beta" — its pods are selected via the app=beta label.
	dsB := &appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "two",
			Name:      "beta",
		},
		Spec: appsv1.DaemonSetSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "beta"},
			},
		},
	}
	err = client.Create(context.TODO(), dsB)
	if err != nil {
		t.Fatalf("error creating DaemonSet: %v", err)
	}

	// podA belongs to dsA and has a container stuck in CrashLoopBackOff —
	// it should be reported as hung.
	podA := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "one",
			Name:      "alpha-x0x0",
			Labels:    map[string]string{"app": "alpha"},
		},
		Status: v1.PodStatus{
			ContainerStatuses: []v1.ContainerStatus{{
				Name: "ubuntu",
				State: v1.ContainerState{
					Waiting: &v1.ContainerStateWaiting{
						Reason: "CrashLoopBackOff",
					},
				},
			}},
		},
	}
	err = client.Create(context.TODO(), podA)
	if err != nil {
		t.Fatalf("error creating Pod: %v", err)
	}

	// podB belongs to dsB and is Running — it should NOT be reported.
	podB := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "two",
			Name:      "beta-x0x0",
			Labels:    map[string]string{"app": "beta"},
		},
		Status: v1.PodStatus{
			ContainerStatuses: []v1.ContainerStatus{{
				Name: "fedora",
				State: v1.ContainerState{
					Running: &v1.ContainerStateRunning{StartedAt: metav1.Time{}},
				},
			}},
		},
	}
	err = client.Create(context.TODO(), podB)
	if err != nil {
		t.Fatalf("error creating Pod: %v", err)
	}

	// Scenario 1: the crashing pod yields exactly one hung message.
	expected := []string{"DaemonSet \"one/alpha\" rollout is not making progress - pod alpha-x0x0 is in CrashLoopBackOff State"}
	hung := status.CheckCrashLoopBackOffPods(types.NamespacedName{Namespace: "one", Name: "alpha"}, map[string]string{"app": "alpha"}, "DaemonSet")
	if !reflect.DeepEqual(hung, expected) {
		t.Fatalf("unexpected value in hung %v", hung)
	}

	// Scenario 2: the healthy pod yields an empty (non-nil) slice, which is
	// what CheckCrashLoopBackOffPods returns when nothing is crashing.
	expected = []string{}
	hung = status.CheckCrashLoopBackOffPods(types.NamespacedName{Namespace: "two", Name: "beta"}, map[string]string{"app": "beta"}, "DaemonSet")
	if !reflect.DeepEqual(hung, expected) {
		t.Fatalf("unexpected value in hung %v", hung)
	}

	// Test non-critical DaemonSets - the operator should not be marked as degraded.
	status.SetDaemonSets([]types.NamespacedName{
		{Namespace: "four", Name: "non-critical"},
	})

	// dsNC is unavailable AND annotated non-critical, so its crashing pod
	// must not degrade the operator (only keep it Progressing).
	dsNC := &appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "four",
			Name:      "non-critical",
			Annotations: map[string]string{
				names.NonCriticalAnnotation: "",
			},
		},
		Status: appsv1.DaemonSetStatus{
			NumberUnavailable: 1,
		},
		Spec: appsv1.DaemonSetSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "non-critical"},
			},
		},
	}
	err = client.Create(context.TODO(), dsNC)
	if err != nil {
		t.Fatalf("error creating DaemonSet: %v", err)
	}

	// Crashing pod of the non-critical DaemonSet.
	podnC := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "four",
			Name:      "nC-x0x0",
			Labels:    map[string]string{"app": "non-critical"},
		},
		Status: v1.PodStatus{
			ContainerStatuses: []v1.ContainerStatus{{
				Name: "ubuntu",
				State: v1.ContainerState{
					Waiting: &v1.ContainerStateWaiting{
						Reason: "CrashLoopBackOff",
					},
				},
			},
		}},
	}
	err = client.Create(context.TODO(), podnC)
	if err != nil {
		t.Fatalf("error creating Pod: %v", err)
	}

	status.SetFromPods()

	// Scenario 3: Degraded stays False despite the crash, because the only
	// unavailable DaemonSet is annotated non-critical.
	co, err := getCO(client, "testing")
	if err != nil {
		t.Fatalf("error getting ClusterOperator: %v", err)
	}
	if !conditionsInclude(co.Status.Conditions, []configv1.ClusterOperatorStatusCondition{
		{
			Type:   configv1.OperatorDegraded,
			Status: configv1.ConditionFalse,
		},
		{
			Type:   configv1.OperatorProgressing,
			Status: configv1.ConditionTrue,
		},
		{
			Type:   configv1.OperatorUpgradeable,
			Status: configv1.ConditionTrue,
		},
		{
			Type:   configv1.OperatorAvailable,
			Status: configv1.ConditionTrue,
		},
	}) {
		t.Fatalf("unexpected Status.Conditions: %#v", co.Status.Conditions)
	}

	status.SetDeployments([]types.NamespacedName{
		{Namespace: "three", Name: "gamma"},
	})

	// Critical (unannotated) Deployment with one unavailable replica.
	dep := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "three",
			Name:      "gamma",
		},
		Spec: appsv1.DeploymentSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "gamma"},
			},
		},
		Status: appsv1.DeploymentStatus{
			UnavailableReplicas: 1,
		},
	}
	err = client.Create(context.TODO(), dep)
	if err != nil {
		t.Fatalf("error creating Deployment: %v", err)
	}

	// Crashing pod of the critical Deployment.
	podC := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "three",
			Name:      "gamma-x0x0",
			Labels:    map[string]string{"app": "gamma"},
		},
		Status: v1.PodStatus{
			ContainerStatuses: []v1.ContainerStatus{{
				Name: "fedora",
				State: v1.ContainerState{
					Waiting: &v1.ContainerStateWaiting{
						Reason: "CrashLoopBackOff",
					},
				},
			},
		}},
	}
	err = client.Create(context.TODO(), podC)
	if err != nil {
		t.Fatalf("error creating Pod: %v", err)
	}

	status.SetFromPods()
	co, err = getCO(client, "testing")
	if err != nil {
		t.Fatalf("error getting ClusterOperator: %v", err)
	}

	// Scenario 4: the crashing pod of a critical Deployment degrades the
	// operator with reason RolloutHung and the CrashLoopBackOff message.
	if !conditionsInclude(co.Status.Conditions, []configv1.ClusterOperatorStatusCondition{
		{
			Type:    configv1.OperatorDegraded,
			Status:  configv1.ConditionTrue,
			Reason:  "RolloutHung",
			Message: "Deployment \"three/gamma\" rollout is not making progress - pod gamma-x0x0 is in CrashLoopBackOff State",
		},
		{
			Type:   configv1.OperatorProgressing,
			Status: configv1.ConditionTrue,
			Reason: "Deploying",
		},
		{
			Type:   configv1.OperatorAvailable,
			Status: configv1.ConditionTrue,
		},
	}) {
		t.Fatalf("unexpected Status.Conditions: %#v", co.Status.Conditions)
	}
}

0 comments on commit 61d2fb1

Please sign in to comment.