Skip to content

Commit

Permalink
Merge pull request #1176 from harche/bug_2040533
Browse files Browse the repository at this point in the history
Bug 2040533: UPSTREAM: <drop>: Ignore container not-found error in GetPodStatus
  • Loading branch information
openshift-merge-robot committed Feb 16, 2022
2 parents f14faf2 + 7447f79 commit 476a929
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 11 deletions.
26 changes: 18 additions & 8 deletions pkg/kubelet/kuberuntime/kuberuntime_container.go
Expand Up @@ -34,6 +34,8 @@ import (
"sync"
"time"

crierror "k8s.io/cri-api/pkg/errors"

grpcstatus "google.golang.org/grpc/status"

"github.com/armon/circbuf"
Expand Down Expand Up @@ -502,10 +504,17 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n
return nil, err
}

statuses := make([]*kubecontainer.Status, len(containers))
statuses := []*kubecontainer.Status{}
// TODO: optimization: set maximum number of containers per container name to examine.
for i, c := range containers {
for _, c := range containers {
status, err := m.runtimeService.ContainerStatus(c.Id)
// Between List (ListContainers) and check (ContainerStatus) another thread might remove a container, and that is normal.
// The previous call (ListContainers) never fails due to a pod container not existing.
// Therefore, this method should not either, but instead act as if the previous call failed,
// which means the error should be ignored.
if crierror.IsNotFound(err) {
continue
}
if err != nil {
// Merely log this here; GetPodStatus will actually report the error out.
klog.V(4).InfoS("ContainerStatus return error", "containerID", c.Id, "err", err)
Expand Down Expand Up @@ -538,7 +547,7 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n
cStatus.Message += tMessage
}
}
statuses[i] = cStatus
statuses = append(statuses, cStatus)
}

sort.Sort(containerStatusByCreated(statuses))
Expand Down Expand Up @@ -721,15 +730,16 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec
"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)

err := m.runtimeService.StopContainer(containerID.ID, gracePeriod)
if err != nil {
if err != nil && !crierror.IsNotFound(err) {
klog.ErrorS(err, "Container termination failed with gracePeriod", "pod", klog.KObj(pod), "podUID", pod.UID,
"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
} else {
klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID,
"containerName", containerName, "containerID", containerID.String())
return err
}

return err
klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID,
"containerName", containerName, "containerID", containerID.String())

return nil
}

// killContainersWithSyncResult kills all pod's containers with sync results.
Expand Down
14 changes: 11 additions & 3 deletions pkg/kubelet/kuberuntime/kuberuntime_manager.go
Expand Up @@ -25,6 +25,7 @@ import (
"time"

cadvisorapi "github.com/google/cadvisor/info/v1"
crierror "k8s.io/cri-api/pkg/errors"
"k8s.io/klog/v2"

v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -1007,7 +1008,7 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo
result.AddSyncResult(killSandboxResult)
// Stop all sandboxes belongs to same pod
for _, podSandbox := range runningPod.Sandboxes {
if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil {
if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil && !crierror.IsNotFound(err) {
killSandboxResult.Fail(kubecontainer.ErrKillPodSandbox, err.Error())
klog.ErrorS(nil, "Failed to stop sandbox", "podSandboxID", podSandbox.ID)
}
Expand Down Expand Up @@ -1049,15 +1050,22 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp

klog.V(4).InfoS("getSandboxIDByPodUID got sandbox IDs for pod", "podSandboxID", podSandboxIDs, "pod", klog.KObj(pod))

sandboxStatuses := make([]*runtimeapi.PodSandboxStatus, len(podSandboxIDs))
sandboxStatuses := []*runtimeapi.PodSandboxStatus{}
podIPs := []string{}
for idx, podSandboxID := range podSandboxIDs {
podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
// Between List (getSandboxIDByPodUID) and check (PodSandboxStatus) another thread might remove a container, and that is normal.
// The previous call (getSandboxIDByPodUID) never fails due to a pod sandbox not existing.
// Therefore, this method should not either, but instead act as if the previous call failed,
// which means the error should be ignored.
if crierror.IsNotFound(err) {
continue
}
if err != nil {
klog.ErrorS(err, "PodSandboxStatus of sandbox for pod", "podSandboxID", podSandboxID, "pod", klog.KObj(pod))
return nil, err
}
sandboxStatuses[idx] = podSandboxStatus
sandboxStatuses = append(sandboxStatuses, podSandboxStatus)

// Only get pod IP from latest sandbox
if idx == 0 && podSandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
Expand Down
78 changes: 78 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
Expand Up @@ -23,6 +23,9 @@ import (
"testing"
"time"

"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

cadvisorapi "github.com/google/cadvisor/info/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -334,6 +337,81 @@ func TestGetPodStatus(t *testing.T) {
assert.Equal(t, apitest.FakePodSandboxIPs, podStatus.IPs)
}

func TestStopContainerWithNotFoundError(t *testing.T) {
fakeRuntime, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)

containers := []v1.Container{
{
Name: "foo1",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
},
{
Name: "foo2",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "foo",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: containers,
},
}

// Set fake sandbox and faked containers to fakeRuntime.
makeAndSetFakePod(t, m, fakeRuntime, pod)
fakeRuntime.InjectError("StopContainer", status.Error(codes.NotFound, "No such container"))
podStatus, err := m.GetPodStatus(pod.UID, pod.Name, pod.Namespace)
require.NoError(t, err)
p := kubecontainer.ConvertPodStatusToRunningPod("", podStatus)
gracePeriod := int64(1)
err = m.KillPod(pod, p, &gracePeriod)
require.NoError(t, err)
}
func TestGetPodStatusWithNotFoundError(t *testing.T) {
fakeRuntime, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)

containers := []v1.Container{
{
Name: "foo1",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
},
{
Name: "foo2",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "foo",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: containers,
},
}

// Set fake sandbox and faked containers to fakeRuntime.
makeAndSetFakePod(t, m, fakeRuntime, pod)
fakeRuntime.InjectError("ContainerStatus", status.Error(codes.NotFound, "No such container"))
podStatus, err := m.GetPodStatus(pod.UID, pod.Name, pod.Namespace)
require.NoError(t, err)
require.Equal(t, pod.UID, podStatus.ID)
require.Equal(t, pod.Name, podStatus.Name)
require.Equal(t, pod.Namespace, podStatus.Namespace)
require.Equal(t, apitest.FakePodSandboxIPs, podStatus.IPs)
}

func TestGetPods(t *testing.T) {
fakeRuntime, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)
Expand Down
1 change: 1 addition & 0 deletions vendor/modules.txt
Expand Up @@ -2224,6 +2224,7 @@ k8s.io/cri-api/pkg/apis
k8s.io/cri-api/pkg/apis/runtime/v1
k8s.io/cri-api/pkg/apis/runtime/v1alpha2
k8s.io/cri-api/pkg/apis/testing
k8s.io/cri-api/pkg/errors
# k8s.io/csi-translation-lib v0.0.0 => ./staging/src/k8s.io/csi-translation-lib
## explicit
k8s.io/csi-translation-lib
Expand Down

0 comments on commit 476a929

Please sign in to comment.