Merge pull request #43751 from vvoland/fix-exitcode-wait
state/Wait: Fix race when reading exit status
thaJeztah committed Jul 29, 2022
2 parents 71cb54c + a290f5d commit f34567b
Showing 9 changed files with 255 additions and 73 deletions.
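The race being fixed: State.Wait used to block on a shared waitStop/waitRemove channel and only afterwards re-acquire the lock to read the exit code, so a restart policy could start the container again (resetting the exit code) before the waiter read it; that is the bug tracked in #39352 and exercised by the new TestCorrectStateWaitResultAfterRestart below. The fix snapshots the StateStatus under the lock and hands it to per-waiter buffered channels. Here is a minimal, self-contained sketch of that pattern; the names (state, status, setStopped) are illustrative, not the daemon's actual API.

package main

import (
	"fmt"
	"sync"
)

// status is a snapshot taken at the moment the container stopped.
type status struct{ exitCode int }

// state mirrors the shape of container.State after this commit: instead of
// a shared channel that waiters race to observe, each waiter registers a
// buffered channel and receives a snapshot taken under the lock.
type state struct {
	mu          sync.Mutex
	running     bool
	exitCode    int
	stopWaiters []chan<- status
}

// wait registers a waiter, or answers immediately if already stopped.
func (s *state) wait() <-chan status {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Buffered so the notifier never blocks, even if nobody receives.
	c := make(chan status, 1)
	if !s.running {
		c <- status{exitCode: s.exitCode}
		return c
	}
	s.stopWaiters = append(s.stopWaiters, c)
	return c
}

// setStopped delivers the snapshot while still holding the lock, so a
// subsequent restart cannot overwrite what the waiters observe.
func (s *state) setStopped(exitCode int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = false
	s.exitCode = exitCode
	for _, c := range s.stopWaiters {
		c <- status{exitCode: exitCode}
	}
	s.stopWaiters = nil
}

// setRunning models a restart: the exit code is reset, but snapshots that
// were already delivered are unaffected.
func (s *state) setRunning() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.running = true
	s.exitCode = 0
}

func main() {
	s := &state{running: true}
	done := s.wait()
	s.setStopped(10)
	s.setRunning()                 // immediate restart, as a restart policy would do
	fmt.Println((<-done).exitCode) // prints 10; the old code could print 0
}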
23 changes: 12 additions & 11 deletions container/container.go
@@ -81,17 +81,18 @@ type Container struct {
Driver string
OS string
// MountLabel contains the options for the 'mount' command
MountLabel string
ProcessLabel string
RestartCount int
HasBeenStartedBefore bool
HasBeenManuallyStopped bool // used for unless-stopped restart policy
MountPoints map[string]*volumemounts.MountPoint
HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
ExecCommands *exec.Store `json:"-"`
DependencyStore agentexec.DependencyGetter `json:"-"`
SecretReferences []*swarmtypes.SecretReference
ConfigReferences []*swarmtypes.ConfigReference
MountLabel string
ProcessLabel string
RestartCount int
HasBeenStartedBefore bool
HasBeenManuallyStopped bool // used for unless-stopped restart policy
HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
MountPoints map[string]*volumemounts.MountPoint
HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
ExecCommands *exec.Store `json:"-"`
DependencyStore agentexec.DependencyGetter `json:"-"`
SecretReferences []*swarmtypes.SecretReference
ConfigReferences []*swarmtypes.ConfigReference
// logDriver for closing
LogDriver logger.Logger `json:"-"`
LogCopier *logger.Copier `json:"-"`
97 changes: 51 additions & 46 deletions container/state.go
@@ -32,9 +32,10 @@ type State struct {
StartedAt time.Time
FinishedAt time.Time
Health *Health
Removed bool `json:"-"`

waitStop chan struct{}
waitRemove chan struct{}
stopWaiters []chan<- StateStatus
removeOnlyWaiters []chan<- StateStatus
}

// StateStatus is used to return container wait results.
@@ -57,12 +58,9 @@ func (s StateStatus) Err() error {
return s.err
}

// NewState creates a default state object with a fresh channel for state changes.
// NewState creates a default state object.
func NewState() *State {
return &State{
waitStop: make(chan struct{}),
waitRemove: make(chan struct{}),
}
return &State{}
}

// String returns a human-readable description of the state
@@ -182,11 +180,10 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
s.Lock()
defer s.Unlock()

if condition == WaitConditionNotRunning && !s.Running {
// Buffer so we can put it in the channel now.
resultC := make(chan StateStatus, 1)
// Buffer so we can put status and finish even if nobody receives it.
resultC := make(chan StateStatus, 1)

// Send the current status.
if s.conditionAlreadyMet(condition) {
resultC <- StateStatus{
exitCode: s.ExitCode(),
err: s.Err(),
@@ -195,19 +192,16 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
return resultC
}

// If we are waiting only for removal, the waitStop channel should
// remain nil and block forever.
var waitStop chan struct{}
if condition < WaitConditionRemoved {
waitStop = s.waitStop
}

// Always wait for removal, just in case the container gets removed
// while it is still in a "created" state, in which case it is never
// actually stopped.
waitRemove := s.waitRemove
waitC := make(chan StateStatus, 1)

resultC := make(chan StateStatus, 1)
// Removal wakes up both removeOnlyWaiters and stopWaiters.
// A container could be removed while still in the "created" state,
// in which case it is never actually stopped.
if condition == WaitConditionRemoved {
s.removeOnlyWaiters = append(s.removeOnlyWaiters, waitC)
} else {
s.stopWaiters = append(s.stopWaiters, waitC)
}

go func() {
select {
@@ -218,23 +212,25 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
err: ctx.Err(),
}
return
case <-waitStop:
case <-waitRemove:
}

s.Lock()
result := StateStatus{
exitCode: s.ExitCode(),
err: s.Err(),
case status := <-waitC:
resultC <- status
}
s.Unlock()

resultC <- result
}()

return resultC
}

func (s *State) conditionAlreadyMet(condition WaitCondition) bool {
switch condition {
case WaitConditionNotRunning:
return !s.Running
case WaitConditionRemoved:
return s.Removed
}

return false
}

// IsRunning returns whether the running flag is set. Used by Container to check whether a container is running.
func (s *State) IsRunning() bool {
s.Lock()
@@ -292,8 +288,8 @@ func (s *State) SetStopped(exitStatus *ExitStatus) {
}
s.ExitCodeValue = exitStatus.ExitCode
s.OOMKilled = exitStatus.OOMKilled
close(s.waitStop) // fire waiters for stop
s.waitStop = make(chan struct{})

s.notifyAndClear(&s.stopWaiters)
}

// SetRestarting sets the container state to "restarting" without locking.
@@ -308,8 +304,8 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) {
s.FinishedAt = time.Now().UTC()
s.ExitCodeValue = exitStatus.ExitCode
s.OOMKilled = exitStatus.OOMKilled
close(s.waitStop) // fire waiters for stop
s.waitStop = make(chan struct{})

s.notifyAndClear(&s.stopWaiters)
}

// SetError sets the container's error state. This is useful when we want to
@@ -374,22 +370,19 @@ func (s *State) IsDead() bool {
return res
}

// SetRemoved assumes this container is already in the "dead" state and
// closes the internal waitRemove channel to unblock callers waiting for a
// container to be removed.
// SetRemoved assumes this container is already in the "dead" state and notifies all waiters.
func (s *State) SetRemoved() {
s.SetRemovalError(nil)
}

// SetRemovalError is to be called in case a container remove failed.
// It sets an error and closes the internal waitRemove channel to unblock
// callers waiting for the container to be removed.
// It sets an error and notifies all waiters.
func (s *State) SetRemovalError(err error) {
s.SetError(err)
s.Lock()
close(s.waitRemove) // Unblock those waiting on remove.
// Recreate the channel so next ContainerWait will work
s.waitRemove = make(chan struct{})
s.Removed = true
s.notifyAndClear(&s.removeOnlyWaiters)
s.notifyAndClear(&s.stopWaiters)
s.Unlock()
}

@@ -400,3 +393,15 @@ func (s *State) Err() error {
}
return nil
}

func (s *State) notifyAndClear(waiters *[]chan<- StateStatus) {
result := StateStatus{
exitCode: s.ExitCodeValue,
err: s.Err(),
}

for _, c := range *waiters {
c <- result
}
*waiters = nil
}
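Why each waiter channel is buffered with capacity 1: the goroutine spawned by Wait may return early when its context is canceled, leaving no receiver, and the buffer lets notifyAndClear finish its sends without ever blocking SetStopped, SetRestarting, or SetRemovalError. A hedged caller-side sketch (it assumes the moby module on the import path and the exported StateStatus accessors; it is not a verbatim daemon excerpt):

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/docker/docker/container"
)

func main() {
	s := container.NewState()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Wait returns immediately; the channel yields exactly one StateStatus,
	// either the stop-time snapshot or, if the context fires first, a
	// status whose Err() is ctx.Err().
	statusC := s.Wait(ctx, container.WaitConditionNotRunning)

	// A fresh State is not running, so the condition is already met and
	// the snapshot was buffered before Wait even returned.
	status := <-statusC
	fmt.Println(status.ExitCode(), status.Err())
}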
25 changes: 25 additions & 0 deletions container/state_test.go
@@ -169,6 +169,31 @@ func TestStateTimeoutWait(t *testing.T) {
}
}

// Related issue: #39352
func TestCorrectStateWaitResultAfterRestart(t *testing.T) {
s := NewState()

s.Lock()
s.SetRunning(0, true)
s.Unlock()

waitC := s.Wait(context.Background(), WaitConditionNotRunning)
want := ExitStatus{ExitCode: 10, ExitedAt: time.Now()}

s.Lock()
s.SetRestarting(&want)
s.Unlock()

s.Lock()
s.SetRunning(0, true)
s.Unlock()

got := <-waitC
if got.exitCode != want.ExitCode {
t.Fatalf("expected exit code %v, got %v", want.ExitCode, got.exitCode)
}
}

func TestIsValidStateString(t *testing.T) {
states := []struct {
state string
28 changes: 24 additions & 4 deletions daemon/monitor.go
@@ -52,7 +52,20 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine
}
}

restart, wait, err := c.RestartManager().ShouldRestart(ec, daemon.IsShuttingDown() || c.HasBeenManuallyStopped, time.Since(c.StartedAt))
daemonShutdown := daemon.IsShuttingDown()
execDuration := time.Since(c.StartedAt)
restart, wait, err := c.RestartManager().ShouldRestart(ec, daemonShutdown || c.HasBeenManuallyStopped, execDuration)
if err != nil {
logrus.WithError(err).
WithField("container", c.ID).
WithField("restartCount", c.RestartCount).
WithField("exitStatus", exitStatus).
WithField("daemonShuttingDown", daemonShutdown).
WithField("hasBeenManuallyStopped", c.HasBeenManuallyStopped).
WithField("execDuration", execDuration).
Warn("ShouldRestart failed, container will not be restarted")
restart = false
}

// cancel healthcheck here, they will be automatically
// restarted if/when the container is started again
@@ -62,12 +75,19 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine
}
daemon.Cleanup(c)

if err == nil && restart {
if restart {
c.RestartCount++
logrus.WithField("container", c.ID).
WithField("restartCount", c.RestartCount).
WithField("exitStatus", exitStatus).
WithField("manualRestart", c.HasBeenManuallyRestarted).
Debug("Restarting container")
c.SetRestarting(&exitStatus)
} else {
c.SetStopped(&exitStatus)
defer daemon.autoRemove(c)
if !c.HasBeenManuallyRestarted {
defer daemon.autoRemove(c)
}
}
defer c.Unlock() // needs to be called before autoRemove

@@ -76,7 +96,7 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine

daemon.LogContainerEventWithAttributes(c, "die", attributes)

if err == nil && restart {
if restart {
go func() {
err := <-wait
if err == nil {
15 changes: 3 additions & 12 deletions daemon/restart.go
@@ -6,7 +6,6 @@ import (

containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/container"
"github.com/sirupsen/logrus"
)

// ContainerRestart stops and starts a container. It attempts to
@@ -52,19 +51,11 @@ func (daemon *Daemon) containerRestart(ctx context.Context, container *container
}

if container.IsRunning() {
// set AutoRemove flag to false before stop so the container won't be
// removed during restart process
autoRemove := container.HostConfig.AutoRemove
container.Lock()
container.HasBeenManuallyRestarted = true
container.Unlock()

container.HostConfig.AutoRemove = false
err := daemon.containerStop(ctx, container, options)
// restore AutoRemove irrespective of whether the stop worked or not
container.HostConfig.AutoRemove = autoRemove
// containerStop will write HostConfig to disk, we shall restore AutoRemove
// in disk too
if toDiskErr := daemon.checkpointAndSave(container); toDiskErr != nil {
logrus.Errorf("Write container to disk error: %v", toDiskErr)
}

if err != nil {
return err
1 change: 1 addition & 0 deletions daemon/start.go
@@ -206,6 +206,7 @@ func (daemon *Daemon) containerStart(container *container.Container, checkpoint
return translateContainerdStartErr(container.Path, container.SetExitCode, err)
}

container.HasBeenManuallyRestarted = false
container.SetRunning(pid, true)
container.HasBeenStartedBefore = true
daemon.setStateCounter(container)
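Taken together, the daemon/monitor.go, daemon/restart.go, and daemon/start.go hunks above replace the old trick of temporarily flipping HostConfig.AutoRemove (and writing the host config to disk twice) with a flag handshake: restart sets HasBeenManuallyRestarted under the container lock, the exit handler skips auto-removal while the flag is set, and the next successful start clears it. A toy model of that handshake; ctr, handleExit, and the other names are hypothetical stand-ins, not daemon types:

package main

import "fmt"

// ctr stands in for container.Container; only the two fields relevant to
// the handshake are modeled.
type ctr struct {
	autoRemove               bool // set by --rm
	hasBeenManuallyRestarted bool
}

// handleExit mirrors the daemon/monitor.go change: auto-removal is skipped
// when the exit was caused by a manual restart.
func (c *ctr) handleExit() {
	if c.autoRemove && !c.hasBeenManuallyRestarted {
		fmt.Println("auto-removing container")
		return
	}
	fmt.Println("keeping container")
}

// start mirrors daemon/start.go: the flag is cleared once the container is
// running again, so a later plain stop still honors --rm.
func (c *ctr) start() {
	c.hasBeenManuallyRestarted = false
	fmt.Println("container running")
}

// restart mirrors daemon/restart.go: mark the stop as manual, stop, start.
func (c *ctr) restart() {
	c.hasBeenManuallyRestarted = true
	c.handleExit() // the stop path: container kept despite --rm
	c.start()
}

func main() {
	c := &ctr{autoRemove: true}
	c.restart()    // keeping container; container running
	c.handleExit() // a plain stop afterwards: auto-removing container
}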
58 changes: 58 additions & 0 deletions integration/container/restart_test.go
@@ -9,6 +9,7 @@ import (
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/client"
testContainer "github.com/docker/docker/integration/internal/container"
"github.com/docker/docker/testutil/daemon"
"gotest.tools/v3/assert"
"gotest.tools/v3/poll"
@@ -153,3 +154,60 @@ func pollForNewHealthCheck(ctx context.Context, client *client.Client, startTime
return poll.Continue("waiting for a new container healthcheck")
}
}

// A container started with --rm should be restartable;
// it should be removed only when killed or stopped.
func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) {
defer setupTest(t)()
cli := testEnv.APIClient()
ctx := context.Background()

noWaitTimeout := 0

for _, tc := range []struct {
desc string
doSth func(ctx context.Context, containerID string) error
}{
{
desc: "kill",
doSth: func(ctx context.Context, containerID string) error {
return cli.ContainerKill(ctx, containerID, "SIGKILL")
},
},
{
desc: "stop",
doSth: func(ctx context.Context, containerID string) error {
return cli.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &noWaitTimeout})
},
},
} {
tc := tc
t.Run(tc.desc, func(t *testing.T) {
cID := testContainer.Run(ctx, t, cli,
testContainer.WithName("autoremove-restart-and-"+tc.desc),
testContainer.WithAutoRemove,
)
defer func() {
err := cli.ContainerRemove(ctx, cID, types.ContainerRemoveOptions{Force: true})
if t.Failed() && err != nil {
t.Logf("Cleaning up test container failed with error: %v", err)
}
}()

err := cli.ContainerRestart(ctx, cID, container.StopOptions{Timeout: &noWaitTimeout})
assert.NilError(t, err)

inspect, err := cli.ContainerInspect(ctx, cID)
assert.NilError(t, err)
assert.Assert(t, inspect.State.Status != "removing", "Container should not be removing yet")

poll.WaitOn(t, testContainer.IsInState(ctx, cli, cID, "running"))

err = tc.doSth(ctx, cID)
assert.NilError(t, err)

poll.WaitOn(t, testContainer.IsRemoved(ctx, cli, cID))
})
}

}