New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor libcontainerd to minimize containerd RPCs #43564
Changes from all commits
b752462
57d2d6e
4bafaa0
6a2f385
18e322b
15b8e4a
a09f8db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,20 @@ | ||
package exec // import "github.com/docker/docker/daemon/exec" | ||
package container // import "github.com/docker/docker/container" | ||
|
||
import ( | ||
"context" | ||
"runtime" | ||
"sync" | ||
|
||
"github.com/containerd/containerd/cio" | ||
"github.com/docker/docker/container/stream" | ||
"github.com/docker/docker/libcontainerd/types" | ||
"github.com/docker/docker/pkg/stringid" | ||
"github.com/sirupsen/logrus" | ||
) | ||
|
||
// Config holds the configurations for execs. The Daemon keeps | ||
// ExecConfig holds the configurations for execs. The Daemon keeps | ||
// track of both running and finished execs so that they can be | ||
// examined both during and after completion. | ||
type Config struct { | ||
type ExecConfig struct { | ||
sync.Mutex | ||
Started chan struct{} | ||
StreamConfig *stream.Config | ||
|
@@ -25,7 +25,7 @@ type Config struct { | |
OpenStderr bool | ||
OpenStdout bool | ||
CanRemove bool | ||
ContainerID string | ||
Container *Container | ||
DetachKeys []byte | ||
Entrypoint string | ||
Args []string | ||
|
@@ -34,39 +34,22 @@ type Config struct { | |
User string | ||
WorkingDir string | ||
Env []string | ||
Pid int | ||
Process types.Process | ||
ConsoleSize *[2]uint | ||
} | ||
|
||
// NewConfig initializes the a new exec configuration | ||
func NewConfig() *Config { | ||
return &Config{ | ||
// NewExecConfig initializes the a new exec configuration | ||
func NewExecConfig(c *Container) *ExecConfig { | ||
return &ExecConfig{ | ||
ID: stringid.GenerateRandomID(), | ||
Container: c, | ||
StreamConfig: stream.NewConfig(), | ||
Started: make(chan struct{}), | ||
} | ||
} | ||
|
||
type rio struct { | ||
cio.IO | ||
|
||
sc *stream.Config | ||
} | ||
|
||
func (i *rio) Close() error { | ||
i.IO.Close() | ||
|
||
return i.sc.CloseStreams() | ||
} | ||
|
||
func (i *rio) Wait() { | ||
i.sc.Wait(context.Background()) | ||
|
||
i.IO.Wait() | ||
} | ||
|
||
// InitializeStdio is called by libcontainerd to connect the stdio. | ||
func (c *Config) InitializeStdio(iop *cio.DirectIO) (cio.IO, error) { | ||
func (c *ExecConfig) InitializeStdio(iop *cio.DirectIO) (cio.IO, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Slightly wondering what the motivation was to move this into the (Also looking commit-by-commit, so perhaps there's more in the next commit(s) 😅 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had to do it to avoid a circular import between the |
||
c.StreamConfig.CopyToPipe(iop) | ||
|
||
if c.StreamConfig.Stdin() == nil && !c.Tty && runtime.GOOS == "windows" { | ||
|
@@ -81,68 +64,68 @@ func (c *Config) InitializeStdio(iop *cio.DirectIO) (cio.IO, error) { | |
} | ||
|
||
// CloseStreams closes the stdio streams for the exec | ||
func (c *Config) CloseStreams() error { | ||
func (c *ExecConfig) CloseStreams() error { | ||
return c.StreamConfig.CloseStreams() | ||
} | ||
|
||
// SetExitCode sets the exec config's exit code | ||
func (c *Config) SetExitCode(code int) { | ||
func (c *ExecConfig) SetExitCode(code int) { | ||
c.ExitCode = &code | ||
} | ||
|
||
// Store keeps track of the exec configurations. | ||
type Store struct { | ||
byID map[string]*Config | ||
sync.RWMutex | ||
// ExecStore keeps track of the exec configurations. | ||
type ExecStore struct { | ||
byID map[string]*ExecConfig | ||
mu sync.RWMutex | ||
} | ||
|
||
// NewStore initializes a new exec store. | ||
func NewStore() *Store { | ||
return &Store{ | ||
byID: make(map[string]*Config), | ||
// NewExecStore initializes a new exec store. | ||
func NewExecStore() *ExecStore { | ||
return &ExecStore{ | ||
byID: make(map[string]*ExecConfig), | ||
} | ||
} | ||
|
||
// Commands returns the exec configurations in the store. | ||
func (e *Store) Commands() map[string]*Config { | ||
e.RLock() | ||
byID := make(map[string]*Config, len(e.byID)) | ||
func (e *ExecStore) Commands() map[string]*ExecConfig { | ||
e.mu.RLock() | ||
byID := make(map[string]*ExecConfig, len(e.byID)) | ||
for id, config := range e.byID { | ||
byID[id] = config | ||
} | ||
e.RUnlock() | ||
e.mu.RUnlock() | ||
return byID | ||
} | ||
|
||
// Add adds a new exec configuration to the store. | ||
func (e *Store) Add(id string, Config *Config) { | ||
e.Lock() | ||
func (e *ExecStore) Add(id string, Config *ExecConfig) { | ||
e.mu.Lock() | ||
e.byID[id] = Config | ||
e.Unlock() | ||
e.mu.Unlock() | ||
} | ||
|
||
// Get returns an exec configuration by its id. | ||
func (e *Store) Get(id string) *Config { | ||
e.RLock() | ||
func (e *ExecStore) Get(id string) *ExecConfig { | ||
e.mu.RLock() | ||
res := e.byID[id] | ||
e.RUnlock() | ||
e.mu.RUnlock() | ||
return res | ||
} | ||
|
||
// Delete removes an exec configuration from the store. | ||
func (e *Store) Delete(id string, pid int) { | ||
e.Lock() | ||
func (e *ExecStore) Delete(id string) { | ||
e.mu.Lock() | ||
delete(e.byID, id) | ||
e.Unlock() | ||
e.mu.Unlock() | ||
} | ||
|
||
// List returns the list of exec ids in the store. | ||
func (e *Store) List() []string { | ||
func (e *ExecStore) List() []string { | ||
var IDs []string | ||
e.RLock() | ||
e.mu.RLock() | ||
for id := range e.byID { | ||
IDs = append(IDs, id) | ||
} | ||
e.RUnlock() | ||
e.mu.RUnlock() | ||
return IDs | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ import ( | |
"time" | ||
|
||
"github.com/docker/docker/api/types" | ||
libcontainerdtypes "github.com/docker/docker/libcontainerd/types" | ||
units "github.com/docker/go-units" | ||
) | ||
|
||
|
@@ -36,6 +37,14 @@ type State struct { | |
|
||
stopWaiters []chan<- StateStatus | ||
removeOnlyWaiters []chan<- StateStatus | ||
|
||
// The libcontainerd reference fields are unexported to force consumers | ||
// to access them through the getter methods with multi-valued returns | ||
// so that they can't forget to nil-check: the code won't compile unless | ||
// the nil-check result is explicitly consumed or discarded. | ||
|
||
ctr libcontainerdtypes.Container | ||
task libcontainerdtypes.Task | ||
} | ||
|
||
// StateStatus is used to return container wait results. | ||
|
@@ -260,7 +269,7 @@ func (s *State) SetExitCode(ec int) { | |
} | ||
|
||
// SetRunning sets the state of the container to "running". | ||
func (s *State) SetRunning(pid int, initial bool) { | ||
func (s *State) SetRunning(ctr libcontainerdtypes.Container, tsk libcontainerdtypes.Task, initial bool) { | ||
s.ErrorMsg = "" | ||
s.Paused = false | ||
s.Running = true | ||
|
@@ -269,7 +278,14 @@ func (s *State) SetRunning(pid int, initial bool) { | |
s.Paused = false | ||
} | ||
s.ExitCodeValue = 0 | ||
s.Pid = pid | ||
s.ctr = ctr | ||
s.task = tsk | ||
if tsk != nil { | ||
s.Pid = int(tsk.Pid()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wondering if we should consider changing our |
||
} else { | ||
s.Pid = 0 | ||
} | ||
s.OOMKilled = false | ||
if initial { | ||
s.StartedAt = time.Now().UTC() | ||
} | ||
|
@@ -287,7 +303,6 @@ func (s *State) SetStopped(exitStatus *ExitStatus) { | |
s.FinishedAt = exitStatus.ExitedAt | ||
} | ||
s.ExitCodeValue = exitStatus.ExitCode | ||
s.OOMKilled = exitStatus.OOMKilled | ||
|
||
s.notifyAndClear(&s.stopWaiters) | ||
} | ||
|
@@ -303,7 +318,6 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) { | |
s.Pid = 0 | ||
s.FinishedAt = time.Now().UTC() | ||
s.ExitCodeValue = exitStatus.ExitCode | ||
s.OOMKilled = exitStatus.OOMKilled | ||
|
||
s.notifyAndClear(&s.stopWaiters) | ||
} | ||
|
@@ -405,3 +419,21 @@ func (s *State) notifyAndClear(waiters *[]chan<- StateStatus) { | |
} | ||
*waiters = nil | ||
} | ||
|
||
// C8dContainer returns a reference to the libcontainerd Container object for | ||
// the container and whether the reference is valid. | ||
// | ||
// The container lock must be held when calling this method. | ||
func (s *State) C8dContainer() (_ libcontainerdtypes.Container, ok bool) { | ||
return s.ctr, s.ctr != nil | ||
} | ||
|
||
// Task returns a reference to the libcontainerd Task object for the container | ||
// and whether the reference is valid. | ||
// | ||
// The container lock must be held when calling this method. | ||
// | ||
// See also: (*Container).GetRunningTask(). | ||
func (s *State) Task() (_ libcontainerdtypes.Task, ok bool) { | ||
return s.task, s.task != nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TBH, I wasn't fully aware we also set this field if an exec or "any other process in the namespace" was OOM killed.
IIRC (but I'd have to dig history) the intent of this field was to learn "why did the container exit?" (which could be due to OOM), so (implicitly) this field was meant as indicator for the container's main process being OOM killed.
So, this change means a couple of things;
the OOMKilled field is true (but the OOM kill wasn't the cause of the container ultimately exiting).

The above is neither "good", nor "bad", but "different". The only thing I'm wondering is; would there be reasons (would there be value) in being able to differentiate those? Thinking along the lines of;

  * should OOMKilled be reserved for the container's main process? (devil's advocate; the exit event itself may not be directly caused by OOM for the main process, but as a result of other processes)
  * should we deprecate OOMKilled (in favor of a counter?) or, on exit, set OOMKilled = len(OOMKilledCounter) > 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OOMKilled always meant "any process" was killed. I have only changed when the flag is updated, and corrected the documentation.
Unfortunately, we can't reserve OOMKilled for the container's pid 1 because cgroups only keeps track of how many times the OOM killer has been invoked on processes in a group, not which processes in the group got killed. And even if we could, things could get really confusing with `docker run --init` and containers which use a fit-for-purpose init (e.g. `s6-svscan`) as pid 1. If we had the ability to know which processes got OOM-killed, I think it would be so much more useful to surface that information as discrete events associated with the container, with timestamps, PIDs and `/proc/<pid>/cmdline`s.

Even without detailed information, I think surfacing OOMs as timestamped events would be far superior to a counter, as it would allow the fatal OOM-kills to be differentiated from the non-fatal ones, even after the container exits. (Heuristic: how much time elapsed between the OOM-kill and the container stopping.) A close runner-up would be surfacing a counter along with the timestamp of the most recent OOM-kill event received by the daemon.

If only `memory.oom.group` was enabled in containers. (AFAICT runC does not enable it.) That would clear up any ambiguity quite nicely.