Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CDI devices under Linux #45134

Merged
merged 4 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/dockerd/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ func installCommonConfigFlags(conf *config.Config, flags *pflag.FlagSet) error {
flags.StringVar(&conf.HTTPSProxy, "https-proxy", "", "HTTPS proxy URL to use for outgoing traffic")
flags.StringVar(&conf.NoProxy, "no-proxy", "", "Comma-separated list of hosts or IP addresses for which the proxy is skipped")

flags.Var(opts.NewNamedListOptsRef("cdi-spec-dirs", &conf.CDISpecDirs, nil), "cdi-spec-dir", "CDI specification directories to use")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In future we could expose this information in docker info

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I have added an item to #45192 where I'm tracking the minor follow-ups.


// Deprecated flags / options

flags.BoolVarP(&conf.AutoRestart, "restart", "r", true, "--restart on the daemon has been deprecated in favor of --restart policies on docker run")
Expand Down
13 changes: 13 additions & 0 deletions cmd/dockerd/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"sync"
"time"

"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
containerddefaults "github.com/containerd/containerd/defaults"
"github.com/docker/docker/api"
apiserver "github.com/docker/docker/api/server"
Expand Down Expand Up @@ -241,6 +242,18 @@ func (cli *DaemonCli) start(opts *daemonOptions) (err error) {
return errors.Wrap(err, "failed to validate authorization plugin")
}

// Note that CDI is not inherrently linux-specific, there are some linux-specific assumptions / implementations in the code that
// queries the properties of device on the host as wel as performs the injection of device nodes and their access permissions into the OCI spec.
//
// In order to lift this restriction the following would have to be addressed:
// - Support needs to be added to the cdi package for injecting Windows devices: https://github.com/container-orchestrated-devices/container-device-interface/issues/28
// - The DeviceRequests API must be extended to non-linux platforms.
if runtime.GOOS == "linux" && cli.Config.Experimental {
daemon.RegisterCDIDriver(
cdi.WithSpecDirs(cli.Config.CDISpecDirs...),
)
}

cli.d = d

if err := startMetricsServer(cli.Config.MetricsAddress); err != nil {
Expand Down
87 changes: 87 additions & 0 deletions daemon/cdi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package daemon

import (
"fmt"

"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/docker/docker/errdefs"
"github.com/hashicorp/go-multierror"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)

type cdiHandler struct {
registry *cdi.Cache
}

// RegisterCDIDriver registers the CDI device driver.
// The driver injects CDI devices into an incoming OCI spec and is called for DeviceRequests associated with CDI devices.
func RegisterCDIDriver(opts ...cdi.Option) {
cache, err := cdi.NewCache(opts...)
if err != nil {
logrus.WithError(err).Error("CDI registry initialization failed")
// We create a spec updater that always returns an error.
// This error will be returned only when a CDI device is requested.
// This ensures that daemon startup is not blocked by a CDI registry initialization failure.
errorOnUpdateSpec := func(s *specs.Spec, dev *deviceInstance) error {
return fmt.Errorf("CDI device injection failed due to registry initialization failure: %w", err)
}
driver := &deviceDriver{
updateSpec: errorOnUpdateSpec,
}
registerDeviceDriver("cdi", driver)
return
}

// We construct a spec updates that injects CDI devices into the OCI spec using the initialized registry.
c := &cdiHandler{
registry: cache,
}

driver := &deviceDriver{
updateSpec: c.injectCDIDevices,
}

registerDeviceDriver("cdi", driver)
}

// injectCDIDevices injects a set of CDI devices into the specified OCI specification.
func (c *cdiHandler) injectCDIDevices(s *specs.Spec, dev *deviceInstance) error {
if dev.req.Count != 0 {
return errdefs.InvalidParameter(errors.New("unexpected count in CDI device request"))
}
if len(dev.req.Options) > 0 {
return errdefs.InvalidParameter(errors.New("unexpected options in CDI device request"))
}

cdiDeviceNames := dev.req.DeviceIDs
if len(cdiDeviceNames) == 0 {
return nil
}

_, err := c.registry.InjectDevices(s, cdiDeviceNames...)
if err != nil {
if rerrs := c.getErrors(); rerrs != nil {
// We log the errors that may have been generated while refreshing the CDI registry.
// These may be due to malformed specifications or device name conflicts that could be
// the cause of an injection failure.
logrus.WithError(rerrs).Warning("Refreshing the CDI registry generated errors")
}

return fmt.Errorf("CDI device injection failed: %w", err)
}

return nil
}

// getErrors returns a single error representation of errors that may have occurred while refreshing the CDI registry.
func (c *cdiHandler) getErrors() error {
errors := c.registry.GetErrors()

var err *multierror.Error
for _, errs := range errors {
err = multierror.Append(err, errs...)
}
return err.ErrorOrNil()
}
5 changes: 5 additions & 0 deletions daemon/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/containerd/containerd/runtime/v2/shim"
"github.com/docker/docker/opts"
"github.com/docker/docker/registry"
Expand Down Expand Up @@ -256,6 +257,9 @@ type CommonConfig struct {
ContainerdPluginNamespace string `json:"containerd-plugin-namespace,omitempty"`

DefaultRuntime string `json:"default-runtime,omitempty"`

// CDISpecDirs is a list of directories in which CDI specifications can be found.
CDISpecDirs []string `json:"cdi-spec-dirs,omitempty"`
}

// Proxies holds the proxies that are configured for the daemon.
Expand Down Expand Up @@ -295,6 +299,7 @@ func New() (*Config, error) {
ContainerdNamespace: DefaultContainersNamespace,
ContainerdPluginNamespace: DefaultPluginNamespace,
DefaultRuntime: StockRuntimeName,
CDISpecDirs: append([]string(nil), cdi.DefaultSpecDirs...),
},
}

Expand Down
11 changes: 11 additions & 0 deletions daemon/devices_linux.go → daemon/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@ func (daemon *Daemon) handleDevice(req container.DeviceRequest, spec *specs.Spec
}
}
} else if dd := deviceDrivers[req.Driver]; dd != nil {
// We add a special case for the CDI driver here as the cdi driver does
// not distinguish between capabilities.
// Furthermore, the "OR" and "AND" matching logic for the capability
// sets requires that a dummy capability be specified when constructing a
// DeviceRequest.
// This workaround can be removed once these device driver are
// refactored to be plugins, with each driver implementing its own
// matching logic, for example.
if req.Driver == "cdi" {
return dd.updateSpec(spec, &deviceInstance{req: req})
}
if selected := dd.capset.Match(req.Capabilities); selected != nil {
return dd.updateSpec(spec, &deviceInstance{req: req, selectedCaps: selected})
}
Expand Down
64 changes: 64 additions & 0 deletions integration/container/cdi_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package container // import "github.com/docker/docker/integration/container"

import (
"bytes"
"context"
"io"
"os"
"path/filepath"
"strings"
"testing"

"github.com/docker/docker/api/types"
containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/integration/internal/container"
"github.com/docker/docker/pkg/stdcopy"
"github.com/docker/docker/testutil/daemon"
"gotest.tools/v3/assert"
is "gotest.tools/v3/assert/cmp"
"gotest.tools/v3/skip"
)

func TestCreateWithCDIDevices(t *testing.T) {
skip.If(t, testEnv.OSType != "linux", "CDI devices are only supported on Linux")
skip.If(t, testEnv.IsRemoteDaemon, "cannot run cdi tests with a remote daemon")

cwd, err := os.Getwd()
assert.NilError(t, err)
d := daemon.New(t, daemon.WithExperimental())
d.StartWithBusybox(t, "--cdi-spec-dir="+filepath.Join(cwd, "testdata", "cdi"))
defer d.Stop(t)

client := d.NewClientT(t)

ctx := context.Background()
id := container.Run(ctx, t, client,
container.WithCmd("/bin/sh", "-c", "env"),
container.WithCDIDevices("vendor1.com/device=foo"),
)
defer client.ContainerRemove(ctx, id, types.ContainerRemoveOptions{Force: true})

inspect, err := client.ContainerInspect(ctx, id)
assert.NilError(t, err)

expectedRequests := []containertypes.DeviceRequest{
{
Driver: "cdi",
DeviceIDs: []string{"vendor1.com/device=foo"},
},
}
assert.Check(t, is.DeepEqual(inspect.HostConfig.DeviceRequests, expectedRequests))

reader, err := client.ContainerLogs(ctx, id, types.ContainerLogsOptions{
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@corhere I initially tried to implement something that inspects the container here, but did not see the expected environment variable. This is probably because the container config is being based on the unmodified spec, so if the expectation is that the config shows the new envvar, then we would need to do some kind of reconcilliation after making the modifications. How is this currently handled for other entities (e.g. devices or mounts that are requested?).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had been wondering about that. I noticed that when this was raised before (docker/cli#3864 (comment)) it seemed the preference was to indeed have the spec shown by inspect be updated to match what was actually run, and that the CDI library has some API to support that.

That said, it seems like this could be solved at the moby level by capturing the OCI spec after all the DeviceRequests are processed and returning that in docker inspect; that would also capture the --gpu spec mutations which I assume are equally invisible, as I don't see anything in daemon/nvidia_linux.go apart from the updateSpec hook implementation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, at present the --gpu spec mutations are also not visible to the inspect command -- although in this case the changes are limited to a single hook as setting two environment variables.

I agree that the goal should be to get these changes visible to daemon, but I would suggest tackling this in a follow-up if possible. Currently, I don't believe that any of the container engines that offer CDI support do this kind of reconciliation -- unliess they use the on-disk config.json file as the source of truth when querying a container.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The container's OCI spec is based on the container config, not the other way around. The container config is the source of truth. No reconciliation is performed for other entities. I really wish we'd thought to record the maintainers call which you joined where we hashed this all out, because I vaguely recall we concluded reporting the CDI device request as-is when inspecting the container, without back-propagating how it patches the OCI spec, would be acceptable as it is sufficient to flag that that the container's OCI spec has been patched by CDI and therefore any issues reported with that particular container can be assumed to be the responsibility of the CDI spec's vendor until proven otherwise. Reaching a final decision on this aspect of CDI integration is one of the gates for the feature to graduate from experimental @TBBle.

Copy link
Contributor

@TBBle TBBle Mar 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to capture the applied CDI spec into the container config or metadata in some way? One of the advantages of CDI is that the specs can be dynamic, e.g. generated just-in-time for that container, and then discarded afterwards, or even updated in-place, I believe.

This could make docker inspect very hard to backtrace for troubleshooting, particularly in the context of, say, cri-dockerd which would (I assume) simply translate the CDI field in CRI into DeviceRequests in the Docker API, but which may be referencing a dynamically-generated CDI spec created just-in-time by a kubernetes Device Plugin (or the newer Dynamic Resource Allocation which is more-suited to just-in-time device creation, e.g., Nvidia software-splittable GPUs), and hence the CDI name is meaningless (probably a GUID or similar) by that point.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was not aware that dynamic CDI specs were a thing. Given that, some mechanism for reporting the applied CDI spec when inspecting a container will be necessary. What affordances does cri-containerd provide for troubleshooting containers which have dynamic CDI specs applied to them?

Copy link
Contributor

@TBBle TBBle Mar 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, in containerd-cri, CDI is applied to the OCI spec before it's passed to the runtime shim, so the config.json on-disk will have the results of applying the CDI annotations. I'm not sure what debugging options are available at the CRI layer though (e.g., can we dump the historical CRI requests that led to this container's state), and I haven't looked into CDI's support for this, beyond the mention in docker/cli#3864 (comment) that it has APIs to support inspection; so it's possible containerd-cri doesn't currently expose CDI to inspection, just the resulting OCI spec.

Edit: Quick check, and CDI is applied entirely by the containerd-cri plugin, so what the containerd core service gets is with CDI already applied, and so containerd itself is totally agnostic to CDI.

More edit, really going to bed after this: As it happens, right now, critool inspect should show annotations on the container, which I think will include the CDI annotations, but once we move to using the new CDIDevices field in CRI, this back-door goes away. I think critool inspect will show the config.Resources.CDIDevices field, but haven't actually tried this (or the annotations check), this was just from GitHub-based code exploring.

So anyway, I think for containerd-cri we get to see the CDI device name in the CRI API, and the OCI spec that has had the edits applied in containerd core, but the same risk of "cdi spec now isn't the cdi spec used" applies.


For dynamic CDI specs, NVidia already has a non-production-ready Dynamic Resource Allocation setup for Multi-Instance GPUs using CDI, and creating a CDI spec for claims on-the-fly (see CreateClaimSpecFile). In this case, the dynamic CDI spec is kept around until the resource is free'd, so in this case a live container is traceable; it may be that modifying a spec file or deleting it while the device exists is a CDI antipattern, but at this time, I'm not aware of that being explicit. cncf-tags/container-device-interface#15 is the only relevant discussion I'm currently aware of for this sort of use-case.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What you described for containerd-cri would also hold true verbatim for Moby if you substitute the analogous components:

  • containerd-cri -> dockerd
  • critool inspect -> docker inspect
  • CRI API -> Docker Engine API

You are correct that cri-dockerd is an adapter between the CRI API and Docker Engine API and so would implement CDI support by translating the config.Resources.CDIDevices field into Engine API DeviceRequests. The information it could include when inspecting a container through the CRI API would be limited by the information it could request over the Engine API.

I have no objections to the Moby+CDI troubleshooting experience being on par with cri-containerd+CDI's troubleshooting experience. I won't complain if you want to go above and beyond, either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the discussion here @TBBle and for diving into how things are handled in containerd (and similarly in cri-o). Note that assuming that docker inspect returns a ContainerJSON struct which includes the DeviceRequests in the HostConfig, there will be a reference to the requested CDI device names in the docker inspect output. This would match what is currently available in other implementations since these also do not read the generated OCI specification for their inspect equivalents. This also matches what is currently available for the --gpus flag.

"DeviceRequests": [
                {
                    "Driver": "",
                    "Count": 4,
                    "DeviceIDs": null,
                    "Capabilities": [
                        [
                            "gpu"
                        ]
                    ],
                    "Options": {}
                }
            ],

With regards to:

Would it be possible to capture the applied CDI spec into the container config or metadata in some way?

One thing to investigate would be to "inject" the requested devices into an empty OCI spec and then store this. This should give a reasonable representation of the edits required for a set of CDI devices. Note that the fesibility of this would have to be properly investigated.

Now from a testing perspective, checking the HostConfig doesn't necessarily make sense for this test since this is what we set as input, so I think parsing the logs is fine here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest including a HostConfig inspection check in the test, simply because that's a publicly visible API we'll want to maintain, i.e. dockerd-cri's implementation of critool inspect will depend on this working, and there's no harm and little cost in making that explicitly tested, since unit tests are amongst other things a form of documentation.

ShowStdout: true,
})
assert.NilError(t, err)

actualStdout := new(bytes.Buffer)
actualStderr := io.Discard
_, err = stdcopy.StdCopy(actualStdout, actualStderr, reader)
assert.NilError(t, err)

outlines := strings.Split(actualStdout.String(), "\n")
assert.Assert(t, is.Contains(outlines, "FOO=injected"))
}
7 changes: 7 additions & 0 deletions integration/container/testdata/cdi/vendor1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
cdiVersion: "0.3.0"
kind: "vendor1.com/device"
devices:
- name: foo
containerEdits:
env:
- FOO=injected
11 changes: 11 additions & 0 deletions integration/internal/container/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,14 @@ func WithRuntime(name string) func(*TestContainerConfig) {
c.HostConfig.Runtime = name
}
}

// WithCDIDevices sets the CDI devices to use to start the container
func WithCDIDevices(cdiDeviceNames ...string) func(*TestContainerConfig) {
return func(c *TestContainerConfig) {
request := containertypes.DeviceRequest{
Driver: "cdi",
DeviceIDs: cdiDeviceNames,
}
c.HostConfig.DeviceRequests = append(c.HostConfig.DeviceRequests, request)
}
}
6 changes: 6 additions & 0 deletions vendor.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ require (
github.com/aws/smithy-go v1.13.1
github.com/bsphere/le_go v0.0.0-20200109081728-fc06dab2caa8
github.com/cloudflare/cfssl v0.0.0-20180323000720-5d63dbd981b5
github.com/container-orchestrated-devices/container-device-interface v0.5.5-0.20230516140309-1e6752771dc5
github.com/containerd/cgroups/v3 v3.0.1
github.com/containerd/containerd v1.6.21
github.com/containerd/continuity v0.3.0
Expand Down Expand Up @@ -130,6 +131,7 @@ require (
github.com/dustin/go-humanize v1.0.0 // indirect
github.com/felixge/httpsnoop v1.0.2 // indirect
github.com/fernet/fernet-go v0.0.0-20211208181803-9f70042a33ee // indirect
github.com/fsnotify/fsnotify v1.5.1 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/gofrs/flock v0.8.1 // indirect
Expand All @@ -155,6 +157,7 @@ require (
github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect
github.com/onsi/ginkgo/v2 v2.1.4 // indirect
github.com/onsi/gomega v1.20.1 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/package-url/packageurl-go v0.1.1-0.20220428063043-89078438f170 // indirect
github.com/philhofer/fwd v1.1.2 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
Expand All @@ -164,6 +167,7 @@ require (
github.com/secure-systems-lab/go-securesystemslib v0.4.0 // indirect
github.com/shibumi/go-pathspec v1.3.0 // indirect
github.com/spdx/tools-golang v0.3.1-0.20230104082527-d6f58551be3f // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/tinylib/msgp v1.1.6 // indirect
github.com/tonistiigi/go-actions-cache v0.0.0-20220404170428-0bdeb6e1eac7 // indirect
github.com/tonistiigi/units v0.0.0-20180711220420-6950e57a87ea // indirect
Expand Down Expand Up @@ -193,5 +197,7 @@ require (
google.golang.org/api v0.93.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.28.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
k8s.io/klog/v2 v2.80.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
Loading
Loading