From e10b0abb6eec56cc3f1824658580a0a0e1fb6be8 Mon Sep 17 00:00:00 2001 From: Kevin Parsons Date: Tue, 18 Apr 2023 21:14:12 -0700 Subject: [PATCH] Rework layer handling to return a ResourceCloser Currently, the layers package relies on the caller of Mount*COWLayers to subsequently call NewImageLayers, which constructs a special ImageLayers object that can be used to later clean up the layer mounts. However, this requires the caller to know too much about the internals of the layer mounting process. A cleaner approach, which I take here, is to instead return a standard ResourceCloser from Mount*COWLayers which then knows how to clean up whatever mounts were done. I have also changed the layers code to use ResourceCloser in more places internally. There is a new check in resources_*cow.go, such that the layers closer is only stored if the container is not a hypervisor-isolated sandbox container. This duplicates the logic that was previously in (*ImageLayers).Release. Signed-off-by: Kevin Parsons --- internal/hcsoci/resources_lcow.go | 9 +- internal/hcsoci/resources_wcow.go | 9 +- internal/jobcontainers/jobcontainer.go | 61 ++-- internal/jobcontainers/storage.go | 17 +- internal/layers/layers.go | 482 ++++++++++--------------- internal/resources/resources.go | 9 +- internal/uvm/vpmem.go | 29 +- test/functional/uvm_vpmem_test.go | 4 +- test/functional/wcow_test.go | 32 +- 9 files changed, 295 insertions(+), 357 deletions(-) diff --git a/internal/hcsoci/resources_lcow.go b/internal/hcsoci/resources_lcow.go index 65e7acf022..58bf9d1115 100644 --- a/internal/hcsoci/resources_lcow.go +++ b/internal/hcsoci/resources_lcow.go @@ -30,13 +30,16 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * containerRootInUVM := r.ContainerRootInUVM() if coi.Spec.Windows != nil && len(coi.Spec.Windows.LayerFolders) > 0 { log.G(ctx).Debug("hcsshim::allocateLinuxResources mounting storage") - rootPath, scratchPath, err := layers.MountLCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, "", coi.HostingSystem) + rootPath, scratchPath, closer, err := layers.MountLCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, coi.HostingSystem) if err != nil { return errors.Wrap(err, "failed to mount container storage") } coi.Spec.Root.Path = rootPath - layers := layers.NewImageLayers(coi.HostingSystem, containerRootInUVM, coi.Spec.Windows.LayerFolders, "", isSandbox) - r.SetLayers(layers) + // If this is the pause container in a hypervisor-isolated pod, we can skip cleanup of + // layers, as that happens automatically when the UVM is terminated. + if !isSandbox || coi.HostingSystem == nil { + r.SetLayers(closer) + } r.SetLcowScratchPath(scratchPath) } else if coi.Spec.Root.Path != "" { // This is the "Plan 9" root filesystem. 
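For context while reading the hunks: the closer returned by Mount*COWLayers satisfies the pre-existing resources.ResourceCloser interface, whose entire contract is a single Release(ctx) method. The caller-side lifecycle therefore reduces to the sketch below, which mirrors the resources_lcow.go hunk above (illustrative only, not part of the patch):

    // The mount call now hands back a closer alongside the paths it already returned.
    rootPath, scratchPath, closer, err := layers.MountLCOWLayers(
        ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, coi.HostingSystem)
    if err != nil {
        return errors.Wrap(err, "failed to mount container storage")
    }
    // The closer owns every mount made above. For the pause container of a
    // hypervisor-isolated pod it is deliberately dropped: tearing down the
    // UVM reclaims those mounts on its own.
    if !isSandbox || coi.HostingSystem == nil {
        r.SetLayers(closer) // resources.ReleaseResources later invokes closer.Release(ctx)
    }
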
diff --git a/internal/hcsoci/resources_wcow.go b/internal/hcsoci/resources_wcow.go index 0149a05331..8a810a12da 100644 --- a/internal/hcsoci/resources_wcow.go +++ b/internal/hcsoci/resources_wcow.go @@ -60,13 +60,16 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r if coi.Spec.Root.Path == "" && (coi.HostingSystem != nil || coi.Spec.Windows.HyperV == nil) { log.G(ctx).Debug("hcsshim::allocateWindowsResources mounting storage") containerRootInUVM := r.ContainerRootInUVM() - containerRootPath, err := layers.MountWCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, "", coi.HostingSystem) + containerRootPath, closer, err := layers.MountWCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, "", coi.HostingSystem) if err != nil { return errors.Wrap(err, "failed to mount container storage") } coi.Spec.Root.Path = containerRootPath - layers := layers.NewImageLayers(coi.HostingSystem, containerRootInUVM, coi.Spec.Windows.LayerFolders, "", isSandbox) - r.SetLayers(layers) + // If this is the pause container in a hypervisor-isolated pod, we can skip cleanup of + // layers, as that happens automatically when the UVM is terminated. + if !isSandbox || coi.HostingSystem == nil { + r.SetLayers(closer) + } } if err := setupMounts(ctx, coi, r); err != nil { diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go index e68a4820e4..49faafd443 100644 --- a/internal/jobcontainers/jobcontainer.go +++ b/internal/jobcontainers/jobcontainer.go @@ -21,7 +21,6 @@ import ( "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/jobobject" - "github.com/Microsoft/hcsshim/internal/layers" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/queue" "github.com/Microsoft/hcsshim/internal/resources" @@ -189,25 +188,16 @@ func Create(ctx context.Context, id string, s *specs.Spec) (_ cow.Container, _ * } }) + var closer resources.ResourceCloser if fileBindingSupport { - if err := container.bindSetup(ctx, s); err != nil { - return nil, nil, err - } + closer, err = container.bindSetup(ctx, s) } else { - if err := container.fallbackSetup(ctx, s); err != nil { - return nil, nil, err - } + closer, err = container.fallbackSetup(ctx, s) } - - // We actually don't need to pass in anything for volumeMountPath below if we have file binding support, - // the filter allows us to pass in a raw volume path and it can use that to bind a volume to a friendly path - // instead so we can skip calling SetVolumeMountPoint. 
- var rootfsMountPoint string - if !fileBindingSupport { - rootfsMountPoint = container.rootfsLocation + if err != nil { + return nil, nil, err } - layers := layers.NewImageLayers(nil, "", s.Windows.LayerFolders, rootfsMountPoint, false) - r.SetLayers(layers) + r.SetLayers(closer) volumeGUIDRegex := `^\\\\\?\\(Volume)\{{0,1}[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}(\}){0,1}\}(|\\)$` if matched, err := regexp.MatchString(volumeGUIDRegex, s.Root.Path); !matched || err != nil { @@ -775,37 +765,56 @@ func (c *JobContainer) replaceWithMountPoint(str string) (string, bool) { return newStr, str != newStr } -func (c *JobContainer) bindSetup(ctx context.Context, s *specs.Spec) (err error) { +func (c *JobContainer) bindSetup(ctx context.Context, s *specs.Spec) (_ resources.ResourceCloser, err error) { // Must be upgraded to a silo so we can get per silo bindings for the container. if err := c.job.PromoteToSilo(); err != nil { - return err + return nil, err } // Union the container layers. - if err := c.mountLayers(ctx, c.id, s, ""); err != nil { - return fmt.Errorf("failed to mount container layers: %w", err) + closer, err := c.mountLayers(ctx, c.id, s, "") + if err != nil { + return nil, fmt.Errorf("failed to mount container layers: %w", err) } + defer func() { + if err != nil { + _ = closer.Release(ctx) + } + }() + rootfsLocation := defaultSiloRootfsLocation if loc := customRootfsLocation(s.Annotations); loc != "" { rootfsLocation = loc } if err := c.setupRootfsBinding(rootfsLocation, s.Root.Path); err != nil { - return err + return nil, err } c.rootfsLocation = rootfsLocation - return c.setupMounts(ctx, s) + if err := c.setupMounts(ctx, s); err != nil { + return nil, err + } + return closer, nil } // This handles the fallback case where bind mounting isn't available on the machine. This mounts the // container layers on the host and sets up any mounts present in the OCI runtime spec. 
-func (c *JobContainer) fallbackSetup(ctx context.Context, s *specs.Spec) (err error) { +func (c *JobContainer) fallbackSetup(ctx context.Context, s *specs.Spec) (_ resources.ResourceCloser, err error) { rootfsLocation := fmt.Sprintf(fallbackRootfsFormat, c.id) if loc := customRootfsLocation(s.Annotations); loc != "" { rootfsLocation = filepath.Join(loc, c.id) } - if err := c.mountLayers(ctx, c.id, s, rootfsLocation); err != nil { - return fmt.Errorf("failed to mount container layers: %w", err) + closer, err := c.mountLayers(ctx, c.id, s, rootfsLocation) + if err != nil { + return nil, fmt.Errorf("failed to mount container layers: %w", err) } + defer func() { + if err != nil { + _ = closer.Release(ctx) + } + }() c.rootfsLocation = rootfsLocation - return fallbackMountSetup(s, c.rootfsLocation) + if err := fallbackMountSetup(s, c.rootfsLocation); err != nil { + return nil, err + } + return closer, nil } diff --git a/internal/jobcontainers/storage.go b/internal/jobcontainers/storage.go index edb2d4a4ef..fedcee814e 100644 --- a/internal/jobcontainers/storage.go +++ b/internal/jobcontainers/storage.go @@ -10,6 +10,7 @@ import ( "github.com/Microsoft/hcsshim/internal/layers" "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/resources" "github.com/Microsoft/hcsshim/internal/wclayer" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" @@ -25,23 +26,23 @@ const fallbackRootfsFormat = `C:\hpc\%s\` // C:\hpc\ const defaultSiloRootfsLocation = `C:\hpc\` -func (c *JobContainer) mountLayers(ctx context.Context, containerID string, s *specs.Spec, volumeMountPath string) (err error) { +func (c *JobContainer) mountLayers(ctx context.Context, containerID string, s *specs.Spec, volumeMountPath string) (_ resources.ResourceCloser, err error) { if s == nil || s.Windows == nil || s.Windows.LayerFolders == nil { - return errors.New("field 'Spec.Windows.Layerfolders' is not populated") + return nil, errors.New("field 'Spec.Windows.Layerfolders' is not populated") } // Last layer always contains the sandbox.vhdx, or 'scratch' space for the container. scratchFolder := s.Windows.LayerFolders[len(s.Windows.LayerFolders)-1] if _, err := os.Stat(scratchFolder); os.IsNotExist(err) { if err := os.MkdirAll(scratchFolder, 0777); err != nil { - return fmt.Errorf("failed to auto-create container scratch folder %s: %w", scratchFolder, err) + return nil, fmt.Errorf("failed to auto-create container scratch folder %s: %w", scratchFolder, err) } } // Create sandbox.vhdx if it doesn't exist in the scratch folder. 
if _, err := os.Stat(filepath.Join(scratchFolder, "sandbox.vhdx")); os.IsNotExist(err) { if err := wclayer.CreateScratchLayer(ctx, scratchFolder, s.Windows.LayerFolders[:len(s.Windows.LayerFolders)-1]); err != nil { - return fmt.Errorf("failed to CreateSandboxLayer: %w", err) + return nil, fmt.Errorf("failed to CreateSandboxLayer: %w", err) } } @@ -49,16 +50,18 @@ func (c *JobContainer) mountLayers(ctx context.Context, containerID string, s *s s.Root = &specs.Root{} } + var closer resources.ResourceCloser if s.Root.Path == "" { log.G(ctx).Debug("mounting job container storage") - rootPath, err := layers.MountWCOWLayers(ctx, containerID, s.Windows.LayerFolders, "", volumeMountPath, nil) + var rootPath string + rootPath, closer, err = layers.MountWCOWLayers(ctx, containerID, s.Windows.LayerFolders, "", volumeMountPath, nil) if err != nil { - return fmt.Errorf("failed to mount job container storage: %w", err) + return nil, fmt.Errorf("failed to mount job container storage: %w", err) } s.Root.Path = rootPath + "\\" } - return nil + return closer, nil } // setupRootfsBinding binds the copy on write volume for the container to a static path diff --git a/internal/layers/layers.go b/internal/layers/layers.go index e00c192769..fd7cfa5389 100644 --- a/internal/layers/layers.go +++ b/internal/layers/layers.go @@ -20,51 +20,43 @@ import ( "github.com/Microsoft/hcsshim/internal/hcserror" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/ospath" + "github.com/Microsoft/hcsshim/internal/resources" "github.com/Microsoft/hcsshim/internal/uvm" "github.com/Microsoft/hcsshim/internal/wclayer" ) -// ImageLayers contains all the layers for an image. -type ImageLayers struct { - vm *uvm.UtilityVM - containerRootInUVM string - volumeMountPath string - layers []string - // In some instances we may want to avoid cleaning up the image layers, such as when tearing - // down a sandbox container since the UVM will be torn down shortly after and the resources - // can be cleaned up on the host. - skipCleanup bool +type lcowLayersCloser struct { + uvm *uvm.UtilityVM + guestCombinedLayersPath string + scratchMount resources.ResourceCloser + layerClosers []resources.ResourceCloser } -func NewImageLayers(vm *uvm.UtilityVM, containerRootInUVM string, layers []string, volumeMountPath string, skipCleanup bool) *ImageLayers { - return &ImageLayers{ - vm: vm, - containerRootInUVM: containerRootInUVM, - layers: layers, - volumeMountPath: volumeMountPath, - skipCleanup: skipCleanup, - } -} - -// Release unmounts all of the layers located in the layers array. 
-func (layers *ImageLayers) Release(ctx context.Context, all bool) error { - if layers.skipCleanup && layers.vm != nil { - return nil - } - op := UnmountOperationSCSI - if layers.vm == nil || all { - op = UnmountOperationAll +func (lc *lcowLayersCloser) Release(ctx context.Context) (retErr error) { + if err := lc.uvm.RemoveCombinedLayersLCOW(ctx, lc.guestCombinedLayersPath); err != nil { + log.G(ctx).WithError(err).Error("failed RemoveCombinedLayersLCOW") + if retErr == nil { + retErr = fmt.Errorf("first error: %w", err) + } } - var crp string - if layers.vm != nil { - crp = containerRootfsPath(layers.vm, layers.containerRootInUVM) + if err := lc.scratchMount.Release(ctx); err != nil { + log.G(ctx).WithError(err).Error("failed LCOW scratch mount release") + if retErr == nil { + retErr = fmt.Errorf("first error: %w", err) + } } - err := UnmountContainerLayers(ctx, layers.layers, crp, layers.volumeMountPath, layers.vm, op) - if err != nil { - return err + for i, closer := range lc.layerClosers { + if err := closer.Release(ctx); err != nil { + log.G(ctx).WithFields(logrus.Fields{ + logrus.ErrorKey: err, + "layerIndex": i, + }).Error("failed releasing LCOW layer") + if retErr == nil { + retErr = fmt.Errorf("first error: %w", err) + } + } } - layers.layers = nil - return nil + return } // MountLCOWLayers is a helper for clients to hide all the complexity of layer mounting for LCOW @@ -72,22 +64,22 @@ func (layers *ImageLayers) Release(ctx context.Context, all bool) error { // Returns the path at which the `rootfs` of the container can be accessed. Also, returns the path inside the // UVM at which container scratch directory is located. Usually, this path is the path at which the container // scratch VHD is mounted. However, in case of scratch sharing this is a directory under the UVM scratch. 
-func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_, _ string, err error) {
+func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot string, vm *uvm.UtilityVM) (_, _ string, _ resources.ResourceCloser, err error) {
 	if vm.OS() != "linux" {
-		return "", "", errors.New("MountLCOWLayers should only be called for LCOW")
+		return "", "", nil, errors.New("MountLCOWLayers should only be called for LCOW")
 	}

 	// V2 UVM
 	log.G(ctx).WithField("os", vm.OS()).Debug("hcsshim::MountLCOWLayers V2 UVM")

 	var (
-		layersAdded       []string
+		layerClosers      []resources.ResourceCloser
 		lcowUvmLayerPaths []string
 	)
 	defer func() {
 		if err != nil {
-			for _, l := range layersAdded {
-				if err := removeLCOWLayer(ctx, vm, l); err != nil {
+			for _, closer := range layerClosers {
+				if err := closer.Release(ctx); err != nil {
 					log.G(ctx).WithError(err).Warn("failed to remove lcow layer on cleanup")
 				}
 			}
@@ -100,18 +92,18 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 			layerPath = filepath.Join(layerPath, "layer.vhd")
 			uvmPath   string
 		)
-		uvmPath, err = addLCOWLayer(ctx, vm, layerPath)
+		uvmPath, closer, err := addLCOWLayer(ctx, vm, layerPath)
 		if err != nil {
-			return "", "", fmt.Errorf("failed to add LCOW layer: %s", err)
+			return "", "", nil, fmt.Errorf("failed to add LCOW layer: %s", err)
 		}
-		layersAdded = append(layersAdded, layerPath)
+		layerClosers = append(layerClosers, closer)
 		lcowUvmLayerPaths = append(lcowUvmLayerPaths, uvmPath)
 	}

 	containerScratchPathInUVM := ospath.Join(vm.OS(), guestRoot)
 	hostPath, err := getScratchVHDPath(layerFolders)
 	if err != nil {
-		return "", "", fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err)
+		return "", "", nil, fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err)
 	}
 	log.G(ctx).WithField("hostPath", hostPath).Debug("mounting scratch VHD")

@@ -126,20 +118,20 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 		uvm.VMAccessTypeIndividual,
 	)
 	if err != nil {
-		return "", "", fmt.Errorf("failed to add SCSI scratch VHD: %s", err)
+		return "", "", nil, fmt.Errorf("failed to add SCSI scratch VHD: %s", err)
 	}

 	// handles the case where we want to share a scratch disk for multiple containers instead
 	// of mounting a new one. Pass a unique value for `ScratchPath` to avoid container upper and
 	// work directories colliding in the UVM.
 	if scsiMount.RefCount() > 1 {
 		scratchFmt := fmt.Sprintf("container_%s", filepath.Base(containerScratchPathInUVM))
 		containerScratchPathInUVM = ospath.Join("linux", scsiMount.UVMPath, scratchFmt)
 	}

 	defer func() {
 		if err != nil {
-			if err := vm.RemoveSCSI(ctx, hostPath); err != nil {
+			if err := scsiMount.Release(ctx); err != nil {
 				log.G(ctx).WithError(err).Warn("failed to remove scratch on cleanup")
 			}
 		}
@@ -148,10 +140,16 @@
 	rootfs := ospath.Join(vm.OS(), guestRoot, guestpath.RootfsPath)
 	err = vm.CombineLayersLCOW(ctx, containerID, lcowUvmLayerPaths, containerScratchPathInUVM, rootfs)
 	if err != nil {
-		return "", "", err
+		return "", "", nil, err
 	}
 	log.G(ctx).Debug("hcsshim::MountLCOWLayers Succeeded")
-	return rootfs, containerScratchPathInUVM, nil
+	closer := &lcowLayersCloser{
+		uvm:                     vm,
+		guestCombinedLayersPath: rootfs,
+		scratchMount:            scsiMount,
+		layerClosers:            layerClosers,
+	}
+	return rootfs, containerScratchPathInUVM, closer, nil
 }

 // MountWCOWLayers is a helper for clients to hide all the complexity of layer mounting for WCOW.
@@ -165,100 +163,162 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 //
 // Job container: Returns the mount path on the host as a volume guid, with the volume mounted on
 // the host at `volumeMountPath`.
-func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_ string, err error) {
+func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_ string, _ resources.ResourceCloser, err error) {
 	if vm == nil {
-		if len(layerFolders) < 2 {
-			return "", errors.New("need at least two layers - base and scratch")
+		return mountWCOWHostLayers(ctx, layerFolders, volumeMountPath)
+	}
+
+	if vm.OS() != "windows" {
+		return "", nil, errors.New("MountWCOWLayers should only be called for WCOW")
+	}
+	return mountWCOWIsolatedLayers(ctx, containerID, layerFolders, guestRoot, volumeMountPath, vm)
+}
+
+type wcowHostLayersCloser struct {
+	volumeMountPath        string
+	scratchLayerFolderPath string
+}
+
+func (lc *wcowHostLayersCloser) Release(ctx context.Context) error {
+	if lc.volumeMountPath != "" {
+		if err := RemoveSandboxMountPoint(ctx, lc.volumeMountPath); err != nil {
+			return err
 		}
-		path := layerFolders[len(layerFolders)-1]
-		rest := layerFolders[:len(layerFolders)-1]
-		// Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manner on RS5 (ws2019) which led to some
-		// very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many
-		// parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths
-		// that bring about the errors we have observed aren't known.
-		//
-		// On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check.
- var lErr error - for i := 0; i < 5; i++ { - lErr = func() (err error) { - if err := wclayer.ActivateLayer(ctx, path); err != nil { - return err - } + } + if err := wclayer.UnprepareLayer(ctx, lc.scratchLayerFolderPath); err != nil { + return err + } + return wclayer.DeactivateLayer(ctx, lc.scratchLayerFolderPath) +} - defer func() { - if err != nil { - _ = wclayer.DeactivateLayer(ctx, path) - } - }() +func mountWCOWHostLayers(ctx context.Context, layerFolders []string, volumeMountPath string) (_ string, _ resources.ResourceCloser, err error) { + if len(layerFolders) < 2 { + return "", nil, errors.New("need at least two layers - base and scratch") + } + path := layerFolders[len(layerFolders)-1] + rest := layerFolders[:len(layerFolders)-1] + // Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manner on RS5 (ws2019) which led to some + // very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many + // parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths + // that bring about the errors we have observed aren't known. + // + // On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check. + var lErr error + for i := 0; i < 5; i++ { + lErr = func() (err error) { + if err := wclayer.ActivateLayer(ctx, path); err != nil { + return err + } - return wclayer.PrepareLayer(ctx, path, rest) + defer func() { + if err != nil { + _ = wclayer.DeactivateLayer(ctx, path) + } }() - if lErr != nil { - // Common errors seen from the RS5 behavior mentioned above is ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS - // tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED - // has been observed after launching multiple containers in parallel on a machine under high load. This has also been observed to be a trigger - // for ERROR_NOT_READY as well. - if hcserr, ok := lErr.(*hcserror.HcsError); ok { - if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED { - log.G(ctx).WithField("path", path).WithError(hcserr.Err).Warning("retrying layer operations after failure") - - // Sleep for a little before a re-attempt. A probable cause for these issues in the first place is events not getting - // reported in time so might be good to give some time for things to "cool down" or get back to a known state. - time.Sleep(time.Millisecond * 100) - continue - } - } - // This was a failure case outside of the commonly known error conditions, don't retry here. - return "", lErr - } + return wclayer.PrepareLayer(ctx, path, rest) + }() - // No errors in layer setup, we can leave the loop - break - } - // If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check - // the loop error here and fail also. if lErr != nil { - return "", errors.Wrap(lErr, "layer retry loop failed") + // Common errors seen from the RS5 behavior mentioned above is ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS + // tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED + // has been observed after launching multiple containers in parallel on a machine under high load. 
This has also been observed to be a trigger + // for ERROR_NOT_READY as well. + if hcserr, ok := lErr.(*hcserror.HcsError); ok { + if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED { + log.G(ctx).WithField("path", path).WithError(hcserr.Err).Warning("retrying layer operations after failure") + + // Sleep for a little before a re-attempt. A probable cause for these issues in the first place is events not getting + // reported in time so might be good to give some time for things to "cool down" or get back to a known state. + time.Sleep(time.Millisecond * 100) + continue + } + } + // This was a failure case outside of the commonly known error conditions, don't retry here. + return "", nil, lErr } - // If any of the below fails, we want to detach the filter and unmount the disk. - defer func() { - if err != nil { - _ = wclayer.UnprepareLayer(ctx, path) - _ = wclayer.DeactivateLayer(ctx, path) - } - }() + // No errors in layer setup, we can leave the loop + break + } + // If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check + // the loop error here and fail also. + if lErr != nil { + return "", nil, errors.Wrap(lErr, "layer retry loop failed") + } - mountPath, err := wclayer.GetLayerMountPath(ctx, path) + // If any of the below fails, we want to detach the filter and unmount the disk. + defer func() { if err != nil { - return "", err + _ = wclayer.UnprepareLayer(ctx, path) + _ = wclayer.DeactivateLayer(ctx, path) } + }() - // Mount the volume to a directory on the host if requested. This is the case for job containers. - if volumeMountPath != "" { - if err := MountSandboxVolume(ctx, volumeMountPath, mountPath); err != nil { - return "", err - } + mountPath, err := wclayer.GetLayerMountPath(ctx, path) + if err != nil { + return "", nil, err + } + + // Mount the volume to a directory on the host if requested. This is the case for job containers. 
+ if volumeMountPath != "" { + if err := MountSandboxVolume(ctx, volumeMountPath, mountPath); err != nil { + return "", nil, err } + } - return mountPath, nil + closer := &wcowHostLayersCloser{ + volumeMountPath: volumeMountPath, + scratchLayerFolderPath: path, } + return mountPath, closer, nil +} - if vm.OS() != "windows" { - return "", errors.New("MountWCOWLayers should only be called for WCOW") +type wcowIsolatedLayersCloser struct { + uvm *uvm.UtilityVM + guestCombinedLayersPath string + scratchMount resources.ResourceCloser + layerClosers []resources.ResourceCloser +} + +func (lc *wcowIsolatedLayersCloser) Release(ctx context.Context) (retErr error) { + if err := lc.uvm.RemoveCombinedLayersWCOW(ctx, lc.guestCombinedLayersPath); err != nil { + log.G(ctx).WithError(err).Error("failed RemoveCombinedLayersWCOW") + if retErr == nil { + retErr = fmt.Errorf("first error: %w", err) + } } + if err := lc.scratchMount.Release(ctx); err != nil { + log.G(ctx).WithError(err).Error("failed WCOW scratch mount release") + if retErr == nil { + retErr = fmt.Errorf("first error: %w", err) + } + } + for i, closer := range lc.layerClosers { + if err := closer.Release(ctx); err != nil { + log.G(ctx).WithFields(logrus.Fields{ + logrus.ErrorKey: err, + "layerIndex": i, + }).Error("failed releasing WCOW layer") + if retErr == nil { + retErr = fmt.Errorf("first error: %w", err) + } + } + } + return +} - // V2 UVM +func mountWCOWIsolatedLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_ string, _ resources.ResourceCloser, err error) { log.G(ctx).WithField("os", vm.OS()).Debug("hcsshim::MountWCOWLayers V2 UVM") var ( - layersAdded []string + layersAdded []string + layerClosers []resources.ResourceCloser ) defer func() { if err != nil { - for _, l := range layersAdded { - if err := vm.RemoveVSMB(ctx, l, true); err != nil { + for _, l := range layerClosers { + if err := l.Release(ctx); err != nil { log.G(ctx).WithError(err).Warn("failed to remove wcow layer on cleanup") } } @@ -269,16 +326,18 @@ func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []str log.G(ctx).WithField("layerPath", layerPath).Debug("mounting layer") options := vm.DefaultVSMBOptions(true) options.TakeBackupPrivilege = true - if _, err := vm.AddVSMB(ctx, layerPath, options); err != nil { - return "", fmt.Errorf("failed to add VSMB layer: %s", err) + mount, err := vm.AddVSMB(ctx, layerPath, options) + if err != nil { + return "", nil, fmt.Errorf("failed to add VSMB layer: %s", err) } layersAdded = append(layersAdded, layerPath) + layerClosers = append(layerClosers, mount) } containerScratchPathInUVM := ospath.Join(vm.OS(), guestRoot) hostPath, err := getScratchVHDPath(layerFolders) if err != nil { - return "", fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err) + return "", nil, fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err) } log.G(ctx).WithField("hostPath", hostPath).Debug("mounting scratch VHD") @@ -293,13 +352,13 @@ func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []str uvm.VMAccessTypeIndividual, ) if err != nil { - return "", fmt.Errorf("failed to add SCSI scratch VHD: %s", err) + return "", nil, fmt.Errorf("failed to add SCSI scratch VHD: %s", err) } containerScratchPathInUVM = scsiMount.UVMPath defer func() { if err != nil { - if err := vm.RemoveSCSI(ctx, hostPath); err != nil { + if err := scsiMount.Release(ctx); err != nil { log.G(ctx).WithError(err).Warn("failed to 
remove scratch on cleanup") } } @@ -310,30 +369,36 @@ func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []str var layers []hcsschema.Layer layers, err = GetHCSLayers(ctx, vm, layersAdded) if err != nil { - return "", err + return "", nil, err } err = vm.CombineLayersWCOW(ctx, layers, containerScratchPathInUVM) if err != nil { - return "", err + return "", nil, err } log.G(ctx).Debug("hcsshim::MountWCOWLayers Succeeded") - return containerScratchPathInUVM, nil + closer := &wcowIsolatedLayersCloser{ + uvm: vm, + guestCombinedLayersPath: containerScratchPathInUVM, + scratchMount: scsiMount, + layerClosers: layerClosers, + } + return containerScratchPathInUVM, closer, nil } -func addLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) (uvmPath string, err error) { +func addLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) (uvmPath string, _ resources.ResourceCloser, err error) { // don't try to add as vpmem when we want additional devices on the uvm to be fully physically backed if !vm.DevicesPhysicallyBacked() { // We first try vPMEM and if it is full or the file is too large we // fall back to SCSI. - uvmPath, err = vm.AddVPMem(ctx, layerPath) + mount, err := vm.AddVPMem(ctx, layerPath) if err == nil { log.G(ctx).WithFields(logrus.Fields{ "layerPath": layerPath, "layerType": "vpmem", }).Debug("Added LCOW layer") - return uvmPath, nil + return mount.GuestPath, mount, nil } else if err != uvm.ErrNoAvailableLocation && err != uvm.ErrMaxVPMemLayerSize { - return "", fmt.Errorf("failed to add VPMEM layer: %s", err) + return "", nil, fmt.Errorf("failed to add VPMEM layer: %s", err) } } @@ -341,153 +406,13 @@ func addLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) (uvm uvmPath = fmt.Sprintf(guestpath.LCOWGlobalMountPrefixFmt, vm.UVMMountCounter()) sm, err := vm.AddSCSI(ctx, layerPath, uvmPath, true, false, options, uvm.VMAccessTypeNoop) if err != nil { - return "", fmt.Errorf("failed to add SCSI layer: %s", err) + return "", nil, fmt.Errorf("failed to add SCSI layer: %s", err) } log.G(ctx).WithFields(logrus.Fields{ "layerPath": layerPath, "layerType": "scsi", }).Debug("Added LCOW layer") - return sm.UVMPath, nil -} - -func removeLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) error { - // Assume it was added to vPMEM and fall back to SCSI - err := vm.RemoveVPMem(ctx, layerPath) - if err == nil { - log.G(ctx).WithFields(logrus.Fields{ - "layerPath": layerPath, - "layerType": "vpmem", - }).Debug("Removed LCOW layer") - return nil - } else if err == uvm.ErrNotAttached { - err = vm.RemoveSCSI(ctx, layerPath) - if err == nil { - log.G(ctx).WithFields(logrus.Fields{ - "layerPath": layerPath, - "layerType": "scsi", - }).Debug("Removed LCOW layer") - return nil - } - return errors.Wrap(err, "failed to remove SCSI layer") - } - return errors.Wrap(err, "failed to remove VPMEM layer") -} - -// UnmountOperation is used when calling Unmount() to determine what type of unmount is -// required. In V1 schema, this must be unmountOperationAll. In V2, client can -// be more optimal and only unmount what they need which can be a minor performance -// improvement (eg if you know only one container is running in a utility VM, and -// the UVM is about to be torn down, there's no need to unmount the VSMB shares, -// just SCSI to have a consistent file system). 
-type UnmountOperation uint - -const ( - UnmountOperationSCSI UnmountOperation = 0x01 - UnmountOperationVSMB = 0x02 - UnmountOperationVPMEM = 0x04 - UnmountOperationAll = UnmountOperationSCSI | UnmountOperationVSMB | UnmountOperationVPMEM -) - -// UnmountContainerLayers is a helper for clients to hide all the complexity of layer unmounting -func UnmountContainerLayers(ctx context.Context, layerFolders []string, containerRootPath, volumeMountPath string, vm *uvm.UtilityVM, op UnmountOperation) error { - log.G(ctx).WithField("layerFolders", layerFolders).Debug("hcsshim::unmountContainerLayers") - if vm == nil { - // Must be an argon - folders are mounted on the host - if op != UnmountOperationAll { - return errors.New("only operation supported for host-mounted folders is unmountOperationAll") - } - if len(layerFolders) < 1 { - return errors.New("need at least one layer for Unmount") - } - - // Remove the mount point if there is one. This is the fallback case for job containers - // if no bind mount support is available. - if volumeMountPath != "" { - if err := RemoveSandboxMountPoint(ctx, volumeMountPath); err != nil { - return err - } - } - - path := layerFolders[len(layerFolders)-1] - if err := wclayer.UnprepareLayer(ctx, path); err != nil { - return err - } - return wclayer.DeactivateLayer(ctx, path) - } - - // V2 Xenon - - // Base+Scratch as a minimum. This is different to v1 which only requires the scratch - if len(layerFolders) < 2 { - return errors.New("at least two layers are required for unmount") - } - - var retError error - - // Always remove the combined layers as they are part of scsi/vsmb/vpmem - // removals. - if vm.OS() == "windows" { - if err := vm.RemoveCombinedLayersWCOW(ctx, containerRootPath); err != nil { - log.G(ctx).WithError(err).Warn("failed guest request to remove combined layers") - retError = err - } - } else { - if err := vm.RemoveCombinedLayersLCOW(ctx, containerRootPath); err != nil { - log.G(ctx).WithError(err).Warn("failed guest request to remove combined layers") - retError = err - } - } - - // Unload the SCSI scratch path - if (op & UnmountOperationSCSI) == UnmountOperationSCSI { - hostScratchFile, err := getScratchVHDPath(layerFolders) - if err != nil { - return errors.Wrap(err, "failed to get scratch VHD path in layer folders") - } - if err := vm.RemoveSCSI(ctx, hostScratchFile); err != nil { - log.G(ctx).WithError(err).Warn("failed to remove scratch") - if retError == nil { - retError = err - } else { - retError = errors.Wrapf(retError, err.Error()) - } - } - } - - // Remove each of the read-only layers from VSMB. These's are ref-counted and - // only removed once the count drops to zero. This allows multiple containers - // to share layers. - if vm.OS() == "windows" && (op&UnmountOperationVSMB) == UnmountOperationVSMB { - for _, layerPath := range layerFolders[:len(layerFolders)-1] { - if e := vm.RemoveVSMB(ctx, layerPath, true); e != nil { - log.G(ctx).WithError(e).Warn("remove VSMB failed") - if retError == nil { - retError = e - } else { - retError = errors.Wrapf(retError, e.Error()) - } - } - } - } - - // Remove each of the read-only layers from VPMEM (or SCSI). These's are ref-counted - // and only removed once the count drops to zero. This allows multiple containers to - // share layers. Note that SCSI is used on large layers. 
- if vm.OS() == "linux" && (op&UnmountOperationVPMEM) == UnmountOperationVPMEM { - for _, layerPath := range layerFolders[:len(layerFolders)-1] { - hostPath := filepath.Join(layerPath, "layer.vhd") - if err := removeLCOWLayer(ctx, vm, hostPath); err != nil { - log.G(ctx).WithError(err).Warn("remove layer failed") - if retError == nil { - retError = err - } else { - retError = errors.Wrapf(retError, err.Error()) - } - } - } - } - - return retError + return sm.UVMPath, sm, nil } // GetHCSLayers converts host paths corresponding to container layers into HCS schema V2 layers @@ -506,13 +431,6 @@ func GetHCSLayers(ctx context.Context, vm *uvm.UtilityVM, paths []string) (layer return layers, nil } -func containerRootfsPath(vm *uvm.UtilityVM, rootPath string) string { - if vm.OS() == "windows" { - return ospath.Join(vm.OS(), rootPath) - } - return ospath.Join(vm.OS(), rootPath, guestpath.RootfsPath) -} - func getScratchVHDPath(layerFolders []string) (string, error) { hostPath := filepath.Join(layerFolders[len(layerFolders)-1], "sandbox.vhdx") // For LCOW, we can reuse another container's scratch space (usually the sandbox container's). diff --git a/internal/resources/resources.go b/internal/resources/resources.go index 319c88461f..d1f83dbc64 100644 --- a/internal/resources/resources.go +++ b/internal/resources/resources.go @@ -7,7 +7,6 @@ import ( "errors" "github.com/Microsoft/hcsshim/internal/credentials" - "github.com/Microsoft/hcsshim/internal/layers" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/uvm" ) @@ -51,7 +50,7 @@ func (r *Resources) LcowScratchPath() string { } // SetLayers updates the container resource's image layers -func (r *Resources) SetLayers(l *layers.ImageLayers) { +func (r *Resources) SetLayers(l ResourceCloser) { r.layers = l } @@ -86,7 +85,7 @@ type Resources struct { // addedNetNSToVM indicates if the network namespace has been added to the containers utility VM addedNetNSToVM bool // layers is a pointer to a struct of the layers paths of a container - layers *layers.ImageLayers + layers ResourceCloser // resources is a slice of the resources associated with a container resources []ResourceCloser } @@ -157,9 +156,7 @@ func ReleaseResources(ctx context.Context, r *Resources, vm *uvm.UtilityVM, all } if r.layers != nil { - // TODO dcantah: Either make it so layers doesn't rely on the all bool for cleanup logic - // or find a way to factor out the all bool in favor of something else. - if err := r.layers.Release(ctx, all); err != nil { + if err := r.layers.Release(ctx); err != nil { return err } } diff --git a/internal/uvm/vpmem.go b/internal/uvm/vpmem.go index cde51aa014..4d0d50d963 100644 --- a/internal/uvm/vpmem.go +++ b/internal/uvm/vpmem.go @@ -29,6 +29,18 @@ var ( ErrMaxVPMemLayerSize = errors.New("layer size is to large for VPMEM max size") ) +// var _ resources.ResourceCloser = &VPMEMMount{} -- Causes an import cycle. 
+ +type VPMEMMount struct { + GuestPath string + uvm *UtilityVM + hostPath string +} + +func (vc *VPMEMMount) Release(ctx context.Context) error { + return vc.uvm.RemoveVPMem(ctx, vc.hostPath) +} + type vPMemInfoDefault struct { hostPath string uvmPath string @@ -235,18 +247,27 @@ func (uvm *UtilityVM) removeVPMemDefault(ctx context.Context, hostPath string) e return nil } -func (uvm *UtilityVM) AddVPMem(ctx context.Context, hostPath string) (string, error) { +func (uvm *UtilityVM) AddVPMem(ctx context.Context, hostPath string) (*VPMEMMount, error) { if uvm.operatingSystem != "linux" { - return "", errNotSupported + return nil, errNotSupported } uvm.m.Lock() defer uvm.m.Unlock() + var ( + guestPath string + err error + ) if uvm.vpmemMultiMapping { - return uvm.addVPMemMappedDevice(ctx, hostPath) + guestPath, err = uvm.addVPMemMappedDevice(ctx, hostPath) + } else { + guestPath, err = uvm.addVPMemDefault(ctx, hostPath) + } + if err != nil { + return nil, err } - return uvm.addVPMemDefault(ctx, hostPath) + return &VPMEMMount{GuestPath: guestPath, uvm: uvm, hostPath: hostPath}, nil } func (uvm *UtilityVM) RemoveVPMem(ctx context.Context, hostPath string) error { diff --git a/test/functional/uvm_vpmem_test.go b/test/functional/uvm_vpmem_test.go index 70b6118e5b..78f9ee7645 100644 --- a/test/functional/uvm_vpmem_test.go +++ b/test/functional/uvm_vpmem_test.go @@ -37,11 +37,11 @@ func TestVPMEM(t *testing.T) { } for i := 0; i < int(iterations); i++ { - uvmPath, err := u.AddVPMem(ctx, filepath.Join(tempDir, "layer.vhd")) + mount, err := u.AddVPMem(ctx, filepath.Join(tempDir, "layer.vhd")) if err != nil { t.Fatalf("AddVPMEM failed: %s", err) } - t.Logf("exposed as %s", uvmPath) + t.Logf("exposed as %s", mount.GuestPath) } // Remove them all diff --git a/test/functional/wcow_test.go b/test/functional/wcow_test.go index de42540b5b..84b118dca5 100644 --- a/test/functional/wcow_test.go +++ b/test/functional/wcow_test.go @@ -371,7 +371,6 @@ func TestWCOWArgonShim(t *testing.T) { requireFeatures(t, featureWCOW) imageLayers := windowsServercoreImageLayers(context.Background(), t) - argonShimMounted := false argonShimScratchDir := t.TempDir() if err := wclayer.CreateScratchLayer(context.Background(), argonShimScratchDir, imageLayers); err != nil { @@ -381,25 +380,17 @@ func TestWCOWArgonShim(t *testing.T) { hostRWSharedDirectory, hostROSharedDirectory := createTestMounts(t) layers := generateShimLayersStruct(t, imageLayers) - // For cleanup on failure - defer func() { - if argonShimMounted { - _ = layerspkg.UnmountContainerLayers(context.Background(), - append(imageLayers, argonShimScratchDir), - "", - "", - nil, - layerspkg.UnmountOperationAll) - } - }() - id := "argon" // This is a cheat but stops us re-writing exactly the same code just for test - argonShimLocalMountPath, err := layerspkg.MountWCOWLayers(context.Background(), id, append(imageLayers, argonShimScratchDir), "", "", nil) + argonShimLocalMountPath, closer, err := layerspkg.MountWCOWLayers(context.Background(), id, append(imageLayers, argonShimScratchDir), "", "", nil) if err != nil { t.Fatal(err) } - argonShimMounted = true + defer func() { + if closer != nil { + _ = closer.Release(context.Background()) + } + }() argonShim, err := hcsshim.CreateContainer(id, &hcsshim.ContainerConfig{ SystemType: "Container", Name: "argonShim", @@ -428,17 +419,10 @@ func TestWCOWArgonShim(t *testing.T) { } runShimCommands(t, argonShim) stopContainer(t, argonShim) - if err := layerspkg.UnmountContainerLayers( - context.Background(), - append(imageLayers, 
argonShimScratchDir), - "", - "", - nil, - layerspkg.UnmountOperationAll, - ); err != nil { + if err := closer.Release(context.Background()); err != nil { t.Fatal(err) } - argonShimMounted = false + closer = nil } // Xenon through HCSShim interface (v1)
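
Distilled from the wcow_test.go changes above, every caller now follows the same lifecycle: one mount call paired with one Release. A sketch of the pattern under this patch's API (mirrors the test; scratchDir stands in for the test's argonShimScratchDir; not additional patch content):

    mountPath, closer, err := layerspkg.MountWCOWLayers(
        context.Background(), id, append(imageLayers, scratchDir), "", "", nil)
    if err != nil {
        t.Fatal(err)
    }
    // Safety net for early exits; skipped once the explicit Release below
    // has run and cleared the variable.
    defer func() {
        if closer != nil {
            _ = closer.Release(context.Background())
        }
    }()

    // ... create, exercise, and stop the container ...

    if err := closer.Release(context.Background()); err != nil {
        t.Fatal(err)
    }
    closer = nil // prevent a double Release in the deferred cleanup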