diff --git a/internal/hcsoci/resources_lcow.go b/internal/hcsoci/resources_lcow.go
index 65e7acf022..58bf9d1115 100644
--- a/internal/hcsoci/resources_lcow.go
+++ b/internal/hcsoci/resources_lcow.go
@@ -30,13 +30,16 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r *
 	containerRootInUVM := r.ContainerRootInUVM()
 	if coi.Spec.Windows != nil && len(coi.Spec.Windows.LayerFolders) > 0 {
 		log.G(ctx).Debug("hcsshim::allocateLinuxResources mounting storage")
-		rootPath, scratchPath, err := layers.MountLCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, "", coi.HostingSystem)
+		rootPath, scratchPath, closer, err := layers.MountLCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, coi.HostingSystem)
 		if err != nil {
 			return errors.Wrap(err, "failed to mount container storage")
 		}
 		coi.Spec.Root.Path = rootPath
-		layers := layers.NewImageLayers(coi.HostingSystem, containerRootInUVM, coi.Spec.Windows.LayerFolders, "", isSandbox)
-		r.SetLayers(layers)
+		// If this is the pause container in a hypervisor-isolated pod, we can skip cleanup of
+		// layers, as that happens automatically when the UVM is terminated.
+		if !isSandbox || coi.HostingSystem == nil {
+			r.SetLayers(closer)
+		}
 		r.SetLcowScratchPath(scratchPath)
 	} else if coi.Spec.Root.Path != "" {
 		// This is the "Plan 9" root filesystem.
diff --git a/internal/hcsoci/resources_wcow.go b/internal/hcsoci/resources_wcow.go
index 0149a05331..8a810a12da 100644
--- a/internal/hcsoci/resources_wcow.go
+++ b/internal/hcsoci/resources_wcow.go
@@ -60,13 +60,16 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r
 	if coi.Spec.Root.Path == "" && (coi.HostingSystem != nil || coi.Spec.Windows.HyperV == nil) {
 		log.G(ctx).Debug("hcsshim::allocateWindowsResources mounting storage")
 		containerRootInUVM := r.ContainerRootInUVM()
-		containerRootPath, err := layers.MountWCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, "", coi.HostingSystem)
+		containerRootPath, closer, err := layers.MountWCOWLayers(ctx, coi.actualID, coi.Spec.Windows.LayerFolders, containerRootInUVM, "", coi.HostingSystem)
 		if err != nil {
 			return errors.Wrap(err, "failed to mount container storage")
 		}
 		coi.Spec.Root.Path = containerRootPath
-		layers := layers.NewImageLayers(coi.HostingSystem, containerRootInUVM, coi.Spec.Windows.LayerFolders, "", isSandbox)
-		r.SetLayers(layers)
+		// If this is the pause container in a hypervisor-isolated pod, we can skip cleanup of
+		// layers, as that happens automatically when the UVM is terminated.
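+		// (A sandbox container running directly on the host, with no hosting UVM,
+		// still registers its layer closer so its layers get released explicitly.)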
+		if !isSandbox || coi.HostingSystem == nil {
+			r.SetLayers(closer)
+		}
 	}
 
 	if err := setupMounts(ctx, coi, r); err != nil {
diff --git a/internal/jobcontainers/jobcontainer.go b/internal/jobcontainers/jobcontainer.go
index e68a4820e4..95a6715f5f 100644
--- a/internal/jobcontainers/jobcontainer.go
+++ b/internal/jobcontainers/jobcontainer.go
@@ -21,7 +21,6 @@ import (
 	"github.com/Microsoft/hcsshim/internal/hcs/schema1"
 	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
 	"github.com/Microsoft/hcsshim/internal/jobobject"
-	"github.com/Microsoft/hcsshim/internal/layers"
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/queue"
 	"github.com/Microsoft/hcsshim/internal/resources"
@@ -189,25 +188,16 @@ func Create(ctx context.Context, id string, s *specs.Spec) (_ cow.Container, _ *
 		}
 	})
 
+	var closer resources.ResourceCloser
 	if fileBindingSupport {
-		if err := container.bindSetup(ctx, s); err != nil {
-			return nil, nil, err
-		}
+		closer, err = container.bindSetup(ctx, s)
 	} else {
-		if err := container.fallbackSetup(ctx, s); err != nil {
-			return nil, nil, err
-		}
+		closer, err = container.fallbackSetup(ctx, s)
 	}
-
-	// We actually don't need to pass in anything for volumeMountPath below if we have file binding support,
-	// the filter allows us to pass in a raw volume path and it can use that to bind a volume to a friendly path
-	// instead so we can skip calling SetVolumeMountPoint.
-	var rootfsMountPoint string
-	if !fileBindingSupport {
-		rootfsMountPoint = container.rootfsLocation
+	if err != nil {
+		return nil, nil, err
 	}
-	layers := layers.NewImageLayers(nil, "", s.Windows.LayerFolders, rootfsMountPoint, false)
-	r.SetLayers(layers)
+	r.SetLayers(closer)
 
 	volumeGUIDRegex := `^\\\\\?\\(Volume)\{{0,1}[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}(\}){0,1}\}(|\\)$`
 	if matched, err := regexp.MatchString(volumeGUIDRegex, s.Root.Path); !matched || err != nil {
@@ -775,37 +765,56 @@ func (c *JobContainer) replaceWithMountPoint(str string) (string, bool) {
 	return newStr, str != newStr
 }
 
-func (c *JobContainer) bindSetup(ctx context.Context, s *specs.Spec) (err error) {
+func (c *JobContainer) bindSetup(ctx context.Context, s *specs.Spec) (_ resources.ResourceCloser, err error) {
 	// Must be upgraded to a silo so we can get per silo bindings for the container.
 	if err := c.job.PromoteToSilo(); err != nil {
-		return err
+		return nil, err
 	}
 
 	// Union the container layers.
-	if err := c.mountLayers(ctx, c.id, s, ""); err != nil {
-		return fmt.Errorf("failed to mount container layers: %w", err)
+	closer, err := c.mountLayers(ctx, c.id, s, "")
+	if err != nil {
+		return nil, fmt.Errorf("failed to mount container layers: %w", err)
 	}
+	defer func() {
+		if err != nil {
+			closer.Release(ctx)
+		}
+	}()
+
 	rootfsLocation := defaultSiloRootfsLocation
 	if loc := customRootfsLocation(s.Annotations); loc != "" {
 		rootfsLocation = loc
 	}
 
 	if err := c.setupRootfsBinding(rootfsLocation, s.Root.Path); err != nil {
-		return err
+		return nil, err
 	}
 	c.rootfsLocation = rootfsLocation
-	return c.setupMounts(ctx, s)
+	if err := c.setupMounts(ctx, s); err != nil {
+		return nil, err
+	}
+	return closer, nil
 }
 
 // This handles the fallback case where bind mounting isn't available on the machine. This mounts the
 // container layers on the host and sets up any mounts present in the OCI runtime spec.
-func (c *JobContainer) fallbackSetup(ctx context.Context, s *specs.Spec) (err error) {
+func (c *JobContainer) fallbackSetup(ctx context.Context, s *specs.Spec) (_ resources.ResourceCloser, err error) {
 	rootfsLocation := fmt.Sprintf(fallbackRootfsFormat, c.id)
 	if loc := customRootfsLocation(s.Annotations); loc != "" {
 		rootfsLocation = filepath.Join(loc, c.id)
 	}
-	if err := c.mountLayers(ctx, c.id, s, rootfsLocation); err != nil {
-		return fmt.Errorf("failed to mount container layers: %w", err)
+	closer, err := c.mountLayers(ctx, c.id, s, rootfsLocation)
+	if err != nil {
+		return nil, fmt.Errorf("failed to mount container layers: %w", err)
 	}
+	defer func() {
+		if err != nil {
+			closer.Release(ctx)
+		}
+	}()
 	c.rootfsLocation = rootfsLocation
-	return fallbackMountSetup(s, c.rootfsLocation)
+	if err := fallbackMountSetup(s, c.rootfsLocation); err != nil {
+		return nil, err
+	}
+	return closer, nil
 }
diff --git a/internal/jobcontainers/storage.go b/internal/jobcontainers/storage.go
index edb2d4a4ef..fedcee814e 100644
--- a/internal/jobcontainers/storage.go
+++ b/internal/jobcontainers/storage.go
@@ -10,6 +10,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/layers"
 	"github.com/Microsoft/hcsshim/internal/log"
+	"github.com/Microsoft/hcsshim/internal/resources"
 	"github.com/Microsoft/hcsshim/internal/wclayer"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
@@ -25,23 +26,23 @@ const fallbackRootfsFormat = `C:\hpc\%s\`
 // C:\hpc\
 const defaultSiloRootfsLocation = `C:\hpc\`
 
-func (c *JobContainer) mountLayers(ctx context.Context, containerID string, s *specs.Spec, volumeMountPath string) (err error) {
+func (c *JobContainer) mountLayers(ctx context.Context, containerID string, s *specs.Spec, volumeMountPath string) (_ resources.ResourceCloser, err error) {
 	if s == nil || s.Windows == nil || s.Windows.LayerFolders == nil {
-		return errors.New("field 'Spec.Windows.Layerfolders' is not populated")
+		return nil, errors.New("field 'Spec.Windows.Layerfolders' is not populated")
 	}
 
 	// Last layer always contains the sandbox.vhdx, or 'scratch' space for the container.
 	scratchFolder := s.Windows.LayerFolders[len(s.Windows.LayerFolders)-1]
 	if _, err := os.Stat(scratchFolder); os.IsNotExist(err) {
 		if err := os.MkdirAll(scratchFolder, 0777); err != nil {
-			return fmt.Errorf("failed to auto-create container scratch folder %s: %w", scratchFolder, err)
+			return nil, fmt.Errorf("failed to auto-create container scratch folder %s: %w", scratchFolder, err)
 		}
 	}
 
 	// Create sandbox.vhdx if it doesn't exist in the scratch folder.
 	if _, err := os.Stat(filepath.Join(scratchFolder, "sandbox.vhdx")); os.IsNotExist(err) {
 		if err := wclayer.CreateScratchLayer(ctx, scratchFolder, s.Windows.LayerFolders[:len(s.Windows.LayerFolders)-1]); err != nil {
-			return fmt.Errorf("failed to CreateSandboxLayer: %w", err)
+			return nil, fmt.Errorf("failed to CreateSandboxLayer: %w", err)
 		}
 	}
 
@@ -49,16 +50,18 @@ func (c *JobContainer) mountLayers(ctx context.Context, containerID string, s *s
 		s.Root = &specs.Root{}
 	}
 
+	var closer resources.ResourceCloser
 	if s.Root.Path == "" {
 		log.G(ctx).Debug("mounting job container storage")
-		rootPath, err := layers.MountWCOWLayers(ctx, containerID, s.Windows.LayerFolders, "", volumeMountPath, nil)
+		var rootPath string
+		rootPath, closer, err = layers.MountWCOWLayers(ctx, containerID, s.Windows.LayerFolders, "", volumeMountPath, nil)
 		if err != nil {
-			return fmt.Errorf("failed to mount job container storage: %w", err)
+			return nil, fmt.Errorf("failed to mount job container storage: %w", err)
 		}
 		s.Root.Path = rootPath + "\\"
 	}
 
-	return nil
+	return closer, nil
 }
 
 // setupRootfsBinding binds the copy on write volume for the container to a static path
diff --git a/internal/layers/layers.go b/internal/layers/layers.go
index e00c192769..fd7cfa5389 100644
--- a/internal/layers/layers.go
+++ b/internal/layers/layers.go
@@ -20,51 +20,43 @@ import (
 	"github.com/Microsoft/hcsshim/internal/hcserror"
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/ospath"
+	"github.com/Microsoft/hcsshim/internal/resources"
 	"github.com/Microsoft/hcsshim/internal/uvm"
 	"github.com/Microsoft/hcsshim/internal/wclayer"
 )
 
-// ImageLayers contains all the layers for an image.
-type ImageLayers struct {
-	vm                 *uvm.UtilityVM
-	containerRootInUVM string
-	volumeMountPath    string
-	layers             []string
-	// In some instances we may want to avoid cleaning up the image layers, such as when tearing
-	// down a sandbox container since the UVM will be torn down shortly after and the resources
-	// can be cleaned up on the host.
-	skipCleanup bool
+type lcowLayersCloser struct {
+	uvm                     *uvm.UtilityVM
+	guestCombinedLayersPath string
+	scratchMount            resources.ResourceCloser
+	layerClosers            []resources.ResourceCloser
 }
 
-func NewImageLayers(vm *uvm.UtilityVM, containerRootInUVM string, layers []string, volumeMountPath string, skipCleanup bool) *ImageLayers {
-	return &ImageLayers{
-		vm:                 vm,
-		containerRootInUVM: containerRootInUVM,
-		layers:             layers,
-		volumeMountPath:    volumeMountPath,
-		skipCleanup:        skipCleanup,
-	}
-}
-
-// Release unmounts all of the layers located in the layers array.
-func (layers *ImageLayers) Release(ctx context.Context, all bool) error {
-	if layers.skipCleanup && layers.vm != nil {
-		return nil
-	}
-	op := UnmountOperationSCSI
-	if layers.vm == nil || all {
-		op = UnmountOperationAll
+func (lc *lcowLayersCloser) Release(ctx context.Context) (retErr error) {
+	if err := lc.uvm.RemoveCombinedLayersLCOW(ctx, lc.guestCombinedLayersPath); err != nil {
+		log.G(ctx).WithError(err).Error("failed RemoveCombinedLayersLCOW")
+		if retErr == nil {
+			retErr = fmt.Errorf("first error: %w", err)
+		}
 	}
-	var crp string
-	if layers.vm != nil {
-		crp = containerRootfsPath(layers.vm, layers.containerRootInUVM)
+	if err := lc.scratchMount.Release(ctx); err != nil {
+		log.G(ctx).WithError(err).Error("failed LCOW scratch mount release")
+		if retErr == nil {
+			retErr = fmt.Errorf("first error: %w", err)
+		}
 	}
-	err := UnmountContainerLayers(ctx, layers.layers, crp, layers.volumeMountPath, layers.vm, op)
-	if err != nil {
-		return err
+	for i, closer := range lc.layerClosers {
+		if err := closer.Release(ctx); err != nil {
+			log.G(ctx).WithFields(logrus.Fields{
+				logrus.ErrorKey: err,
+				"layerIndex":    i,
+			}).Error("failed releasing LCOW layer")
+			if retErr == nil {
+				retErr = fmt.Errorf("first error: %w", err)
+			}
+		}
 	}
-	layers.layers = nil
-	return nil
+	return
 }
 
 // MountLCOWLayers is a helper for clients to hide all the complexity of layer mounting for LCOW
@@ -72,22 +64,22 @@ func (layers *ImageLayers) Release(ctx context.Context, all bool) error {
 // Returns the path at which the `rootfs` of the container can be accessed. Also, returns the path inside the
 // UVM at which container scratch directory is located. Usually, this path is the path at which the container
 // scratch VHD is mounted. However, in case of scratch sharing this is a directory under the UVM scratch.
-func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_, _ string, err error) {
+func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot string, vm *uvm.UtilityVM) (_, _ string, _ resources.ResourceCloser, err error) {
 	if vm.OS() != "linux" {
-		return "", "", errors.New("MountLCOWLayers should only be called for LCOW")
+		return "", "", nil, errors.New("MountLCOWLayers should only be called for LCOW")
 	}
 
 	// V2 UVM
 	log.G(ctx).WithField("os", vm.OS()).Debug("hcsshim::MountLCOWLayers V2 UVM")
 
 	var (
-		layersAdded       []string
+		layerClosers      []resources.ResourceCloser
 		lcowUvmLayerPaths []string
 	)
 	defer func() {
 		if err != nil {
-			for _, l := range layersAdded {
-				if err := removeLCOWLayer(ctx, vm, l); err != nil {
+			for _, closer := range layerClosers {
+				if err := closer.Release(ctx); err != nil {
 					log.G(ctx).WithError(err).Warn("failed to remove lcow layer on cleanup")
 				}
 			}
@@ -100,18 +92,18 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 			layerPath = filepath.Join(layerPath, "layer.vhd")
 			uvmPath   string
 		)
-		uvmPath, err = addLCOWLayer(ctx, vm, layerPath)
+		uvmPath, closer, err := addLCOWLayer(ctx, vm, layerPath)
 		if err != nil {
-			return "", "", fmt.Errorf("failed to add LCOW layer: %s", err)
+			return "", "", nil, fmt.Errorf("failed to add LCOW layer: %s", err)
 		}
-		layersAdded = append(layersAdded, layerPath)
+		layerClosers = append(layerClosers, closer)
 		lcowUvmLayerPaths = append(lcowUvmLayerPaths, uvmPath)
 	}
 
 	containerScratchPathInUVM := ospath.Join(vm.OS(), guestRoot)
 	hostPath, err := getScratchVHDPath(layerFolders)
 	if err != nil {
-		return "", "", fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err)
+		return "", "", nil, fmt.Errorf("failed to eval symlinks on scratch path: %w", err)
 	}
 	log.G(ctx).WithField("hostPath", hostPath).Debug("mounting scratch VHD")
 
@@ -126,20 +118,17 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 		uvm.VMAccessTypeIndividual,
 	)
 	if err != nil {
-		return "", "", fmt.Errorf("failed to add SCSI scratch VHD: %s", err)
+		return "", "", nil, fmt.Errorf("failed to add SCSI scratch VHD: %s", err)
 	}
 
 	// handles the case where we want to share a scratch disk for multiple containers instead
 	// of mounting a new one. Pass a unique value for `ScratchPath` to avoid container upper and
 	// work directories colliding in the UVM.
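+	// Joining on the container ID below gives each container a deterministic,
+	// unique upper/work directory under the shared scratch mount.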
-	if scsiMount.RefCount() > 1 {
-		scratchFmt := fmt.Sprintf("container_%s", filepath.Base(containerScratchPathInUVM))
-		containerScratchPathInUVM = ospath.Join("linux", scsiMount.UVMPath, scratchFmt)
-	}
+	containerScratchPathInUVM = ospath.Join("linux", scsiMount.UVMPath, "scratch", containerID)
 
 	defer func() {
 		if err != nil {
-			if err := vm.RemoveSCSI(ctx, hostPath); err != nil {
+			if err := scsiMount.Release(ctx); err != nil {
 				log.G(ctx).WithError(err).Warn("failed to remove scratch on cleanup")
 			}
 		}
@@ -148,10 +137,16 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 	rootfs := ospath.Join(vm.OS(), guestRoot, guestpath.RootfsPath)
 	err = vm.CombineLayersLCOW(ctx, containerID, lcowUvmLayerPaths, containerScratchPathInUVM, rootfs)
 	if err != nil {
-		return "", "", err
+		return "", "", nil, err
 	}
 	log.G(ctx).Debug("hcsshim::MountLCOWLayers Succeeded")
-	return rootfs, containerScratchPathInUVM, nil
+	closer := &lcowLayersCloser{
+		uvm:                     vm,
+		guestCombinedLayersPath: rootfs,
+		scratchMount:            scsiMount,
+		layerClosers:            layerClosers,
+	}
+	return rootfs, containerScratchPathInUVM, closer, nil
 }
 
 // MountWCOWLayers is a helper for clients to hide all the complexity of layer mounting for WCOW.
@@ -165,100 +160,162 @@ func MountLCOWLayers(ctx context.Context, containerID string, layerFolders []str
 //
 // Job container: Returns the mount path on the host as a volume guid, with the volume mounted on
 // the host at `volumeMountPath`.
-func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_ string, err error) {
+func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_ string, _ resources.ResourceCloser, err error) {
+	// The nil check must run before vm.OS() is called: job containers and argons
+	// mount their layers on the host and reach here with no utility VM at all.
 	if vm == nil {
-		if len(layerFolders) < 2 {
-			return "", errors.New("need at least two layers - base and scratch")
+		return mountWCOWHostLayers(ctx, layerFolders, volumeMountPath)
+	}
+
+	if vm.OS() != "windows" {
+		return "", nil, errors.New("MountWCOWLayers should only be called for WCOW")
+	}
+
+	return mountWCOWIsolatedLayers(ctx, containerID, layerFolders, guestRoot, volumeMountPath, vm)
+}
+
+type wcowHostLayersCloser struct {
+	volumeMountPath        string
+	scratchLayerFolderPath string
+}
+
+func (lc *wcowHostLayersCloser) Release(ctx context.Context) error {
+	if lc.volumeMountPath != "" {
+		if err := RemoveSandboxMountPoint(ctx, lc.volumeMountPath); err != nil {
+			return err
 		}
-		path := layerFolders[len(layerFolders)-1]
-		rest := layerFolders[:len(layerFolders)-1]
-		// Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manner on RS5 (ws2019) which led to some
-		// very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many
-		// parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths
-		// that bring about the errors we have observed aren't known.
-		//
-		// On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check.
-		var lErr error
-		for i := 0; i < 5; i++ {
-			lErr = func() (err error) {
-				if err := wclayer.ActivateLayer(ctx, path); err != nil {
-					return err
-				}
+	}
+	if err := wclayer.UnprepareLayer(ctx, lc.scratchLayerFolderPath); err != nil {
+		return err
+	}
+	return wclayer.DeactivateLayer(ctx, lc.scratchLayerFolderPath)
+}
 
-				defer func() {
-					if err != nil {
-						_ = wclayer.DeactivateLayer(ctx, path)
-					}
-				}()
-
-				return wclayer.PrepareLayer(ctx, path, rest)
-			}()
-
-			if lErr != nil {
-				// Common errors seen from the RS5 behavior mentioned above is ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS
-				// tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED
-				// has been observed after launching multiple containers in parallel on a machine under high load. This has also been observed to be a trigger
-				// for ERROR_NOT_READY as well.
-				if hcserr, ok := lErr.(*hcserror.HcsError); ok {
-					if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED {
-						log.G(ctx).WithField("path", path).WithError(hcserr.Err).Warning("retrying layer operations after failure")
-
-						// Sleep for a little before a re-attempt. A probable cause for these issues in the first place is events not getting
-						// reported in time so might be good to give some time for things to "cool down" or get back to a known state.
-						time.Sleep(time.Millisecond * 100)
-						continue
-					}
-				}
-				// This was a failure case outside of the commonly known error conditions, don't retry here.
-				return "", lErr
-			}
-
-			// No errors in layer setup, we can leave the loop
-			break
-		}
-		// If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check
-		// the loop error here and fail also.
-		if lErr != nil {
-			return "", errors.Wrap(lErr, "layer retry loop failed")
-		}
-
-		// If any of the below fails, we want to detach the filter and unmount the disk.
-		defer func() {
-			if err != nil {
-				_ = wclayer.UnprepareLayer(ctx, path)
-				_ = wclayer.DeactivateLayer(ctx, path)
-			}
-		}()
-
-		mountPath, err := wclayer.GetLayerMountPath(ctx, path)
-		if err != nil {
-			return "", err
-		}
-
-		// Mount the volume to a directory on the host if requested. This is the case for job containers.
-		if volumeMountPath != "" {
-			if err := MountSandboxVolume(ctx, volumeMountPath, mountPath); err != nil {
-				return "", err
-			}
-		}
-
-		return mountPath, nil
-	}
-
+func mountWCOWHostLayers(ctx context.Context, layerFolders []string, volumeMountPath string) (_ string, _ resources.ResourceCloser, err error) {
+	if len(layerFolders) < 2 {
+		return "", nil, errors.New("need at least two layers - base and scratch")
+	}
+	path := layerFolders[len(layerFolders)-1]
+	rest := layerFolders[:len(layerFolders)-1]
+	// Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manner on RS5 (ws2019) which led to some
+	// very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many
+	// parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths
+	// that bring about the errors we have observed aren't known.
+	//
+	// On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check.
+	var lErr error
+	for i := 0; i < 5; i++ {
+		lErr = func() (err error) {
+			if err := wclayer.ActivateLayer(ctx, path); err != nil {
+				return err
+			}
+
+			defer func() {
+				if err != nil {
+					_ = wclayer.DeactivateLayer(ctx, path)
+				}
+			}()
+
+			return wclayer.PrepareLayer(ctx, path, rest)
+		}()
+
+		if lErr != nil {
+			// Common errors seen from the RS5 behavior mentioned above are ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS
+			// tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED
+			// has been observed after launching multiple containers in parallel on a machine under high load. This has also been observed to be a trigger
+			// for ERROR_NOT_READY as well.
+			if hcserr, ok := lErr.(*hcserror.HcsError); ok {
+				if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED {
+					log.G(ctx).WithField("path", path).WithError(hcserr.Err).Warning("retrying layer operations after failure")
+
+					// Sleep for a little before a re-attempt. A probable cause for these issues in the first place is events not getting
+					// reported in time so might be good to give some time for things to "cool down" or get back to a known state.
+					time.Sleep(time.Millisecond * 100)
+					continue
+				}
+			}
+			// This was a failure case outside of the commonly known error conditions, don't retry here.
+			return "", nil, lErr
+		}
+
+		// No errors in layer setup, we can leave the loop
+		break
+	}
+	// If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check
+	// the loop error here and fail also.
+	if lErr != nil {
+		return "", nil, errors.Wrap(lErr, "layer retry loop failed")
+	}
+
+	// If any of the below fails, we want to detach the filter and unmount the disk.
+	defer func() {
+		if err != nil {
+			_ = wclayer.UnprepareLayer(ctx, path)
+			_ = wclayer.DeactivateLayer(ctx, path)
+		}
+	}()
+
+	mountPath, err := wclayer.GetLayerMountPath(ctx, path)
+	if err != nil {
+		return "", nil, err
+	}
+
+	// Mount the volume to a directory on the host if requested. This is the case for job containers.
+	if volumeMountPath != "" {
+		if err := MountSandboxVolume(ctx, volumeMountPath, mountPath); err != nil {
+			return "", nil, err
+		}
+	}
+
+	closer := &wcowHostLayersCloser{
+		volumeMountPath:        volumeMountPath,
+		scratchLayerFolderPath: path,
+	}
+	return mountPath, closer, nil
+}
 
-	if vm.OS() != "windows" {
-		return "", errors.New("MountWCOWLayers should only be called for WCOW")
-	}
-
-	// V2 UVM
+type wcowIsolatedLayersCloser struct {
+	uvm                     *uvm.UtilityVM
+	guestCombinedLayersPath string
+	scratchMount            resources.ResourceCloser
+	layerClosers            []resources.ResourceCloser
+}
+
+func (lc *wcowIsolatedLayersCloser) Release(ctx context.Context) (retErr error) {
+	if err := lc.uvm.RemoveCombinedLayersWCOW(ctx, lc.guestCombinedLayersPath); err != nil {
+		log.G(ctx).WithError(err).Error("failed RemoveCombinedLayersWCOW")
+		if retErr == nil {
+			retErr = fmt.Errorf("first error: %w", err)
+		}
+	}
+	if err := lc.scratchMount.Release(ctx); err != nil {
+		log.G(ctx).WithError(err).Error("failed WCOW scratch mount release")
+		if retErr == nil {
+			retErr = fmt.Errorf("first error: %w", err)
+		}
+	}
+	for i, closer := range lc.layerClosers {
+		if err := closer.Release(ctx); err != nil {
+			log.G(ctx).WithFields(logrus.Fields{
+				logrus.ErrorKey: err,
+				"layerIndex":    i,
+			}).Error("failed releasing WCOW layer")
+			if retErr == nil {
+				retErr = fmt.Errorf("first error: %w", err)
+			}
+		}
+	}
+	return
+}
+
+func mountWCOWIsolatedLayers(ctx context.Context, containerID string, layerFolders []string, guestRoot, volumeMountPath string, vm *uvm.UtilityVM) (_ string, _ resources.ResourceCloser, err error) {
 	log.G(ctx).WithField("os", vm.OS()).Debug("hcsshim::MountWCOWLayers V2 UVM")
 
 	var (
-		layersAdded []string
+		layersAdded  []string
+		layerClosers []resources.ResourceCloser
 	)
 	defer func() {
 		if err != nil {
-			for _, l := range layersAdded {
-				if err := vm.RemoveVSMB(ctx, l, true); err != nil {
+			for _, l := range layerClosers {
+				if err := l.Release(ctx); err != nil {
 					log.G(ctx).WithError(err).Warn("failed to remove wcow layer on cleanup")
 				}
 			}
@@ -269,16 +326,18 @@ func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []str
 		log.G(ctx).WithField("layerPath", layerPath).Debug("mounting layer")
 		options := vm.DefaultVSMBOptions(true)
 		options.TakeBackupPrivilege = true
-		if _, err := vm.AddVSMB(ctx, layerPath, options); err != nil {
-			return "", fmt.Errorf("failed to add VSMB layer: %s", err)
+		mount, err := vm.AddVSMB(ctx, layerPath, options)
+		if err != nil {
+			return "", nil, fmt.Errorf("failed to add VSMB layer: %s", err)
 		}
 		layersAdded = append(layersAdded, layerPath)
+		layerClosers = append(layerClosers, mount)
 	}
 
 	containerScratchPathInUVM := ospath.Join(vm.OS(), guestRoot)
 	hostPath, err := getScratchVHDPath(layerFolders)
 	if err != nil {
-		return "", fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err)
+		return "", nil, fmt.Errorf("failed to get scratch VHD path in layer folders: %s", err)
 	}
 	log.G(ctx).WithField("hostPath", hostPath).Debug("mounting scratch VHD")
 
@@ -293,13 +352,13 @@ func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []str
 		uvm.VMAccessTypeIndividual,
 	)
 	if err != nil {
-		return "", fmt.Errorf("failed to add SCSI scratch VHD: %s", err)
+		return "", nil, fmt.Errorf("failed to add SCSI scratch VHD: %s", err)
 	}
 	containerScratchPathInUVM = scsiMount.UVMPath
 
 	defer func() {
 		if err != nil {
-			if err := vm.RemoveSCSI(ctx, hostPath); err != nil {
+			if err := scsiMount.Release(ctx); err != nil {
 				log.G(ctx).WithError(err).Warn("failed to remove scratch on cleanup")
 			}
 		}
@@ -310,30 +369,36 @@ func MountWCOWLayers(ctx context.Context, containerID string, layerFolders []str
 	var layers []hcsschema.Layer
 	layers, err = GetHCSLayers(ctx, vm, layersAdded)
 	if err != nil {
-		return "", err
+		return "", nil, err
 	}
 	err = vm.CombineLayersWCOW(ctx, layers, containerScratchPathInUVM)
 	if err != nil {
-		return "", err
+		return "", nil, err
 	}
 	log.G(ctx).Debug("hcsshim::MountWCOWLayers Succeeded")
-	return containerScratchPathInUVM, nil
+	closer := &wcowIsolatedLayersCloser{
+		uvm:                     vm,
+		guestCombinedLayersPath: containerScratchPathInUVM,
+		scratchMount:            scsiMount,
+		layerClosers:            layerClosers,
+	}
+	return containerScratchPathInUVM, closer, nil
 }
 
-func addLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) (uvmPath string, err error) {
+func addLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) (uvmPath string, _ resources.ResourceCloser, err error) {
 	// don't try to add as vpmem when we want additional devices on the uvm to be fully physically backed
 	if !vm.DevicesPhysicallyBacked() {
 		// We first try vPMEM and if it is full or the file is too large we
 		// fall back to SCSI.
-		uvmPath, err = vm.AddVPMem(ctx, layerPath)
+		mount, err := vm.AddVPMem(ctx, layerPath)
 		if err == nil {
 			log.G(ctx).WithFields(logrus.Fields{
 				"layerPath": layerPath,
 				"layerType": "vpmem",
 			}).Debug("Added LCOW layer")
-			return uvmPath, nil
+			return mount.GuestPath, mount, nil
 		} else if err != uvm.ErrNoAvailableLocation && err != uvm.ErrMaxVPMemLayerSize {
-			return "", fmt.Errorf("failed to add VPMEM layer: %s", err)
+			return "", nil, fmt.Errorf("failed to add VPMEM layer: %s", err)
 		}
 	}
 
@@ -341,153 +406,13 @@ func addLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) (uvm
 	uvmPath = fmt.Sprintf(guestpath.LCOWGlobalMountPrefixFmt, vm.UVMMountCounter())
 	sm, err := vm.AddSCSI(ctx, layerPath, uvmPath, true, false, options, uvm.VMAccessTypeNoop)
 	if err != nil {
-		return "", fmt.Errorf("failed to add SCSI layer: %s", err)
+		return "", nil, fmt.Errorf("failed to add SCSI layer: %s", err)
 	}
 	log.G(ctx).WithFields(logrus.Fields{
 		"layerPath": layerPath,
 		"layerType": "scsi",
 	}).Debug("Added LCOW layer")
-	return sm.UVMPath, nil
-}
-
-func removeLCOWLayer(ctx context.Context, vm *uvm.UtilityVM, layerPath string) error {
-	// Assume it was added to vPMEM and fall back to SCSI
-	err := vm.RemoveVPMem(ctx, layerPath)
-	if err == nil {
-		log.G(ctx).WithFields(logrus.Fields{
-			"layerPath": layerPath,
-			"layerType": "vpmem",
-		}).Debug("Removed LCOW layer")
-		return nil
-	} else if err == uvm.ErrNotAttached {
-		err = vm.RemoveSCSI(ctx, layerPath)
-		if err == nil {
-			log.G(ctx).WithFields(logrus.Fields{
-				"layerPath": layerPath,
-				"layerType": "scsi",
-			}).Debug("Removed LCOW layer")
-			return nil
-		}
-		return errors.Wrap(err, "failed to remove SCSI layer")
-	}
-	return errors.Wrap(err, "failed to remove VPMEM layer")
-}
-
-// UnmountOperation is used when calling Unmount() to determine what type of unmount is
-// required. In V1 schema, this must be unmountOperationAll. In V2, client can
-// be more optimal and only unmount what they need which can be a minor performance
-// improvement (eg if you know only one container is running in a utility VM, and
-// the UVM is about to be torn down, there's no need to unmount the VSMB shares,
-// just SCSI to have a consistent file system).
-type UnmountOperation uint
-
-const (
-	UnmountOperationSCSI  UnmountOperation = 0x01
-	UnmountOperationVSMB                   = 0x02
-	UnmountOperationVPMEM                  = 0x04
-	UnmountOperationAll                    = UnmountOperationSCSI | UnmountOperationVSMB | UnmountOperationVPMEM
-)
-
-// UnmountContainerLayers is a helper for clients to hide all the complexity of layer unmounting
-func UnmountContainerLayers(ctx context.Context, layerFolders []string, containerRootPath, volumeMountPath string, vm *uvm.UtilityVM, op UnmountOperation) error {
-	log.G(ctx).WithField("layerFolders", layerFolders).Debug("hcsshim::unmountContainerLayers")
-	if vm == nil {
-		// Must be an argon - folders are mounted on the host
-		if op != UnmountOperationAll {
-			return errors.New("only operation supported for host-mounted folders is unmountOperationAll")
-		}
-		if len(layerFolders) < 1 {
-			return errors.New("need at least one layer for Unmount")
-		}
-
-		// Remove the mount point if there is one. This is the fallback case for job containers
-		// if no bind mount support is available.
-		if volumeMountPath != "" {
-			if err := RemoveSandboxMountPoint(ctx, volumeMountPath); err != nil {
-				return err
-			}
-		}
-
-		path := layerFolders[len(layerFolders)-1]
-		if err := wclayer.UnprepareLayer(ctx, path); err != nil {
-			return err
-		}
-		return wclayer.DeactivateLayer(ctx, path)
-	}
-
-	// V2 Xenon
-
-	// Base+Scratch as a minimum. This is different to v1 which only requires the scratch
-	if len(layerFolders) < 2 {
-		return errors.New("at least two layers are required for unmount")
-	}
-
-	var retError error
-
-	// Always remove the combined layers as they are part of scsi/vsmb/vpmem
-	// removals.
-	if vm.OS() == "windows" {
-		if err := vm.RemoveCombinedLayersWCOW(ctx, containerRootPath); err != nil {
-			log.G(ctx).WithError(err).Warn("failed guest request to remove combined layers")
-			retError = err
-		}
-	} else {
-		if err := vm.RemoveCombinedLayersLCOW(ctx, containerRootPath); err != nil {
-			log.G(ctx).WithError(err).Warn("failed guest request to remove combined layers")
-			retError = err
-		}
-	}
-
-	// Unload the SCSI scratch path
-	if (op & UnmountOperationSCSI) == UnmountOperationSCSI {
-		hostScratchFile, err := getScratchVHDPath(layerFolders)
-		if err != nil {
-			return errors.Wrap(err, "failed to get scratch VHD path in layer folders")
-		}
-		if err := vm.RemoveSCSI(ctx, hostScratchFile); err != nil {
-			log.G(ctx).WithError(err).Warn("failed to remove scratch")
-			if retError == nil {
-				retError = err
-			} else {
-				retError = errors.Wrapf(retError, err.Error())
-			}
-		}
-	}
-
-	// Remove each of the read-only layers from VSMB. These's are ref-counted and
-	// only removed once the count drops to zero. This allows multiple containers
-	// to share layers.
-	if vm.OS() == "windows" && (op&UnmountOperationVSMB) == UnmountOperationVSMB {
-		for _, layerPath := range layerFolders[:len(layerFolders)-1] {
-			if e := vm.RemoveVSMB(ctx, layerPath, true); e != nil {
-				log.G(ctx).WithError(e).Warn("remove VSMB failed")
-				if retError == nil {
-					retError = e
-				} else {
-					retError = errors.Wrapf(retError, e.Error())
-				}
-			}
-		}
-	}
-
-	// Remove each of the read-only layers from VPMEM (or SCSI). These's are ref-counted
-	// and only removed once the count drops to zero. This allows multiple containers to
-	// share layers. Note that SCSI is used on large layers.
-	if vm.OS() == "linux" && (op&UnmountOperationVPMEM) == UnmountOperationVPMEM {
-		for _, layerPath := range layerFolders[:len(layerFolders)-1] {
-			hostPath := filepath.Join(layerPath, "layer.vhd")
-			if err := removeLCOWLayer(ctx, vm, hostPath); err != nil {
-				log.G(ctx).WithError(err).Warn("remove layer failed")
-				if retError == nil {
-					retError = err
-				} else {
-					retError = errors.Wrapf(retError, err.Error())
-				}
-			}
-		}
-	}
-
-	return retError
+	return sm.UVMPath, sm, nil
 }
 
 // GetHCSLayers converts host paths corresponding to container layers into HCS schema V2 layers
@@ -506,13 +431,6 @@ func GetHCSLayers(ctx context.Context, vm *uvm.UtilityVM, paths []string) (layer
 	return layers, nil
 }
 
-func containerRootfsPath(vm *uvm.UtilityVM, rootPath string) string {
-	if vm.OS() == "windows" {
-		return ospath.Join(vm.OS(), rootPath)
-	}
-	return ospath.Join(vm.OS(), rootPath, guestpath.RootfsPath)
-}
-
 func getScratchVHDPath(layerFolders []string) (string, error) {
 	hostPath := filepath.Join(layerFolders[len(layerFolders)-1], "sandbox.vhdx")
 	// For LCOW, we can reuse another container's scratch space (usually the sandbox container's).
diff --git a/internal/resources/resources.go b/internal/resources/resources.go
index 319c88461f..d1f83dbc64 100644
--- a/internal/resources/resources.go
+++ b/internal/resources/resources.go
@@ -7,7 +7,6 @@ import (
 	"errors"
 
 	"github.com/Microsoft/hcsshim/internal/credentials"
-	"github.com/Microsoft/hcsshim/internal/layers"
 	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/uvm"
 )
@@ -51,7 +50,7 @@ func (r *Resources) LcowScratchPath() string {
 }
 
 // SetLayers updates the container resource's image layers
-func (r *Resources) SetLayers(l *layers.ImageLayers) {
+func (r *Resources) SetLayers(l ResourceCloser) {
 	r.layers = l
 }
 
@@ -86,7 +85,7 @@ type Resources struct {
 	// addedNetNSToVM indicates if the network namespace has been added to the containers utility VM
 	addedNetNSToVM bool
 	// layers is a pointer to a struct of the layers paths of a container
-	layers *layers.ImageLayers
+	layers ResourceCloser
 	// resources is a slice of the resources associated with a container
 	resources []ResourceCloser
 }
@@ -157,9 +156,7 @@ func ReleaseResources(ctx context.Context, r *Resources, vm *uvm.UtilityVM, all
 	}
 
 	if r.layers != nil {
-		// TODO dcantah: Either make it so layers doesn't rely on the all bool for cleanup logic
-		// or find a way to factor out the all bool in favor of something else.
-		if err := r.layers.Release(ctx, all); err != nil {
+		if err := r.layers.Release(ctx); err != nil {
 			return err
 		}
 	}
diff --git a/internal/uvm/vpmem.go b/internal/uvm/vpmem.go
index cde51aa014..4d0d50d963 100644
--- a/internal/uvm/vpmem.go
+++ b/internal/uvm/vpmem.go
@@ -29,6 +29,18 @@ var (
 	ErrMaxVPMemLayerSize = errors.New("layer size is to large for VPMEM max size")
 )
 
+// var _ resources.ResourceCloser = &VPMEMMount{} -- Causes an import cycle.
+
+type VPMEMMount struct {
+	GuestPath string
+	uvm       *UtilityVM
+	hostPath  string
+}
+
+func (vc *VPMEMMount) Release(ctx context.Context) error {
+	return vc.uvm.RemoveVPMem(ctx, vc.hostPath)
+}
+
 type vPMemInfoDefault struct {
 	hostPath string
 	uvmPath  string
@@ -235,18 +247,27 @@ func (uvm *UtilityVM) removeVPMemDefault(ctx context.Context, hostPath string) e
 	return nil
 }
 
-func (uvm *UtilityVM) AddVPMem(ctx context.Context, hostPath string) (string, error) {
+func (uvm *UtilityVM) AddVPMem(ctx context.Context, hostPath string) (*VPMEMMount, error) {
 	if uvm.operatingSystem != "linux" {
-		return "", errNotSupported
+		return nil, errNotSupported
 	}
 
 	uvm.m.Lock()
 	defer uvm.m.Unlock()
 
+	var (
+		guestPath string
+		err       error
+	)
 	if uvm.vpmemMultiMapping {
-		return uvm.addVPMemMappedDevice(ctx, hostPath)
+		guestPath, err = uvm.addVPMemMappedDevice(ctx, hostPath)
+	} else {
+		guestPath, err = uvm.addVPMemDefault(ctx, hostPath)
+	}
+	if err != nil {
+		return nil, err
 	}
-	return uvm.addVPMemDefault(ctx, hostPath)
+	return &VPMEMMount{GuestPath: guestPath, uvm: uvm, hostPath: hostPath}, nil
 }
 
 func (uvm *UtilityVM) RemoveVPMem(ctx context.Context, hostPath string) error {
diff --git a/test/functional/uvm_vpmem_test.go b/test/functional/uvm_vpmem_test.go
index 70b6118e5b..78f9ee7645 100644
--- a/test/functional/uvm_vpmem_test.go
+++ b/test/functional/uvm_vpmem_test.go
@@ -37,11 +37,11 @@ func TestVPMEM(t *testing.T) {
 	}
 
 	for i := 0; i < int(iterations); i++ {
-		uvmPath, err := u.AddVPMem(ctx, filepath.Join(tempDir, "layer.vhd"))
+		mount, err := u.AddVPMem(ctx, filepath.Join(tempDir, "layer.vhd"))
 		if err != nil {
 			t.Fatalf("AddVPMEM failed: %s", err)
 		}
-		t.Logf("exposed as %s", uvmPath)
+		t.Logf("exposed as %s", mount.GuestPath)
 	}
 
 	// Remove them all
diff --git a/test/functional/wcow_test.go b/test/functional/wcow_test.go
index de42540b5b..5fed699865 100644
--- a/test/functional/wcow_test.go
+++ b/test/functional/wcow_test.go
@@ -371,7 +371,6 @@ func TestWCOWArgonShim(t *testing.T) {
 	requireFeatures(t, featureWCOW)
 
 	imageLayers := windowsServercoreImageLayers(context.Background(), t)
-	argonShimMounted := false
 	argonShimScratchDir := t.TempDir()
 	if err := wclayer.CreateScratchLayer(context.Background(), argonShimScratchDir, imageLayers); err != nil {
@@ -381,25 +380,17 @@ func TestWCOWArgonShim(t *testing.T) {
 	hostRWSharedDirectory, hostROSharedDirectory := createTestMounts(t)
 	layers := generateShimLayersStruct(t, imageLayers)
 
-	// For cleanup on failure
-	defer func() {
-		if argonShimMounted {
-			_ = layerspkg.UnmountContainerLayers(context.Background(),
-				append(imageLayers, argonShimScratchDir),
-				"",
-				"",
-				nil,
-				layerspkg.UnmountOperationAll)
-		}
-	}()
-
 	id := "argon"
 	// This is a cheat but stops us re-writing exactly the same code just for test
-	argonShimLocalMountPath, err := layerspkg.MountWCOWLayers(context.Background(), id, append(imageLayers, argonShimScratchDir), "", "", nil)
+	argonShimLocalMountPath, closer, err := layerspkg.MountWCOWLayers(context.Background(), id, append(imageLayers, argonShimScratchDir), "", "", nil)
 	if err != nil {
 		t.Fatal(err)
 	}
-	argonShimMounted = true
+	defer func() {
+		if closer != nil {
+			closer.Release(context.Background())
+		}
+	}()
 	argonShim, err := hcsshim.CreateContainer(id, &hcsshim.ContainerConfig{
 		SystemType: "Container",
 		Name:       "argonShim",
@@ -428,17 +419,10 @@ func TestWCOWArgonShim(t *testing.T) {
 	}
 	runShimCommands(t, argonShim)
 	stopContainer(t, argonShim)
-	if err := layerspkg.UnmountContainerLayers(
-		context.Background(),
-		append(imageLayers, argonShimScratchDir),
-		"",
-		"",
-		nil,
-		layerspkg.UnmountOperationAll,
-	); err != nil {
+	if err := closer.Release(context.Background()); err != nil {
 		t.Fatal(err)
 	}
-	argonShimMounted = false
+	closer = nil
 }
 
 // Xenon through HCSShim interface (v1)