Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP]Delete corrupt exist lower layer when pull image #42978

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
36 changes: 33 additions & 3 deletions daemon/graphdriver/overlay2/overlay.go
Expand Up @@ -663,10 +663,40 @@ func (d *Driver) Put(id string) error {
return nil
}

// Exists checks to see if the id is already mounted.
// Exists checks to see if the id is already mounted and valid.
func (d *Driver) Exists(id string) bool {
_, err := os.Stat(d.dir(id))
return err == nil
dir := d.dir(id)
_, err := os.Stat(dir)
if err != nil {
return false
}

// Check link file, link file must exist and has content.
link, err := os.ReadFile(path.Join(dir, "link"))
if err != nil {
return false
}
if len(link) == 0 {
return false
}

// Check lower file, lower file must has content if exist.
lowerFilePath := path.Join(dir, lowerFile)
_, err = os.Stat(lowerFilePath)
if os.IsNotExist(err) {
return true
}
if err != nil {
return false
}
lower, err := os.ReadFile(lowerFilePath)
if err != nil {
return false
}
if len(lower) == 0 {
return false
}
return true
}

// isParent determines whether the given parent is the direct parent of the
Expand Down
3 changes: 3 additions & 0 deletions layer/layer.go
Expand Up @@ -25,6 +25,9 @@ var (
// attempted on a layer which does not exist.
ErrLayerDoesNotExist = errors.New("layer does not exist")

// ErrLayerCorrupt is used when layer cache metadata corrupt
ErrLayerCorrupt = errors.New("layer cache metadata corrupt")

// ErrLayerNotRetained is used when a release is
// attempted on a layer which is not retained.
ErrLayerNotRetained = errors.New("layer not retained")
Expand Down
31 changes: 28 additions & 3 deletions layer/layer_store.go
Expand Up @@ -345,6 +345,11 @@ func (ls *layerStore) registerWithDescriptor(ts io.Reader, parent ChainID, descr
return nil, err
}

// If storage driver is overlay2, check the layer cache metadata before register
if ls.driver.String() == "overlay2" && !ls.driver.Exists(layer.cacheID) {
return nil, ErrLayerCorrupt
}

if layer.parent == nil {
layer.chainID = ChainID(layer.diffID)
} else {
Expand All @@ -359,9 +364,14 @@ func (ls *layerStore) registerWithDescriptor(ts io.Reader, parent ChainID, descr
defer ls.layerL.Unlock()

if existingLayer := ls.getWithoutLock(layer.chainID); existingLayer != nil {
// Set error for cleanup, but do not return the error
err = errors.New("layer already exists")
return existingLayer.getReference(), nil
if ls.driver.String() != "overlay2" || ls.driver.Exists(existingLayer.cacheID) {
// Set error for cleanup, but do not return the error
err = errors.New("layer already exists")
return existingLayer.getReference(), nil
}
if err := ls.deleteCorruptLayer(existingLayer); err != nil {
logrus.Errorf("delete corrupt layer failed when register with cached id %s, chain id %s, error %s", existingLayer.cacheID, existingLayer.chainID, err.Error())
}
}

if err = tx.Commit(layer.chainID); err != nil {
Expand Down Expand Up @@ -398,10 +408,25 @@ func (ls *layerStore) Get(l ChainID) (Layer, error) {
if layer == nil {
return nil, ErrLayerDoesNotExist
}
if ls.driver.String() == "overlay2" && !ls.driver.Exists(layer.cacheID) {
if err := ls.deleteCorruptLayer(layer); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though the layer itself is corrupt, deleting it at this level doesn't really seem right.
For drivers like btrfs this especially wouldn't work (granted it seems like it would be weirder to get into this situation with btrfs).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like if we want to recover from this we need a way to repair the layer, but this seems non-trivial from a more generic standpoint.

logrus.Errorf("delete corrupt layer failed when get with cached id %s, chain id %s, error %s", layer.cacheID, layer.chainID, err.Error())
return nil, err
}
return nil, ErrLayerDoesNotExist
}

return layer.getReference(), nil
}

func (ls *layerStore) deleteCorruptLayer(l *roLayer) error {
logrus.Infof("delete corrupt layer with cached id %s, chain id %s", l.cacheID, l.chainID)
delete(ls.layerMap, l.chainID)
err := ls.driver.Remove(l.cacheID)

return err
}

func (ls *layerStore) Map() map[ChainID]Layer {
ls.layerL.Lock()
defer ls.layerL.Unlock()
Expand Down