diff --git a/daemon/daemon.go b/daemon/daemon.go index 014ec970a98a4..acc619a6af142 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -866,6 +866,24 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S registerMetricsPluginCallback(d.PluginStore, metricsSockPath) gopts := []grpc.DialOption{ + // WithBlock makes sure that the following containerd request + // is reliable. + // + // NOTE: In one edge case with high load pressure, kernel kills + // dockerd, containerd and containerd-shims caused by OOM. + // When both dockerd and containerd restart, but containerd + // will take time to recover all the existing containers. Before + // containerd serving, dockerd will failed with gRPC error. + // That bad thing is that restore action will still ignore the + // any non-NotFound errors and returns running state for + // already stopped container. It is unexpected behavior. And + // we need to restart dockerd to make sure that anything is OK. + // + // It is painful. Add WithBlock can prevent the edge case. And + // n common case, the containerd will be serving in shortly. + // It is not harm to add WithBlock for containerd connection. + grpc.WithBlock(), + grpc.WithInsecure(), grpc.WithBackoffMaxDelay(3 * time.Second), grpc.WithDialer(dialer.Dialer),