Skip to content

Commit

Permalink
fix(hatchery/swarm): add timeout on docker calls (#3399)
Browse files Browse the repository at this point in the history
  • Loading branch information
yesnault authored Oct 4, 2018
1 parent 4d0b6c3 commit 9bf9df8
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 19 deletions.
1 change: 1 addition & 0 deletions engine/hatchery/marathon/marathon.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ func (h *HatcheryMarathon) CheckConfiguration(cfg interface{}) error {

//Custom http client with 3 retries
httpClient := &http.Client{
Timeout: time.Minute,
Transport: &httpcontrol.Transport{
RequestTimeout: time.Minute,
MaxTries: 3,
Expand Down
15 changes: 11 additions & 4 deletions engine/hatchery/swarm/swarm.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
types "github.com/docker/docker/api/types"
docker "github.com/docker/docker/client"
"github.com/docker/go-connections/tlsconfig"
"github.com/facebookgo/httpcontrol"
"github.com/gorilla/mux"
context "golang.org/x/net/context"

Expand Down Expand Up @@ -46,7 +47,9 @@ func (h *HatcherySwarm) Init() error {
log.Error("hatchery> swarm> unable to connect to a docker client:%s", errc)
return errc
}
if _, errPing := d.Ping(context.Background()); errPing != nil {
ctxDocker, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if _, errPing := d.Ping(ctxDocker); errPing != nil {
log.Error("hatchery> swarm> unable to ping docker host:%s", errPing)
return errPing
}
Expand All @@ -61,6 +64,7 @@ func (h *HatcherySwarm) Init() error {
for hostName, cfg := range h.Config.DockerEngines {
log.Info("hatchery> swarm> connecting to %s: %s", hostName, cfg.Host)
httpClient := new(http.Client)
httpClient.Timeout = 30 * time.Second
if cfg.CertPath != "" {
options := tlsconfig.Options{
CAFile: filepath.Join(cfg.CertPath, "ca.pem"),
Expand Down Expand Up @@ -108,18 +112,21 @@ func (h *HatcherySwarm) Init() error {
continue
}

httpClient.Transport = &http.Transport{
httpClient.Transport = &httpcontrol.Transport{
RequestTimeout: 30 * time.Second,
TLSClientConfig: tlsc,
}
} else {
httpClient.Transport = &http.Transport{}
httpClient.Transport = &httpcontrol.Transport{RequestTimeout: 30 * time.Second}
}
d, errc := docker.NewClientWithOpts(docker.WithHost(cfg.Host), docker.WithVersion(cfg.APIVersion), docker.WithHTTPClient(httpClient))
if errc != nil {
log.Error("hatchery> swarm> unable to connect to a docker client:%s for host %s (%s)", hostName, cfg.Host, errc)
continue
}
if _, errPing := d.Ping(context.Background()); errPing != nil {
ctxDocker, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if _, errPing := d.Ping(ctxDocker); errPing != nil {
log.Error("hatchery> swarm> unable to ping docker host:%s", errPing)
continue
}
Expand Down
4 changes: 3 additions & 1 deletion engine/hatchery/swarm/swarm_conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ func (h *HatcherySwarm) Status() sdk.MonitoringStatus {
for dockerName, dockerClient := range h.dockerClients {
//Check images
status := sdk.MonitoringStatusOK
images, err := dockerClient.ImageList(context.Background(), types.ImageListOptions{All: true})
ctxList, cancelList := context.WithTimeout(context.Background(), 20*time.Second)
defer cancelList()
images, err := dockerClient.ImageList(ctxList, types.ImageListOptions{All: true})
if err != nil {
log.Warning("hatchery> swarm> %s> Status> Unable to list images on %s: %s", h.Name, dockerName, err)
status = sdk.MonitoringStatusAlert
Expand Down
2 changes: 1 addition & 1 deletion engine/hatchery/swarm/swarm_util_create.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ checkImage:
c, err := dockerClient.ContainerCreate(ctx, config, hostConfig, networkingConfig, name)
if err != nil {
next()
return sdk.WrapError(err, "createAndStartContainer> Unable to create container %s", name)
return sdk.WrapError(err, "createAndStartContainer> Unable to create container %s on %s", name, dockerClient.name)
}
next()

Expand Down
9 changes: 6 additions & 3 deletions engine/hatchery/swarm/swarm_util_get.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package swarm

import (
"strings"
"time"

types "github.com/docker/docker/api/types"
context "golang.org/x/net/context"
Expand All @@ -10,17 +11,19 @@ import (
)

// getContainers lists the containers on the given docker client that match
// options. It returns the slice produced by ContainerList, or a wrapped error
// when the listing fails.
//
// NOTE(review): this hunk is rendered without +/- markers, so each changed
// statement appears twice; presumably the first of each pair is the removed
// line and the second the added one — confirm against the commit.
func (h *HatcherySwarm) getContainers(dockerClient *dockerClient, options types.ContainerListOptions) ([]types.Container, error) {
// Listing call made with context.Background(), i.e. with no deadline.
s, err := dockerClient.ContainerList(context.Background(), options)
// Same listing call bounded by a 10-second timeout; the deferred cancelList
// runs when getContainers returns.
ctxList, cancelList := context.WithTimeout(context.Background(), 10*time.Second)
defer cancelList()
s, err := dockerClient.ContainerList(ctxList, options)
if err != nil {
return nil, sdk.WrapError(err, "hatchery> swarm> getContainers> unable to list containers")
// Variant of the error message that also names the docker client.
return nil, sdk.WrapError(err, "hatchery> swarm> getContainers> unable to list containers on %s", dockerClient.name)
}
return s, nil
}

func (h *HatcherySwarm) getContainer(dockerClient *dockerClient, name string, options types.ContainerListOptions) (*types.Container, error) {
containers, err := h.getContainers(dockerClient, options)
if err != nil {
return nil, sdk.WrapError(err, "hatchery> swarm> getContainer> cannot getContainers")
return nil, sdk.WrapError(err, "hatchery> swarm> getContainer> cannot getContainers on %s", dockerClient.name)
}

for i := range containers {
Expand Down
36 changes: 26 additions & 10 deletions engine/hatchery/swarm/swarm_util_kill.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@ const (
)

func (h *HatcherySwarm) killAndRemove(dockerClient *dockerClient, ID string) error {
container, err := dockerClient.ContainerInspect(context.Background(), ID)
ctxList, cancelList := context.WithTimeout(context.Background(), 3*time.Second)
defer cancelList()
container, err := dockerClient.ContainerInspect(ctxList, ID)
if err != nil {
//If there is an error, we try to remove the container
if strings.Contains(err.Error(), "No such container") {
log.Debug("hatchery> swarm> killAndRemove> cannot InspectContainer: %v", err)
log.Debug("hatchery> swarm> killAndRemove> cannot InspectContainer: %v on %s", err, dockerClient.name)
return nil
}
log.Info("hatchery> swarm> killAndRemove> cannot InspectContainer: %v", err)
log.Info("hatchery> swarm> killAndRemove> cannot InspectContainer: %v on %s", err, dockerClient.name)
} else {
// If its a worker "register", check registration before deleting it
if strings.Contains(container.Name, "register-") {
Expand All @@ -50,7 +52,9 @@ func (h *HatcherySwarm) killAndRemove(dockerClient *dockerClient, ID string) err

for _, cnetwork := range container.NetworkSettings.Networks {
//Get the network
network, err := dockerClient.NetworkInspect(context.Background(), cnetwork.NetworkID, types.NetworkInspectOptions{})
ctxList, cancelList := context.WithTimeout(context.Background(), 3*time.Second)
defer cancelList()
network, err := dockerClient.NetworkInspect(ctxList, cnetwork.NetworkID, types.NetworkInspectOptions{})
if err != nil {
if !strings.Contains(err.Error(), "No such network") {
return sdk.WrapError(err, "hatchery> swarm> killAndRemove> unable to get network for %s on %s", ID[:7], dockerClient.name)
Expand All @@ -75,7 +79,9 @@ func (h *HatcherySwarm) killAndRemove(dockerClient *dockerClient, ID string) err

//Finally remove the network
log.Info("hatchery> swarm> remove network %s (%s)", network.Name, network.ID)
if err := dockerClient.NetworkRemove(context.Background(), network.ID); err != nil {
ctxDocker, cancelList := context.WithTimeout(context.Background(), 10*time.Second)
defer cancelList()
if err := dockerClient.NetworkRemove(ctxDocker, network.ID); err != nil {
log.Error("hatchery> swarm> killAndRemove> unable to kill and remove network %s from %s err:%s", network.ID[:12], dockerClient.name, err)
}
}
Expand All @@ -84,13 +90,17 @@ func (h *HatcherySwarm) killAndRemove(dockerClient *dockerClient, ID string) err

func (h *HatcherySwarm) killAndRemoveContainer(dockerClient *dockerClient, ID string) error {
log.Debug("hatchery> swarm> killAndRemove> remove container %s on %s", ID, dockerClient.name)
if err := dockerClient.ContainerKill(context.Background(), ID, "SIGKILL"); err != nil {
ctxDocker, cancelList := context.WithTimeout(context.Background(), 20*time.Second)
defer cancelList()
if err := dockerClient.ContainerKill(ctxDocker, ID, "SIGKILL"); err != nil {
if !strings.Contains(err.Error(), "is not running") && !strings.Contains(err.Error(), "No such container") {
return sdk.WrapError(err, "hatchery> swarm> killAndRemove> err on kill container %v from %s", err, dockerClient.name)
}
}

if err := dockerClient.ContainerRemove(context.Background(), ID, types.ContainerRemoveOptions{Force: true}); err != nil {
ctxDockerRemove, cancelList := context.WithTimeout(context.Background(), 20*time.Second)
defer cancelList()
if err := dockerClient.ContainerRemove(ctxDockerRemove, ID, types.ContainerRemoveOptions{Force: true}); err != nil {
// container could be already removed by a previous call to docker
if !strings.Contains(err.Error(), "No such container") {
return sdk.WrapError(err, "hatchery> swarm> killAndRemove> Unable to remove container %s form %s", ID, dockerClient.name)
Expand All @@ -103,14 +113,18 @@ func (h *HatcherySwarm) killAndRemoveContainer(dockerClient *dockerClient, ID st
func (h *HatcherySwarm) killAwolNetworks() error {
for _, dockerClient := range h.dockerClients {
//Checking networks
nets, errLN := dockerClient.NetworkList(context.Background(), types.NetworkListOptions{})
ctxDocker, cancelList := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelList()
nets, errLN := dockerClient.NetworkList(ctxDocker, types.NetworkListOptions{})
if errLN != nil {
log.Warning("hatchery> swarm> killAwolNetworks> Cannot get networks on %s: %s", dockerClient.name, errLN)
return errLN
}

for i := range nets {
n, err := dockerClient.NetworkInspect(context.Background(), nets[i].ID, types.NetworkInspectOptions{})
ctxDocker, cancelList := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelList()
n, err := dockerClient.NetworkInspect(ctxDocker, nets[i].ID, types.NetworkInspectOptions{})
if err != nil {
log.Warning("hatchery> swarm> killAwolNetworks> Unable to get network info: %v", err)
continue
Expand All @@ -134,7 +148,9 @@ func (h *HatcherySwarm) killAwolNetworks() error {
}

log.Info("hatchery> swarm> killAwolNetworks> remove network[%s] %s on %s (created on %v)", n.ID, n.Name, dockerClient.name, n.Created)
if err := dockerClient.NetworkRemove(context.Background(), n.ID); err != nil {
ctxDocker2, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := dockerClient.NetworkRemove(ctxDocker2, n.ID); err != nil {
log.Warning("hatchery> swarm> killAwolNetworks> Unable to delete network %s err:%s", n.Name, err)
}
}
Expand Down
1 change: 1 addition & 0 deletions sdk/cdsclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ func NewService(endpoint string, timeout time.Duration, insecureSkipVerifyTLS bo
cli := new(client)
cli.config = conf
cli.HTTPClient = &http.Client{
Timeout: timeout,
Transport: &httpcontrol.Transport{
RequestTimeout: timeout,
MaxTries: 5,
Expand Down

0 comments on commit 9bf9df8

Please sign in to comment.