Skip to content

Commit

Permalink
Add a minimum of retries before timing out "okteto up" loops (#1588)
Browse files: browse the repository at this point in the history
Signed-off-by: Pablo Chico de Guzman <pchico83@gmail.com>
  • Loading branch information
pchico83 committed Jun 2, 2021
1 parent 248e676 commit d354c3f
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 15 deletions.
2 changes: 2 additions & 0 deletions cmd/up/activate.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ func (up *upContext) waitUntilDevelopmentContainerIsRunning(ctx context.Context)
if !ok {
watcherEvents, err = up.Client.CoreV1().Events(up.Dev.Namespace).Watch(ctx, optsWatchEvents)
if err != nil {
log.Infof("error watching events: %s", err.Error())
return err
}
continue
Expand Down Expand Up @@ -352,6 +353,7 @@ func (up *upContext) waitUntilDevelopmentContainerIsRunning(ctx context.Context)
if !ok {
watcherPod, err = up.Client.CoreV1().Pods(up.Dev.Namespace).Watch(ctx, optsWatchPod)
if err != nil {
log.Infof("error watching pod events: %s", err.Error())
return err
}
continue
Expand Down
1 change: 1 addition & 0 deletions pkg/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ func IsTransient(err error) bool {
case strings.Contains(err.Error(), "operation time out"),
strings.Contains(err.Error(), "operation timed out"),
strings.Contains(err.Error(), "i/o timeout"),
strings.Contains(err.Error(), "unknown (get events)"),
strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers"),
strings.Contains(err.Error(), "can't assign requested address"),
strings.Contains(err.Error(), "command exited without exit status or exit signal"),
Expand Down
4 changes: 2 additions & 2 deletions pkg/k8s/deployments/crud.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ func UpdateOktetoRevision(ctx context.Context, d *appsv1.Deployment, client *kub
ticker := time.NewTicker(200 * time.Millisecond)
to := time.Now().Add(timeout * 2) // 60 seconds

for i := 0; ; i++ {
for retries := 0; ; retries++ {
updated, err := client.AppsV1().Deployments(d.Namespace).Get(ctx, d.Name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get deployment %s/%s: %w", d.Namespace, d.Name, err)
Expand All @@ -274,7 +274,7 @@ func UpdateOktetoRevision(ctx context.Context, d *appsv1.Deployment, client *kub
return Update(ctx, d, client)
}

if time.Now().After(to) {
if time.Now().After(to) && retries >= 10 {
return fmt.Errorf("kubernetes is taking too long to update the '%s' annotation of the deployment '%s'. Please check for errors and try again", revisionAnnotation, d.Name)
}

Expand Down
8 changes: 4 additions & 4 deletions pkg/k8s/pods/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func GetDevPodInLoop(ctx context.Context, dev *model.Dev, c *kubernetes.Clientse
start := time.Now()
to := start.Add(dev.Timeout * 4) // 120 seconds

for i := 0; ; i++ {
for retries := 0; ; retries++ {
pod, err := GetDevPod(ctx, dev, c, waitUntilDeployed)
if err != nil {
return nil, err
Expand All @@ -104,13 +104,13 @@ func GetDevPodInLoop(ctx context.Context, dev *model.Dev, c *kubernetes.Clientse
return pod, nil
}

if time.Now().After(to) {
if time.Now().After(to) && retries > 10 {
return nil, fmt.Errorf("kubernetes is taking too long to create your development container. Please check for errors and try again")
}

select {
case <-ticker.C:
if i%5 == 0 {
if retries%5 == 0 {
log.Info("development container is not ready yet, will retry")
}

Expand Down Expand Up @@ -254,7 +254,7 @@ func Destroy(ctx context.Context, podName, namespace string, c kubernetes.Interf
// GetDevPodUserID fetches the development container logs via GetDevPodLogs and
// returns whatever parseUserID extracts from them. It returns -1 when the logs
// cannot be retrieved.
func GetDevPodUserID(ctx context.Context, dev *model.Dev, c *kubernetes.Clientset) int64 {
devPodLogs, err := GetDevPodLogs(ctx, dev, false, c)
if err != nil {
// NOTE(review): the Errorf/Infof pair below looks like the removed and added
// sides of a single diff line shown together — confirm that only the Infof
// call is present in the actual file.
log.Errorf("failed to access development container logs: %s", err)
log.Infof("failed to access development container logs: %s", err)
// -1 is the value handed back to callers when the logs are unavailable.
return -1
}
// The logs were retrieved; parsing the ID out of them is delegated to parseUserID.
return parseUserID(devPodLogs)
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/ssh/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,11 @@ func (fm *ForwardManager) Start(devPod, namespace string) error {

ticker := time.NewTicker(200 * time.Millisecond)
to := time.Now().Add(10 * time.Second)
retries := 0

for {
retries++
log.Infof("SSH forward manager retry %d", retries)
if fm.pf != nil {
if err := fm.pf.Start(devPod, namespace); err != nil {
return fmt.Errorf("failed to start SSH port-forward: %w", err)
Expand All @@ -130,7 +133,7 @@ func (fm *ForwardManager) Start(devPod, namespace string) error {
break
}
log.Infof("error starting SSH connection pool on %s: %s", fm.sshAddr, err.Error())
if time.Now().After(to) {
if time.Now().After(to) && retries > 10 {
return errors.ErrSSHConnectError
}

Expand Down
16 changes: 8 additions & 8 deletions pkg/syncthing/syncthing.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,17 +315,17 @@ func (s *Syncthing) WaitForPing(ctx context.Context, local bool) error {
to := time.Now().Add(s.timeout)

log.Infof("waiting for syncthing local=%t to be ready", local)
for i := 0; ; i++ {
for retries := 0; ; retries++ {
select {
case <-ticker.C:
if s.Ping(ctx, local) {
return nil
}
if i%5 == 0 {
if retries%5 == 0 {
log.Infof("syncthing local=%t is not ready yet", local)
}

if time.Now().After(to) {
if time.Now().After(to) && retries > 10 {
return fmt.Errorf("syncthing local=%t didn't respond after %s", local, s.timeout.String())
}

Expand Down Expand Up @@ -381,7 +381,7 @@ func (s *Syncthing) WaitForConnected(ctx context.Context, dev *model.Dev) error
ticker := time.NewTicker(100 * time.Millisecond)
log.Info("waiting for remote device to be connected")
to := time.Now().Add(s.timeout)
for i := 0; ; i++ {
for retries := 0; ; retries++ {
connections := &Connections{}
body, err := s.APICall(ctx, "rest/system/connections", "GET", 200, nil, true, nil, true, 3)
if err != nil {
Expand All @@ -403,7 +403,7 @@ func (s *Syncthing) WaitForConnected(ctx context.Context, dev *model.Dev) error
}
}

if time.Now().After(to) {
if time.Now().After(to) && retries > 10 {
log.Infof("remote syncthing connection not completed after %s, please try again", s.timeout.String())
return errors.ErrLostSyncthing
}
Expand Down Expand Up @@ -434,14 +434,14 @@ func (s *Syncthing) waitForFolderScanning(ctx context.Context, folder *Folder, l

to := time.Now().Add(s.timeout * 10) // 5 minutes

for i := 0; ; i++ {
for retries := 0; ; retries++ {
status, err := s.GetStatus(ctx, folder, local)
if err != nil && err != errors.ErrBusySyncthing {
return err
}

if status != nil {
if i%100 == 0 {
if retries%100 == 0 {
// one log every 10 seconds
log.Infof("syncthing folder local=%t is '%s'", local, status.State)
}
Expand All @@ -451,7 +451,7 @@ func (s *Syncthing) waitForFolderScanning(ctx context.Context, folder *Folder, l
}
}

if time.Now().After(to) {
if time.Now().After(to) && retries > 10 {
return fmt.Errorf("initial file scan not completed after %s, please try again", s.timeout.String())
}

Expand Down

0 comments on commit d354c3f

Please sign in to comment.