Skip to content

Commit

Permalink
MGMT-16266: Indication event showing how often host has been rebooted…
Browse files Browse the repository at this point in the history
… missing on some nodes

There are many possible failures. Many of these failures are temporary,
such as the API not being reachable or a failure to create the debug pod.
Added a retry for the retrieval of the number of reboots.
  • Loading branch information
ori-amizur committed Dec 8, 2023
1 parent 81ef854 commit c26f6c1
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 14 deletions.
26 changes: 20 additions & 6 deletions src/assisted_installer_controller/reboots_notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,18 @@ import (
"github.com/openshift/assisted-installer/src/common"
"github.com/openshift/assisted-installer/src/inventory_client"
"github.com/openshift/assisted-installer/src/ops"
"github.com/openshift/assisted-installer/src/utils"
"github.com/openshift/assisted-service/models"
"github.com/sirupsen/logrus"
"github.com/thoas/go-funk"
)

const (
eventName = "reboots_for_node"
eventMessageTemplate = "Node %s has been rebooted %d times before completing installation"
eventName = "reboots_for_node"
eventMessageTemplate = "Node %s has been rebooted %d times before completing installation"
getNumRebootsRetries = 10
getNumRebootsTimeout = 6 * time.Minute
getNumRebootsRetrySleepDuration = 15 * time.Second
)

//go:generate mockgen -source=reboots_notifier.go -package=assisted_installer_controller -destination=mock_reboots_notifier.go
Expand Down Expand Up @@ -53,10 +57,16 @@ func (r *rebootsNotifier) getKubeconfigPath(ctx context.Context) (string, error)
r.mu.Lock()
defer r.mu.Unlock()
if r.kubeconfigPath == "" {
var err error
if r.kubeconfigPath, err = common.DownloadKubeconfigNoingress(ctx, os.TempDir(), r.ic, r.log); err != nil {
dir, err := os.MkdirTemp("", "kubedir")
if err != nil {
return "", err
}
kubeconfigPath, err := common.DownloadKubeconfigNoingress(ctx, dir, r.ic, r.log)
if err != nil {
_ = os.RemoveAll(dir)
return "", err
}
r.kubeconfigPath = kubeconfigPath
}
return r.kubeconfigPath, nil
}
Expand All @@ -68,7 +78,11 @@ func (r *rebootsNotifier) run(ctx context.Context, nodeName string, hostId, infr
r.log.Warningf("failed to get kubeconfig. aborting notifying reboots for %s", nodeName)
return
}
numberOfReboots, err := r.ops.GetNumberOfReboots(ctx, nodeName, kubeconfigPath)
var numberOfReboots int
err = utils.RetryWithContext(ctx, getNumRebootsRetries, getNumRebootsRetrySleepDuration, r.log, func() (err error) {
numberOfReboots, err = r.ops.GetNumberOfReboots(ctx, nodeName, kubeconfigPath)
return err
})
if err != nil {
r.log.WithError(err).Errorf("failed to get number of reboots for node %s", nodeName)
return
Expand All @@ -91,7 +105,7 @@ func (r *rebootsNotifier) run(ctx context.Context, nodeName string, hostId, infr
}

func (r *rebootsNotifier) Start(ctx context.Context, nodeName string, hostId, infraenvId, clusterId *strfmt.UUID) {
execCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
execCtx, cancel := context.WithTimeout(ctx, getNumRebootsTimeout)
r.cancelers = append(r.cancelers, cancel)
r.wg.Add(1)
go r.run(execCtx, nodeName, hostId, infraenvId, clusterId)
Expand Down
6 changes: 0 additions & 6 deletions src/assisted_installer_controller/reboots_notifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,4 @@ var _ = Describe("Reboots notifier", func() {
notifier.Start(context.TODO(), nodeName, &hostId, &infraenvId, &clusterId)
notifier.Finalize()
})
It("fail to get number of reboots", func() {
mockclient.EXPECT().DownloadClusterCredentials(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil)
mockops.EXPECT().GetNumberOfReboots(gomock.Any(), nodeName, gomock.Any()).Return(1, errors.New("error"))
notifier.Start(context.TODO(), nodeName, &hostId, &infraenvId, &clusterId)
notifier.Finalize()
})
})
14 changes: 12 additions & 2 deletions src/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,23 @@ func FindAndRemoveElementFromStringList(s []string, r string) []string {
}

// Retry is a convenience wrapper around RetryWithContext for callers that have
// no context to pass. It forwards attempts, sleep, log and f unchanged and
// returns whatever error RetryWithContext returns.
//
// It supplies context.TODO(), which is never cancelled, so the retries cannot
// be interrupted by the caller.
func Retry(attempts int, sleep time.Duration, log logrus.FieldLogger, f func() error) (err error) {
return RetryWithContext(context.TODO(), attempts, sleep, log, f)
}

func RetryWithContext(ctx context.Context, attempts int, sleep time.Duration, log logrus.FieldLogger, f func() error) (err error) {
ticker := time.NewTicker(sleep)
defer ticker.Stop()
for i := 0; i < attempts-1; i++ {
err = f()
if err == nil {
return
}
time.Sleep(sleep)
log.Warnf("Retrying after error: %s", err)
select {
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
log.Warnf("Retrying after error: %s", err)
}
}
// Don't wait after the last retry
err = f()
Expand Down

0 comments on commit c26f6c1

Please sign in to comment.