Skip to content

Commit

Permalink
MGMT-16266: Indication event showing how often host has been rebooted…
Browse files Browse the repository at this point in the history
… missing on some nodes

There are many possible failures. Many of these failures are temporary,
such as the API not being reachable or a failure to create the debug pod.
Added a retry for the retrieval of the number of reboots.
  • Loading branch information
ori-amizur committed Dec 7, 2023
1 parent 81ef854 commit 26f7e32
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 5 deletions.
18 changes: 14 additions & 4 deletions src/assisted_installer_controller/reboots_notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/openshift/assisted-installer/src/common"
"github.com/openshift/assisted-installer/src/inventory_client"
"github.com/openshift/assisted-installer/src/ops"
"github.com/openshift/assisted-installer/src/utils"
"github.com/openshift/assisted-service/models"
"github.com/sirupsen/logrus"
"github.com/thoas/go-funk"
Expand All @@ -20,8 +21,12 @@ import (
const (
// eventName is the Name of the reboots event (the tests expect it in the triggered models.Event).
eventName = "reboots_for_node"
// eventMessageTemplate is the format string for the event message: node name (%s), then reboot count (%d).
eventMessageTemplate = "Node %s has been rebooted %d times before completing installation"
// getNumRebootsRetries is the attempt count passed to utils.Retry when getting the number of reboots.
getNumRebootsRetries = 10
// getNumRebootsTimeout is the timeout of the execution context that Start creates for a notifier run.
getNumRebootsTimeout = 6 * time.Minute
)

// getNumRebootsRetrySleepDuration is the sleep duration passed to utils.Retry
// (presumably the pause between attempts — confirm against utils.Retry).
// It is a variable rather than a constant so the tests can shorten it.
var getNumRebootsRetrySleepDuration = 15 * time.Second

//go:generate mockgen -source=reboots_notifier.go -package=assisted_installer_controller -destination=mock_reboots_notifier.go
type RebootsNotifier interface {
Start(ctx context.Context, nodeName string, hostId, infraenvId, clusterId *strfmt.UUID)
Expand Down Expand Up @@ -53,10 +58,11 @@ func (r *rebootsNotifier) getKubeconfigPath(ctx context.Context) (string, error)
r.mu.Lock()
defer r.mu.Unlock()
if r.kubeconfigPath == "" {
var err error
if r.kubeconfigPath, err = common.DownloadKubeconfigNoingress(ctx, os.TempDir(), r.ic, r.log); err != nil {
kubeconfigPath, err := common.DownloadKubeconfigNoingress(ctx, os.TempDir(), r.ic, r.log)
if err != nil {
return "", err
}
r.kubeconfigPath = kubeconfigPath
}
return r.kubeconfigPath, nil
}
Expand All @@ -68,7 +74,11 @@ func (r *rebootsNotifier) run(ctx context.Context, nodeName string, hostId, infr
r.log.Warningf("failed to get kubeconfig. aborting notifying reboots for %s", nodeName)
return
}
numberOfReboots, err := r.ops.GetNumberOfReboots(ctx, nodeName, kubeconfigPath)
var numberOfReboots int
err = utils.Retry(getNumRebootsRetries, getNumRebootsRetrySleepDuration, r.log, func() (err error) {
numberOfReboots, err = r.ops.GetNumberOfReboots(ctx, nodeName, kubeconfigPath)
return err
})
if err != nil {
r.log.WithError(err).Errorf("failed to get number of reboots for node %s", nodeName)
return
Expand All @@ -91,7 +101,7 @@ func (r *rebootsNotifier) run(ctx context.Context, nodeName string, hostId, infr
}

func (r *rebootsNotifier) Start(ctx context.Context, nodeName string, hostId, infraenvId, clusterId *strfmt.UUID) {
execCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
execCtx, cancel := context.WithTimeout(ctx, getNumRebootsTimeout)
r.cancelers = append(r.cancelers, cancel)
r.wg.Add(1)
go r.run(execCtx, nodeName, hostId, infraenvId, clusterId)
Expand Down
20 changes: 19 additions & 1 deletion src/assisted_installer_controller/reboots_notifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package assisted_installer_controller
import (
"context"
"fmt"
"time"

"github.com/go-openapi/strfmt"
"github.com/go-openapi/swag"
Expand Down Expand Up @@ -35,6 +36,7 @@ var _ = Describe("Reboots notifier", func() {
hostId = strfmt.UUID(uuid.New().String())
infraenvId = strfmt.UUID(uuid.New().String())
clusterId = strfmt.UUID(uuid.New().String())
getNumRebootsRetrySleepDuration = time.Millisecond
})
AfterEach(func() {
ctrl.Finish()
Expand Down Expand Up @@ -79,7 +81,23 @@ var _ = Describe("Reboots notifier", func() {
})
It("fail to get number of reboots", func() {
mockclient.EXPECT().DownloadClusterCredentials(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil)
mockops.EXPECT().GetNumberOfReboots(gomock.Any(), nodeName, gomock.Any()).Return(1, errors.New("error"))
mockops.EXPECT().GetNumberOfReboots(gomock.Any(), nodeName, gomock.Any()).Return(1, errors.New("error")).Times(getNumRebootsRetries)
notifier.Start(context.TODO(), nodeName, &hostId, &infraenvId, &clusterId)
notifier.Finalize()
})
It("fail to get number of reboots and then succeeds", func() {
// Every attempt except the last one fails; the last attempt succeeds,
// so the reboots event is expected to be triggered with a count of 1.
mockclient.EXPECT().DownloadClusterCredentials(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil)
mockops.EXPECT().GetNumberOfReboots(gomock.Any(), nodeName, gomock.Any()).Return(1, errors.New("error")).Times(getNumRebootsRetries - 1)
mockops.EXPECT().GetNumberOfReboots(gomock.Any(), nodeName, gomock.Any()).Return(1, nil)
mockclient.EXPECT().TriggerEvent(gomock.Any(), &models.Event{
Category: models.EventCategoryUser,
ClusterID: &clusterId,
HostID: &hostId,
InfraEnvID: &infraenvId,
Message: swag.String(fmt.Sprintf(eventMessageTemplate, nodeName, 1)),
Name: eventName,
Severity: swag.String(models.EventSeverityInfo),
}).Return(nil)
// Start launches the notifier run in a goroutine.
notifier.Start(context.TODO(), nodeName, &hostId, &infraenvId, &clusterId)
notifier.Finalize()
})
Expand Down

0 comments on commit 26f7e32

Please sign in to comment.