Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MGMT-12552: Day-2 agent stuck with status_info rebooting although the node is already part of the cluster #4610

Merged
merged 1 commit into from
Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 5 additions & 7 deletions internal/controller/controllers/agent_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,10 @@ func (r *AgentReconciler) updateStatus(ctx context.Context, log logrus.FieldLogg
agent.Status.ValidationsInfo = newValidationsInfo
}
if h.Progress != nil && h.Progress.CurrentStage != "" {
if swag.StringValue(h.Status) == models.HostStatusAddedToExistingCluster {
// If the node has not rebooted yet, the stage is taken from the host (see the else branch)
if swag.StringValue(h.Kind) == models.HostKindAddToExistingClusterHost &&
funk.Contains([]models.HostStage{models.HostStageRebooting, models.HostStageJoined, models.HostStageConfiguring}, h.Progress.CurrentStage) {

spokeClient, err = r.spokeKubeClient(ctx, agent.Spec.ClusterDeploymentName)
if err != nil {
r.Log.WithError(err).Errorf("Agent %s/%s: Failed to create spoke client", agent.Namespace, agent.Name)
Expand Down Expand Up @@ -611,11 +614,6 @@ func (r *AgentReconciler) updateStatus(ctx context.Context, log logrus.FieldLogg
}

func (r *AgentReconciler) UpdateDay2InstallPogress(ctx context.Context, h *models.Host, agent *aiv1beta1.Agent, spokeClient spoke_k8s_client.SpokeK8sClient) (ctrl.Result, error) {
if !funk.Contains([]models.HostStage{models.HostStageRebooting, models.HostStageJoined, models.HostStageConfiguring}, h.Progress.CurrentStage) {
// In case the node didn't reboot yet, we get the stage from the host
agent.Status.Progress.CurrentStage = h.Progress.CurrentStage
return ctrl.Result{}, nil
}
node, err := r.getNode(agent, spokeClient)
if err != nil {
if k8serrors.IsNotFound(err) {
Expand Down Expand Up @@ -1390,7 +1388,7 @@ func (r *AgentReconciler) SetupWithManager(mgr ctrl.Manager) error {
}

func (r *AgentReconciler) updateHostInstallProgress(ctx context.Context, host *models.Host, stage models.HostStage) error {
r.Log.Info("Updating host %s install progress to %s", host.ID, stage)
r.Log.Infof("Updating host %s install progress to %s", host.ID, stage)
_, err := r.Installer.V2UpdateHostInstallProgressInternal(ctx, installer.V2UpdateHostInstallProgressParams{
InfraEnvID: host.InfraEnvID,
HostID: *host.ID,
Expand Down
44 changes: 30 additions & 14 deletions internal/controller/controllers/agent_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1967,7 +1967,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
ClusterID: &sId,
Kind: swag.String(models.HostKindAddToExistingClusterHost),
Inventory: generateInventory(),
Status: swag.String(models.HostStatusAddedToExistingCluster),
Status: swag.String(models.HostStatusInstalling),
Progress: &models.HostProgressInfo{
CurrentStage: models.HostStageRebooting,
},
Expand Down Expand Up @@ -2017,6 +2017,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
clusterInstall *hiveext.AgentClusterInstall
updateProgressStage bool
getNodeCount int
isDay1Host bool
}{
{
name: "Not day 2 host - do nothing",
Expand All @@ -2028,14 +2029,15 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
getNodeCount: 0,
isDay1Host: true,
},
{
name: "No matching node - No csrs",
createClient: true,
csrs: &certificatesv1.CertificateSigningRequestList{},
nodeError: &notFoundError{},
expectedResult: ctrl.Result{RequeueAfter: time.Minute},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageRebooting,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
Expand Down Expand Up @@ -2068,7 +2070,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageJoined,
clusterInstall: newAciNoUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
Expand Down Expand Up @@ -2099,7 +2101,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
},
approveExpected: false,
expectedResult: ctrl.Result{},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageDone,
clusterInstall: newAciNoUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
Expand Down Expand Up @@ -2133,7 +2135,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageJoined,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
Expand Down Expand Up @@ -2167,7 +2169,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageJoined,
clusterInstall: newAciNoUserManagedNetworkingWithSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
Expand Down Expand Up @@ -2197,7 +2199,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageRebooting,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
Expand Down Expand Up @@ -2226,7 +2228,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageJoined,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
Expand Down Expand Up @@ -2257,7 +2259,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
csrs: serverCsrs(),
approveExpected: true,
expectedResult: ctrl.Result{},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageDone,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
Expand All @@ -2273,7 +2275,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageRebooting,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
Expand All @@ -2289,7 +2291,7 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageRebooting,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
Expand Down Expand Up @@ -2318,23 +2320,34 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
expectedResult: ctrl.Result{
RequeueAfter: time.Minute,
},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageJoined,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: true,
getNodeCount: 2,
},
{
name: "Already done",
createClient: true,
createClient: false,
hostInitialStage: models.HostStageDone,
expectedResult: ctrl.Result{},
expectedStatus: models.HostStatusAddedToExistingCluster,
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageDone,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
getNodeCount: 0,
},
{
name: "Not rebooting yet - do nothing",
createClient: false,
hostInitialStage: models.HostStageWritingImageToDisk,
expectedResult: ctrl.Result{},
expectedStatus: models.HostStatusInstalling,
expectedStage: models.HostStageWritingImageToDisk,
clusterInstall: newAciWithUserManagedNetworkingNoSNO("test-cluster-aci", testNamespace),
updateProgressStage: false,
getNodeCount: 0,
},
}

for i := range tests {
Expand All @@ -2347,6 +2360,9 @@ VU1eS0RiS/Lz6HwRs2mATNY5FrpZOgdM3cI=
if t.hostname != "" {
agentSpec.Hostname = t.hostname
}
if t.isDay1Host {
commonHost.Kind = swag.String(models.HostKindHost)
}
host := newAgent(hostId.String(), testNamespace, agentSpec)
host.Spec.Approved = true
mockInstallerInternal.EXPECT().UpdateHostApprovedInternal(gomock.Any(), gomock.Any(), gomock.Any(), true).Return(nil)
Expand Down
10 changes: 8 additions & 2 deletions internal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -587,8 +587,12 @@ func (m *Manager) UpdateInstallProgress(ctx context.Context, h *models.Host, pro
var err error
switch progress.CurrentStage {
case models.HostStageDone:
newStatus := models.HostStatusInstalled
if swag.StringValue(h.Kind) == models.HostKindAddToExistingClusterHost {
newStatus = models.HostStatusAddedToExistingCluster
}
_, err = hostutil.UpdateHostProgress(ctx, logutil.FromContext(ctx, m.log), m.db, m.eventsHandler, h.InfraEnvID, *h.ID,
swag.StringValue(h.Status), models.HostStatusInstalled, statusInfo,
swag.StringValue(h.Status), newStatus, statusInfo,
previousProgress.CurrentStage, progress.CurrentStage, progress.ProgressInfo, extra...)
case models.HostStageFailed:
// Keeps the last progress
Expand All @@ -603,13 +607,15 @@ func (m *Manager) UpdateInstallProgress(ctx context.Context, h *models.Host, pro
if swag.StringValue(h.Kind) == models.HostKindAddToExistingClusterHost {
infoMessage := statusInfoRebootingDay2
stage := models.HostStageDone
newStatus := models.HostStatusAddedToExistingCluster
if m.kubeApiEnabled {
// When kubeApiEnabled is set, the agent controller keeps updating the host stage until the installation is complete
infoMessage = statusInfo
stage = models.HostStageRebooting
newStatus = swag.StringValue(h.Status)
}
_, err = hostutil.UpdateHostProgress(ctx, logutil.FromContext(ctx, m.log), m.db, m.eventsHandler, h.InfraEnvID, *h.ID,
swag.StringValue(h.Status), models.HostStatusAddedToExistingCluster, infoMessage,
swag.StringValue(h.Status), newStatus, infoMessage,
h.Progress.CurrentStage, stage, progress.ProgressInfo, extra...)
break
}
Expand Down
25 changes: 21 additions & 4 deletions internal/host/host_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4547,10 +4547,6 @@ var _ = Describe("Rebooting day2", func() {
host.Kind = &hostKindDay2
host.Role = models.HostRoleMaster
Expect(db.Create(&host).Error).ShouldNot(HaveOccurred())
mockEventsAPI.EXPECT().SendHostEvent(ctx, eventstest.NewEventMatcher(
eventstest.WithHostIdMatcher(host.ID.String()),
eventstest.WithInfraEnvIdMatcher(host.InfraEnvID.String()),
eventstest.WithSeverityMatcher(models.EventSeverityInfo)))
})

AfterEach(func() {
Expand All @@ -4559,13 +4555,18 @@ var _ = Describe("Rebooting day2", func() {
})

// Spec: with kubeApiEnabled set to false, reporting the Rebooting stage through
// UpdateInstallProgress must store the "no further updates" status info and
// the added-to-existing-cluster status on the host.
It("during rebooting phase, if kubeapi is not enabled, should have the no further updates statusinfo", func() {
// Expect a host event matching this host's ID, its infra-env ID and info severity.
mockEventsAPI.EXPECT().SendHostEvent(ctx, eventstest.NewEventMatcher(
eventstest.WithHostIdMatcher(host.ID.String()),
eventstest.WithInfraEnvIdMatcher(host.InfraEnvID.String()),
eventstest.WithSeverityMatcher(models.EventSeverityInfo)))
api.kubeApiEnabled = false
err := api.UpdateInstallProgress(ctx, &host, &models.HostProgress{
CurrentStage: models.HostStageRebooting,
})
// Fetch the host through hostutil.GetHostFromDB; the assertions below run on the fetched copy.
verifyHost := hostutil.GetHostFromDB(*host.ID, host.InfraEnvID, db)
Expect(err).ToNot(HaveOccurred())
Expect(*verifyHost.StatusInfo).Should(BeEquivalentTo("Host has rebooted and no further updates will be posted. Please check console for progress and to possibly approve pending CSRs"))
Expect(*verifyHost.Status).Should(BeEquivalentTo(models.HostStatusAddedToExistingCluster))
})

It("during rebooting phase, if kubeapi is enabled, should have the rebooting statusinfo", func() {
Expand All @@ -4576,6 +4577,22 @@ var _ = Describe("Rebooting day2", func() {
verifyHost := hostutil.GetHostFromDB(*host.ID, host.InfraEnvID, db)
Expect(err).ToNot(HaveOccurred())
Expect(*verifyHost.StatusInfo).Should(BeEquivalentTo("Rebooting"))
Expect(*verifyHost.Status).Should(BeEquivalentTo(models.HostStatusInstalling))
})

// Spec: with kubeApiEnabled set to true, reporting the Done stage through
// UpdateInstallProgress must leave the host with status
// HostStatusAddedToExistingCluster and a status info equivalent to the Done stage.
It("day-2 host with stage Done should move to HostStatusAddedToExistingCluster", func() {
api.kubeApiEnabled = true
// Expect a host event matching this host's ID, its infra-env ID and info severity.
mockEventsAPI.EXPECT().SendHostEvent(ctx, eventstest.NewEventMatcher(
eventstest.WithHostIdMatcher(host.ID.String()),
eventstest.WithInfraEnvIdMatcher(host.InfraEnvID.String()),
eventstest.WithSeverityMatcher(models.EventSeverityInfo)))
err := api.UpdateInstallProgress(ctx, &host, &models.HostProgress{
CurrentStage: models.HostStageDone,
})
// Fetch the host through hostutil.GetHostFromDB; the assertions below run on the fetched copy.
verifyHost := hostutil.GetHostFromDB(*host.ID, host.InfraEnvID, db)
Expect(err).ToNot(HaveOccurred())
// BeEquivalentTo lets the string status info be compared against the HostStage constant.
Expect(*verifyHost.StatusInfo).Should(BeEquivalentTo(models.HostStageDone))
Expect(*verifyHost.Status).Should(BeEquivalentTo(models.HostStatusAddedToExistingCluster))
})

})