diff --git a/cmd/main.go b/cmd/main.go index b57678e142..cb950330a4 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -612,6 +612,7 @@ func main() { ApproveCsrsRequeueDuration: Options.ApproveCsrsRequeueDuration, AgentContainerImage: Options.BMConfig.AgentDockerImg, HostFSMountDir: hostFSMountDir, + EventSender: eventsHandler, }).SetupWithManager(ctrlMgr), "unable to create controller Agent") failOnError((&controllers.BMACReconciler{ diff --git a/go.mod b/go.mod index 86423cefcc..8879e3ed47 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( github.com/danielerez/go-dns-client v0.0.0-20200630114514-0b60d1703f0b github.com/dustin/go-humanize v1.0.0 github.com/filanov/stateswitch v1.0.1-0.20221122134945-bfa198e3a83a + github.com/go-errors/errors v1.0.1 github.com/go-gormigrate/gormigrate/v2 v2.0.1 github.com/go-logr/logr v1.2.4 github.com/go-openapi/errors v0.20.3 @@ -102,7 +103,6 @@ require ( github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/evanphx/json-patch/v5 v5.6.0 // indirect github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect - github.com/go-errors/errors v1.0.1 // indirect github.com/google/btree v1.0.1 // indirect github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect @@ -246,7 +246,7 @@ require ( go.uber.org/zap v1.21.0 // indirect golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 // indirect golang.org/x/mod v0.12.0 // indirect - golang.org/x/net v0.17.0 // indirect + golang.org/x/net v0.17.0 golang.org/x/oauth2 v0.12.0 // indirect golang.org/x/term v0.13.0 // indirect golang.org/x/text v0.13.0 // indirect diff --git a/internal/controller/controllers/agent_controller.go b/internal/controller/controllers/agent_controller.go index 9bbe91074d..a96416cdec 100644 --- a/internal/controller/controllers/agent_controller.go +++ b/internal/controller/controllers/agent_controller.go @@ -34,8 +34,11 @@ import ( aiv1beta1 "github.com/openshift/assisted-service/api/v1beta1" "github.com/openshift/assisted-service/internal/bminventory" "github.com/openshift/assisted-service/internal/common" + "github.com/openshift/assisted-service/internal/common/events" + eventsapi "github.com/openshift/assisted-service/internal/events/api" "github.com/openshift/assisted-service/internal/gencrypto" "github.com/openshift/assisted-service/internal/host" + "github.com/openshift/assisted-service/internal/oc" "github.com/openshift/assisted-service/internal/spoke_k8s_client" "github.com/openshift/assisted-service/models" "github.com/openshift/assisted-service/pkg/auth" @@ -89,6 +92,7 @@ type AgentReconciler struct { AgentContainerImage string HostFSMountDir string reclaimer *agentReclaimer + EventSender eventsapi.Sender } // +kubebuilder:rbac:groups=agent-install.openshift.io,resources=agents,verbs=get;list;watch;create;update;patch;delete @@ -484,6 +488,31 @@ func (r *AgentReconciler) spokeKubeClient(ctx context.Context, clusterRef *aiv1b return r.SpokeK8sClientFactory.CreateFromSecret(secret) } +func (r *AgentReconciler) notifyNumberOfReboots(ctx context.Context, h *models.Host, node *corev1.Node, clusterRef *aiv1beta1.ClusterReference) error { + if !isNodeReady(node) { + return nil + } + secret, err := spokeKubeconfigSecret(ctx, r.Log, r.Client, r.APIReader, clusterRef) + if err != nil { + r.Log.WithError(err).Errorf("failed to get spoke secret for cluster %s/%s", clusterRef.Namespace, clusterRef.Name) + return err + } + kubeconfig, err := spoke_k8s_client.KubeconfigFromSecret(secret) + if err != nil { + r.Log.WithError(err).Errorf("failed to get kubeconfig from secret for cluster %s/%s", clusterRef.Namespace, clusterRef.Name) + return err + } + dbg := oc.NewDebug(kubeconfig) + reboots, err := dbg.RebootsForNode(node.Name) + if err != nil { + r.Log.WithError(err).Errorf("failed to get number of reboots for node %s, cluster %s/%s", node.Name, + clusterRef.Namespace, clusterRef.Name) + return err + } + events.SendRebootsForNodeEvent(ctx, r.EventSender, *h.ID, node.Name, h.InfraEnvID, h.ClusterID, int64(reboots)) + return nil +} + // Attempt to approve CSRs for agent. If already approved then the node will be marked as done // requeue means that approval will be attempted again func (r *AgentReconciler) tryApproveDay2CSRs(ctx context.Context, agent *aiv1beta1.Agent, node *corev1.Node, client spoke_k8s_client.SpokeK8sClient) { @@ -787,6 +816,9 @@ func (r *AgentReconciler) updateStatus(ctx context.Context, log logrus.FieldLogg log.WithError(err).Errorf("Failed to apply labels for day2 node %s/%s", agent.Namespace, agent.Name) return ctrl.Result{RequeueAfter: defaultRequeueAfterOnError}, err } + if err = r.notifyNumberOfReboots(ctx, h, node, agent.Spec.ClusterDeploymentName); err != nil { + log.WithError(err).Errorf("Failed to notify number of reboots for day2 node %s/%s", agent.Namespace, agent.Name) + } if err = r.UpdateDay2InstallPogress(ctx, h, agent, node); err != nil { return ctrl.Result{RequeueAfter: defaultRequeueAfterOnError}, err } diff --git a/internal/oc/debug.go b/internal/oc/debug.go new file mode 100644 index 0000000000..3066447ba2 --- /dev/null +++ b/internal/oc/debug.go @@ -0,0 +1,96 @@ +package oc + +import ( + "io" + "os" + "strings" + "time" + + "github.com/go-errors/errors" + "github.com/openshift/assisted-service/pkg/executer" + "golang.org/x/net/context" +) + +type Debug interface { + RebootsForNode(nodeName string) (int, error) +} + +type debug struct { + kubeconfig []byte + exec executer.Executer +} + +func NewDebug(kubeconfig []byte) Debug { + return &debug{ + kubeconfig: kubeconfig, + exec: &executer.CommonExecuter{}, + } +} + +func NewDebugWithExecuter(kubeconfig []byte, exec executer.Executer) Debug { + return &debug{ + kubeconfig: kubeconfig, + exec: exec, + } +} + +func (d *debug) kubeconfigFile() (string, error) { + var n int + + f, err := os.CreateTemp("", "kubeconfig") + if err != nil { + return "", err + } + defer f.Close() + numWritten := 0 + for n, err = f.Write(d.kubeconfig[numWritten:]); err == io.ErrShortWrite; n, err = f.Write(d.kubeconfig[numWritten:]) { + numWritten += n + } + if err != nil { + return "", err + } + return f.Name(), nil +} + +func (d *debug) runDebug(entity string, args ...string) (string, string, error) { + kubeconfigFname, err := d.kubeconfigFile() + if err != nil { + return "", "", err + } + defer func() { + _ = os.RemoveAll(kubeconfigFname) + }() + execArgs := append([]string{ + "debug", + "--kubeconfig", + kubeconfigFname, + entity, + "--", + }, args...) + execCtx, cancel := context.WithTimeout(context.TODO(), 2*time.Minute) + defer cancel() + out, errOut, exitCode := d.exec.ExecuteWithContext(execCtx, "oc", execArgs...) + if exitCode != 0 { + return "", "", errors.Errorf("oc debug failed to execute with code %d: %s", exitCode, errOut) + } + return out, errOut, nil +} + +func (d *debug) RebootsForNode(nodeName string) (int, error) { + out, _, err := d.runDebug("node/"+nodeName, + "chroot", + "/host", + "last", + "reboot") + if err != nil { + return 0, err + } + lines := strings.Split(out, "\n") + numReboots := 0 + for _, line := range lines { + if strings.HasPrefix(line, "reboot ") { + numReboots++ + } + } + return numReboots, nil +} diff --git a/internal/oc/debug_test.go b/internal/oc/debug_test.go new file mode 100644 index 0000000000..85f6102d43 --- /dev/null +++ b/internal/oc/debug_test.go @@ -0,0 +1,68 @@ +package oc + +import ( + "fmt" + + "github.com/golang/mock/gomock" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "github.com/openshift/assisted-service/pkg/executer" +) + +var _ = Describe("oc debug", func() { + const ( + kubeconfig = "kubeconfig" + nodeName = "node1" + ) + var ( + ctrl *gomock.Controller + execMock *executer.MockExecuter + d Debug + ) + + BeforeEach(func() { + ctrl = gomock.NewController(GinkgoT()) + execMock = executer.NewMockExecuter(ctrl) + d = NewDebugWithExecuter([]byte(kubeconfig), execMock) + }) + expectOk := func(ret string) { + execMock.EXPECT().ExecuteWithContext(gomock.Any(), + "oc", + "debug", + "--kubeconfig", + gomock.Any(), + fmt.Sprintf("node/%s", nodeName), + "--", + "chroot", + "/host", + "last", + "reboot").Return(ret, "", 0) + } + It("1 reboot", func() { + expectOk("reboot system boot 4.18.0-372.9.1.e Tue Mar 7 04:13 still running\n") + numReboots, err := d.RebootsForNode(nodeName) + Expect(err).ToNot(HaveOccurred()) + Expect(numReboots).To(Equal(1)) + }) + It("2 reboot", func() { + expectOk("reboot system boot 4.18.0-372.9.1.e Tue Mar 7 04:13 still running\nreboot system boot 4.18.0-372.9.1.e Sun Mar 5 07:29 - 09:11 (2+01:41)\n") + numReboots, err := d.RebootsForNode(nodeName) + Expect(err).ToNot(HaveOccurred()) + Expect(numReboots).To(Equal(2)) + }) + It("with error", func() { + execMock.EXPECT().ExecuteWithContext(gomock.Any(), + "oc", + "debug", + "--kubeconfig", + gomock.Any(), + fmt.Sprintf("node/%s", nodeName), + "--", + "chroot", + "/host", + "last", + "reboot").Return("", "This is an error", -1) + _, err := d.RebootsForNode(nodeName) + Expect(err).To(HaveOccurred()) + }) +}) diff --git a/internal/spoke_k8s_client/factory.go b/internal/spoke_k8s_client/factory.go index 9c6d403a10..8939864189 100644 --- a/internal/spoke_k8s_client/factory.go +++ b/internal/spoke_k8s_client/factory.go @@ -41,7 +41,7 @@ func (cf *spokeK8sClientFactory) CreateFromRawKubeconfig(kubeconfig []byte) (Spo } func (cf *spokeK8sClientFactory) CreateFromSecret(secret *corev1.Secret) (SpokeK8sClient, error) { - kubeconfigData, err := kubeconfigFromSecret(secret) + kubeconfigData, err := KubeconfigFromSecret(secret) if err != nil { return nil, err } @@ -66,7 +66,7 @@ func (cf *spokeK8sClientFactory) CreateFromStorageKubeconfig(ctx context.Context } func (cf *spokeK8sClientFactory) ClientAndSetFromSecret(secret *corev1.Secret) (SpokeK8sClient, *kubernetes.Clientset, error) { - kubeconfig, err := kubeconfigFromSecret(secret) + kubeconfig, err := KubeconfigFromSecret(secret) if err != nil { cf.log.WithError(err).Error("failed to get kubeconfig from secret") return nil, nil, err @@ -75,7 +75,7 @@ func (cf *spokeK8sClientFactory) ClientAndSetFromSecret(secret *corev1.Secret) ( return cf.clientAndSetForKubeconfig(kubeconfig) } -func kubeconfigFromSecret(secret *corev1.Secret) ([]byte, error) { +func KubeconfigFromSecret(secret *corev1.Secret) ([]byte, error) { if secret.Data == nil { return nil, errors.Errorf("Secret %s/%s does not contain any data", secret.Namespace, secret.Name) }