Skip to content

Commit

Permalink
MGMT-15902: Trigger reboots for node event when day2 node moves to done
Browse files Browse the repository at this point in the history
This is done for kube api client only when spoke kubeconfig is available.
The implementation is done using debug command (as in oc command).
The number of reboots are counted by 'last reboot' linux command.
  • Loading branch information
ori-amizur committed Nov 6, 2023
1 parent 69d831a commit fd61402
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 4 deletions.
1 change: 1 addition & 0 deletions cmd/main.go
Expand Up @@ -610,6 +610,7 @@ func main() {
ApproveCsrsRequeueDuration: Options.ApproveCsrsRequeueDuration,
AgentContainerImage: Options.BMConfig.AgentDockerImg,
HostFSMountDir: hostFSMountDir,
EventSender: eventsHandler,
}).SetupWithManager(ctrlMgr), "unable to create controller Agent")

failOnError((&controllers.BMACReconciler{
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Expand Up @@ -17,6 +17,7 @@ require (
github.com/danielerez/go-dns-client v0.0.0-20200630114514-0b60d1703f0b
github.com/dustin/go-humanize v1.0.0
github.com/filanov/stateswitch v1.0.1-0.20221122134945-bfa198e3a83a
github.com/go-errors/errors v1.0.1
github.com/go-gormigrate/gormigrate/v2 v2.0.1
github.com/go-logr/logr v1.2.4
github.com/go-openapi/errors v0.20.3
Expand Down Expand Up @@ -102,7 +103,6 @@ require (
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch/v5 v5.6.0 // indirect
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
github.com/go-errors/errors v1.0.1 // indirect
github.com/google/btree v1.0.1 // indirect
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
Expand Down
32 changes: 32 additions & 0 deletions internal/controller/controllers/agent_controller.go
Expand Up @@ -34,8 +34,11 @@ import (
aiv1beta1 "github.com/openshift/assisted-service/api/v1beta1"
"github.com/openshift/assisted-service/internal/bminventory"
"github.com/openshift/assisted-service/internal/common"
"github.com/openshift/assisted-service/internal/common/events"
eventsapi "github.com/openshift/assisted-service/internal/events/api"
"github.com/openshift/assisted-service/internal/gencrypto"
"github.com/openshift/assisted-service/internal/host"
"github.com/openshift/assisted-service/internal/oc"
"github.com/openshift/assisted-service/internal/spoke_k8s_client"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/auth"
Expand Down Expand Up @@ -89,6 +92,7 @@ type AgentReconciler struct {
AgentContainerImage string
HostFSMountDir string
reclaimer *agentReclaimer
EventSender eventsapi.Sender
}

// +kubebuilder:rbac:groups=agent-install.openshift.io,resources=agents,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -484,6 +488,31 @@ func (r *AgentReconciler) spokeKubeClient(ctx context.Context, clusterRef *aiv1b
return r.SpokeK8sClientFactory.CreateFromSecret(secret)
}

func (r *AgentReconciler) notifyNumberOfReboots(ctx context.Context, h *models.Host, node *corev1.Node, clusterRef *aiv1beta1.ClusterReference) error {
if !isNodeReady(node) {
return nil
}
secret, err := spokeKubeconfigSecret(ctx, r.Log, r.Client, r.APIReader, clusterRef)
if err != nil {
r.Log.WithError(err).Errorf("failed to get spoke secret for cluster %s/%s", clusterRef.Namespace, clusterRef.Name)
return err
}
kubeconfig, err := spoke_k8s_client.KubeconfigFromSecret(secret)
if err != nil {
r.Log.WithError(err).Errorf("failed to get kubeconfig from secret for cluster %s/%s", clusterRef.Namespace, clusterRef.Name)
return err
}
dbg := oc.NewDebug(kubeconfig)
reboots, err := dbg.RebootsForNode(node.Name)
if err != nil {
r.Log.WithError(err).Errorf("failed to get number of reboots for node %s, cluster %s/%s", node.Name,
clusterRef.Namespace, clusterRef.Name)
return err
}
events.SendRebootsForNodeEvent(ctx, r.EventSender, *h.ID, node.Name, h.InfraEnvID, h.ClusterID, int64(reboots))
return nil
}

// Attempt to approve CSRs for agent. If already approved then the node will be marked as done
// requeue means that approval will be attempted again
func (r *AgentReconciler) tryApproveDay2CSRs(ctx context.Context, agent *aiv1beta1.Agent, node *corev1.Node, client spoke_k8s_client.SpokeK8sClient) {
Expand Down Expand Up @@ -787,6 +816,9 @@ func (r *AgentReconciler) updateStatus(ctx context.Context, log logrus.FieldLogg
log.WithError(err).Errorf("Failed to apply labels for day2 node %s/%s", agent.Namespace, agent.Name)
return ctrl.Result{RequeueAfter: defaultRequeueAfterOnError}, err
}
if err = r.notifyNumberOfReboots(ctx, h, node, agent.Spec.ClusterDeploymentName); err != nil {
log.WithError(err).Errorf("Failed to notify number of reboots for day2 node %s/%s", agent.Namespace, agent.Name)
}
if err = r.UpdateDay2InstallPogress(ctx, h, agent, node); err != nil {
return ctrl.Result{RequeueAfter: defaultRequeueAfterOnError}, err
}
Expand Down
92 changes: 92 additions & 0 deletions internal/oc/debug.go
@@ -0,0 +1,92 @@
package oc

import (
"io"
"os"
"strings"

"github.com/go-errors/errors"
"github.com/openshift/assisted-service/pkg/executer"
)

type Debug interface {
RebootsForNode(nodeName string) (int, error)
}

type debug struct {
kubeconfig []byte
exec executer.Executer
}

func NewDebug(kubeconfig []byte) Debug {
return &debug{
kubeconfig: kubeconfig,
exec: &executer.CommonExecuter{},
}
}

func NewDebugWithExecuter(kubeconfig []byte, exec executer.Executer) Debug {
return &debug{
kubeconfig: kubeconfig,
exec: exec,
}
}

func (d *debug) kubeconfigFile() (string, error) {
var n int

f, err := os.CreateTemp("", "kubeconfig")
if err != nil {
return "", err
}
defer f.Close()
numWritten := 0
for n, err = f.Write(d.kubeconfig[numWritten:]); err == io.ErrShortWrite; n, err = f.Write(d.kubeconfig[numWritten:]) {
numWritten += n
}
if err != nil {
return "", err
}
return f.Name(), nil
}

func (d *debug) runDebug(entity string, args ...string) (string, string, error) {
kubeconfigFname, err := d.kubeconfigFile()
if err != nil {
return "", "", err
}
defer func() {
_ = os.RemoveAll(kubeconfigFname)
}()
execArgs := append([]string{
"debug",
"--kubeconfig",
kubeconfigFname,
entity,
"--",
}, args...)
out, errOut, exitCode := d.exec.Execute("oc", execArgs...)
if exitCode != 0 {
return "", "", errors.Errorf("oc debug failed to execute with code %d: %s", exitCode, errOut)
}
return out, errOut, nil
}

func (d *debug) RebootsForNode(nodeName string) (int, error) {
out, _, err := d.runDebug("node/"+nodeName,
"chroot",
"/host",
"last",
"reboot")
if err != nil {
return 0, err
}
lines := strings.Split(out, "\n")
numReboots := 0
for _, line := range lines {
if strings.HasPrefix(line, "reboot ") {
numReboots++
}
}
return numReboots, nil
}
66 changes: 66 additions & 0 deletions internal/oc/debug_test.go
@@ -0,0 +1,66 @@
package oc

import (
"fmt"

"github.com/golang/mock/gomock"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"github.com/openshift/assisted-service/pkg/executer"
)

var _ = Describe("oc debug", func() {
const (
kubeconfig = "kubeconfig"
nodeName = "node1"
)
var (
ctrl *gomock.Controller
execMock *executer.MockExecuter
d Debug
)

BeforeEach(func() {
ctrl = gomock.NewController(GinkgoT())
execMock = executer.NewMockExecuter(ctrl)
d = NewDebugWithExecuter([]byte(kubeconfig), execMock)
})
expectOk := func(ret string) {
execMock.EXPECT().Execute("oc",
"debug",
"--kubeconfig",
gomock.Any(),
fmt.Sprintf("node/%s", nodeName),
"--",
"chroot",
"/host",
"last",
"reboot").Return(ret, "", 0)
}
It("1 reboot", func() {
expectOk("reboot system boot 4.18.0-372.9.1.e Tue Mar 7 04:13 still running\n")
numReboots, err := d.RebootsForNode(nodeName)
Expect(err).ToNot(HaveOccurred())
Expect(numReboots).To(Equal(1))
})
It("2 reboot", func() {
expectOk("reboot system boot 4.18.0-372.9.1.e Tue Mar 7 04:13 still running\nreboot system boot 4.18.0-372.9.1.e Sun Mar 5 07:29 - 09:11 (2+01:41)\n")
numReboots, err := d.RebootsForNode(nodeName)
Expect(err).ToNot(HaveOccurred())
Expect(numReboots).To(Equal(2))
})
It("with error", func() {
execMock.EXPECT().Execute("oc",
"debug",
"--kubeconfig",
gomock.Any(),
fmt.Sprintf("node/%s", nodeName),
"--",
"chroot",
"/host",
"last",
"reboot").Return("", "This is an error", -1)
_, err := d.RebootsForNode(nodeName)
Expect(err).To(HaveOccurred())
})
})
6 changes: 3 additions & 3 deletions internal/spoke_k8s_client/factory.go
Expand Up @@ -41,7 +41,7 @@ func (cf *spokeK8sClientFactory) CreateFromRawKubeconfig(kubeconfig []byte) (Spo
}

func (cf *spokeK8sClientFactory) CreateFromSecret(secret *corev1.Secret) (SpokeK8sClient, error) {
kubeconfigData, err := kubeconfigFromSecret(secret)
kubeconfigData, err := KubeconfigFromSecret(secret)
if err != nil {
return nil, err
}
Expand All @@ -66,7 +66,7 @@ func (cf *spokeK8sClientFactory) CreateFromStorageKubeconfig(ctx context.Context
}

func (cf *spokeK8sClientFactory) ClientAndSetFromSecret(secret *corev1.Secret) (SpokeK8sClient, *kubernetes.Clientset, error) {
kubeconfig, err := kubeconfigFromSecret(secret)
kubeconfig, err := KubeconfigFromSecret(secret)
if err != nil {
cf.log.WithError(err).Error("failed to get kubeconfig from secret")
return nil, nil, err
Expand All @@ -75,7 +75,7 @@ func (cf *spokeK8sClientFactory) ClientAndSetFromSecret(secret *corev1.Secret) (
return cf.clientAndSetForKubeconfig(kubeconfig)
}

func kubeconfigFromSecret(secret *corev1.Secret) ([]byte, error) {
func KubeconfigFromSecret(secret *corev1.Secret) ([]byte, error) {
if secret.Data == nil {
return nil, errors.Errorf("Secret %s/%s does not contain any data", secret.Namespace, secret.Name)
}
Expand Down

0 comments on commit fd61402

Please sign in to comment.