Skip to content

Commit

Permalink
MGMT-15902: Trigger reboots for node event when day2 node moves to done
Browse files Browse the repository at this point in the history
This is done for kube api client only when spoke kubeconfig is available.
The implementation is done using debug command (as in oc command).
The number of reboots are counted by 'last reboot' linux command.
  • Loading branch information
ori-amizur committed Nov 5, 2023
1 parent 69d831a commit 8f31d44
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 3 deletions.
1 change: 1 addition & 0 deletions cmd/main.go
Expand Up @@ -610,6 +610,7 @@ func main() {
ApproveCsrsRequeueDuration: Options.ApproveCsrsRequeueDuration,
AgentContainerImage: Options.BMConfig.AgentDockerImg,
HostFSMountDir: hostFSMountDir,
EventSender: eventsHandler,
}).SetupWithManager(ctrlMgr), "unable to create controller Agent")

failOnError((&controllers.BMACReconciler{
Expand Down
32 changes: 32 additions & 0 deletions internal/controller/controllers/agent_controller.go
Expand Up @@ -34,8 +34,11 @@ import (
aiv1beta1 "github.com/openshift/assisted-service/api/v1beta1"
"github.com/openshift/assisted-service/internal/bminventory"
"github.com/openshift/assisted-service/internal/common"
"github.com/openshift/assisted-service/internal/common/events"
eventsapi "github.com/openshift/assisted-service/internal/events/api"
"github.com/openshift/assisted-service/internal/gencrypto"
"github.com/openshift/assisted-service/internal/host"
"github.com/openshift/assisted-service/internal/oc"
"github.com/openshift/assisted-service/internal/spoke_k8s_client"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/auth"
Expand Down Expand Up @@ -89,6 +92,7 @@ type AgentReconciler struct {
AgentContainerImage string
HostFSMountDir string
reclaimer *agentReclaimer
EventSender eventsapi.Sender
}

// +kubebuilder:rbac:groups=agent-install.openshift.io,resources=agents,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -484,6 +488,31 @@ func (r *AgentReconciler) spokeKubeClient(ctx context.Context, clusterRef *aiv1b
return r.SpokeK8sClientFactory.CreateFromSecret(secret)
}

func (r *AgentReconciler) notifyNumberOfReboots(ctx context.Context, h *models.Host, node *corev1.Node, clusterRef *aiv1beta1.ClusterReference) error {
if !isNodeReady(node) {
return nil
}
secret, err := spokeKubeconfigSecret(ctx, r.Log, r.Client, r.APIReader, clusterRef)
if err != nil {
r.Log.WithError(err).Errorf("failed to get spoke secret for cluster %s/%s", clusterRef.Namespace, clusterRef.Name)
return err
}
kubeconfig, err := spoke_k8s_client.KubeconfigFromSecret(secret)
if err != nil {
r.Log.WithError(err).Errorf("failed to get kubeconfig from secret for cluster %s/%s", clusterRef.Namespace, clusterRef.Name)
return err
}
dbg := oc.NewDebug(kubeconfig)
reboots, err := dbg.RebootsForNode(node.Name)
if err != nil {
r.Log.WithError(err).Errorf("failed to get number of reboots for node %s, cluster %s/%s", node.Name,
clusterRef.Namespace, clusterRef.Name)
return err
}
events.SendRebootsForNodeEvent(ctx, r.EventSender, *h.ID, node.Name, h.InfraEnvID, h.ClusterID, int64(reboots))
return nil
}

// Attempt to approve CSRs for agent. If already approved then the node will be marked as done
// requeue means that approval will be attempted again
func (r *AgentReconciler) tryApproveDay2CSRs(ctx context.Context, agent *aiv1beta1.Agent, node *corev1.Node, client spoke_k8s_client.SpokeK8sClient) {
Expand Down Expand Up @@ -787,6 +816,9 @@ func (r *AgentReconciler) updateStatus(ctx context.Context, log logrus.FieldLogg
log.WithError(err).Errorf("Failed to apply labels for day2 node %s/%s", agent.Namespace, agent.Name)
return ctrl.Result{RequeueAfter: defaultRequeueAfterOnError}, err
}
if err = r.notifyNumberOfReboots(ctx, h, node, agent.Spec.ClusterDeploymentName); err != nil {
log.WithError(err).Errorf("Failed to notify number of reboots for day2 node %s/%s", agent.Namespace, agent.Name)
}
if err = r.UpdateDay2InstallPogress(ctx, h, agent, node); err != nil {
return ctrl.Result{RequeueAfter: defaultRequeueAfterOnError}, err
}
Expand Down
84 changes: 84 additions & 0 deletions internal/oc/debug.go
@@ -0,0 +1,84 @@
package oc

import (
"io"
"os"
"strings"

"github.com/go-errors/errors"
"github.com/openshift/assisted-service/pkg/executer"
)

type Debug interface {
RebootsForNode(nodeName string) (int, error)
}

type debug struct {
kubeconfig []byte
}

func NewDebug(kubeconfig []byte) Debug {
return &debug{
kubeconfig: kubeconfig,
}
}

func (d *debug) kubeconfigFile() (string, error) {
var n int

f, err := os.CreateTemp("", "kubeconfig")
if err != nil {
return "", err
}
defer f.Close()
numWritten := 0
for n, err = f.Write(d.kubeconfig[numWritten:]); err == io.ErrShortWrite; n, err = f.Write(d.kubeconfig[numWritten:]) {
numWritten += n
}
if err != nil {
return "", err
}
return f.Name(), nil
}

func (d *debug) runDebug(entity string, args ...string) (string, string, error) {
kubeconfigFname, err := d.kubeconfigFile()
if err != nil {
return "", "", err
}
defer func() {
_ = os.RemoveAll(kubeconfigFname)
}()
execArgs := append([]string{
"debug",
"--kubeconfig",
kubeconfigFname,
entity,
"--",
}, args...)
exec := &executer.CommonExecuter{}
out, errOut, exitCode := exec.Execute("oc", execArgs...)
if exitCode != 0 {
return "", "", errors.Errorf("oc debug failed to execute: %s", errOut)
}
return out, errOut, nil
}

func (d *debug) RebootsForNode(nodeName string) (int, error) {
out, _, err := d.runDebug("node/"+nodeName,
"chroot",
"/host",
"last",
"reboot")
if err != nil {
return 0, err
}
lines := strings.Split(out, "\n")
numReboots := 0
for _, line := range lines {
if strings.HasPrefix(line, "reboot ") {
numReboots++
}
}
return numReboots, nil
}
6 changes: 3 additions & 3 deletions internal/spoke_k8s_client/factory.go
Expand Up @@ -41,7 +41,7 @@ func (cf *spokeK8sClientFactory) CreateFromRawKubeconfig(kubeconfig []byte) (Spo
}

func (cf *spokeK8sClientFactory) CreateFromSecret(secret *corev1.Secret) (SpokeK8sClient, error) {
kubeconfigData, err := kubeconfigFromSecret(secret)
kubeconfigData, err := KubeconfigFromSecret(secret)
if err != nil {
return nil, err
}
Expand All @@ -66,7 +66,7 @@ func (cf *spokeK8sClientFactory) CreateFromStorageKubeconfig(ctx context.Context
}

func (cf *spokeK8sClientFactory) ClientAndSetFromSecret(secret *corev1.Secret) (SpokeK8sClient, *kubernetes.Clientset, error) {
kubeconfig, err := kubeconfigFromSecret(secret)
kubeconfig, err := KubeconfigFromSecret(secret)
if err != nil {
cf.log.WithError(err).Error("failed to get kubeconfig from secret")
return nil, nil, err
Expand All @@ -75,7 +75,7 @@ func (cf *spokeK8sClientFactory) ClientAndSetFromSecret(secret *corev1.Secret) (
return cf.clientAndSetForKubeconfig(kubeconfig)
}

func kubeconfigFromSecret(secret *corev1.Secret) ([]byte, error) {
func KubeconfigFromSecret(secret *corev1.Secret) ([]byte, error) {
if secret.Data == nil {
return nil, errors.Errorf("Secret %s/%s does not contain any data", secret.Namespace, secret.Name)
}
Expand Down

0 comments on commit 8f31d44

Please sign in to comment.