OCPBUGS-17788: Improved error handling for missing MC
The error reported when a MachineConfig is missing is very vague and usually has little to do with the actual cause. Spell out steps users can take to remediate the situation.

Add this in the form of a metric, plus an alert that fires whenever the metric is incremented.

Signed-off-by: Charlie Doern <cdoern@redhat.com>
cdoern committed Jan 12, 2024
1 parent e0dda82 commit a6bff82
Showing 5 changed files with 76 additions and 30 deletions.
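At its core the change is the standard labeled-gauge pattern from client_golang: define a GaugeVec keyed by the MachineConfig name and increment it wherever a lookup comes back NotFound. Below is a minimal, self-contained sketch of that pattern reusing the names from the diff; the explicit MustRegister call and the HTTP exposition are assumptions for illustration only, since the MCD wires its metrics up through its own helpers.

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Mirrors the gauge added in pkg/daemon/metrics.go: one time series per
// missing MachineConfig name, keyed by the "mc" label.
var mcdMissingMC = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "mcd_missing_mc",
		Help: "total number of times a MC was reported missing",
	}, []string{"mc"})

func main() {
	// Assumption for illustration: the real MCD registers its metrics through
	// its own registry helper rather than the default registry used here.
	prometheus.MustRegister(mcdMissingMC)

	// "rendered-worker-example" is a made-up config name; in the daemon the
	// label value comes from the annotation or lister lookup that failed.
	mcdMissingMC.WithLabelValues("rendered-worker-example").Inc()

	// Any non-zero series now satisfies the alert expression `mcd_missing_mc > 0`.
	http.Handle("/metrics", promhttp.Handler())
	_ = http.ListenAndServe(":9090", nil)
}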
13 changes: 13 additions & 0 deletions install/0000_90_machine-config-operator_01_prometheus-rules.yaml
@@ -160,3 +160,16 @@ spec:
Moreover, OOM kill is expected which negatively influences the pod scheduling.
If this happens on container level, the descheduler will not be able to detect it, as it works on the pod level.
To fix this, increase the memory of the affected control plane node.
- name: mcd-missing-mc
  rules:
    - alert: MissingMachineConfig
      expr: |
        mcd_missing_mc > 0
      labels:
        namespace: openshift-machine-config-operator
        severity: warning
      annotations:
        summary: "A rendered MachineConfig expected by the Machine Config Daemon is missing in-cluster. This is most commonly hit during install; the alert fires as soon as the error is seen once."
        description: >-
          Could not find MachineConfig {{ $labels.mc }} in-cluster. This likely indicates that the in-cluster MachineConfigs changed during the install process.
          If you see this while installing the cluster, compare the in-cluster rendered MachineConfigs with /etc/mcs-machine-config-content.json.
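For context on how the alert above evaluates: it fires as soon as any series of the new gauge is non-zero. A hedged sketch of checking that same condition against a Prometheus endpoint with the client_golang query API; the endpoint address is a placeholder, and in a cluster you would typically go through the monitoring route or a port-forward instead.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address for illustration only.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Same expression the MissingMachineConfig alert evaluates.
	result, warnings, err := v1.NewAPI(client).Query(ctx, "mcd_missing_mc > 0", time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	// Any returned sample means at least one MachineConfig was reported missing;
	// its "mc" label carries the config name referenced in the alert description.
	fmt.Println(result)
}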
82 changes: 54 additions & 28 deletions pkg/daemon/daemon.go
@@ -767,8 +767,12 @@ func (dn *Daemon) syncNode(key string) error {
}

// Pass to the shared update prep method
ufc, err := dn.prepUpdateFromCluster()
ufc, err, missingMC := dn.prepUpdateFromCluster()
if err != nil {
if apierrors.IsNotFound(err) {
mcdMissingMC.WithLabelValues(missingMC).Inc()
}

return fmt.Errorf("prepping update: %w", err)
}

@@ -791,7 +795,13 @@ func (dn *Daemon) syncNode(key string) error {
return err
}

if err := dn.triggerUpdate(ufc.currentConfig, ufc.desiredConfig, ufc.currentImage, ufc.desiredImage); err != nil {
if err, missingMC := dn.triggerUpdate(ufc.currentConfig, ufc.desiredConfig, ufc.currentImage, ufc.desiredImage); err != nil {
	// If the MachineConfig was not found, record it so the MissingMachineConfig
	// alert can point the user at more information.
	if apierrors.IsNotFound(err) {
		mcdMissingMC.WithLabelValues(missingMC).Inc()
	}
return err
}
} else {
@@ -1900,8 +1910,12 @@ func (dn *Daemon) checkStateOnFirstRun() error {
// Update our cached copy
dn.node = node

currentConfigName, _ := getNodeAnnotation(dn.node, constants.CurrentMachineConfigAnnotationKey)
state, err := dn.getStateAndConfigs()
if err != nil {
if apierrors.IsNotFound(err) {
mcdMissingMC.WithLabelValues(currentConfigName).Inc()
}
return err
}

@@ -1968,7 +1982,8 @@ func (dn *Daemon) checkStateOnFirstRun() error {

if forceFileExists() {
logSystem("Skipping on-disk validation; %s present", constants.MachineConfigDaemonForceFile)
return dn.triggerUpdate(state.currentConfig, state.desiredConfig, state.currentImage, state.desiredImage)
err, _ := dn.triggerUpdate(state.currentConfig, state.desiredConfig, state.currentImage, state.desiredImage)
return err
}

// When upgrading the OS, it is possible that the SSH key location will
@@ -1999,7 +2014,15 @@ func (dn *Daemon) checkStateOnFirstRun() error {
}
// currentConfig != desiredConfig, and we're not booting up into the desiredConfig.
// Kick off an update.
return dn.triggerUpdate(state.currentConfig, state.desiredConfig, state.currentImage, state.desiredImage)
err, missingMC := dn.triggerUpdate(state.currentConfig, state.desiredConfig, state.currentImage, state.desiredImage)
if err != nil && apierrors.IsNotFound(err) {
	// On first run, the MachineConfig we tried to fetch does not exist. This usually
	// means the rendered content in-cluster no longer matches what was provided, so
	// record it for the MissingMachineConfig alert.
	mcdMissingMC.WithLabelValues(missingMC).Inc()
}
return err
}

func (dn *Daemon) isInDesiredConfig(state *stateAndConfigs) bool {
@@ -2102,18 +2125,21 @@ func (dn *Daemon) runOnceFromMachineConfig(machineConfig mcfgv1.MachineConfig, c
panic("running in onceFrom mode with a remote MachineConfig without a cluster")
}
// NOTE: This case expects a cluster to exist already.
ufc, err := dn.prepUpdateFromCluster()
ufc, err, missingMC := dn.prepUpdateFromCluster()
if err != nil {
if err := dn.nodeWriter.SetDegraded(err); err != nil {
return err
}
if apierrors.IsNotFound(err) {
mcdMissingMC.WithLabelValues(missingMC).Inc()
}
return err
}
if ufc.currentConfig == nil || ufc.desiredConfig == nil {
return nil
}
// At this point we have verified we need to update
if err := dn.triggerUpdateWithMachineConfig(ufc.currentConfig, &machineConfig, false); err != nil {
if err, _ := dn.triggerUpdateWithMachineConfig(ufc.currentConfig, &machineConfig, false); err != nil {
dn.nodeWriter.SetDegraded(err)
return err
}
@@ -2167,39 +2193,39 @@ type updateFromCluster struct {
// prepUpdateFromCluster handles the shared update prepping functionality for
// flows that expect the cluster to already be available. It returns the prepared
// update, an error, and the name of any MachineConfig that could not be found in-cluster.
func (dn *Daemon) prepUpdateFromCluster() (*updateFromCluster, error) {
func (dn *Daemon) prepUpdateFromCluster() (*updateFromCluster, error, string) {
desiredConfigName, err := getNodeAnnotationExt(dn.node, constants.DesiredMachineConfigAnnotationKey, true)
if err != nil {
return nil, err
return nil, err, ""
}

desiredConfig, err := dn.mcLister.Get(desiredConfigName)
if err != nil {
return nil, err
return nil, err, desiredConfigName
}
// currentConfig is always expected to be there as loadNodeAnnotations
// is one of the very first calls when the daemon starts.
currentConfigName, err := getNodeAnnotation(dn.node, constants.CurrentMachineConfigAnnotationKey)
if err != nil {
return nil, err
return nil, err, ""
}
currentConfig, err := dn.mcLister.Get(currentConfigName)
if err != nil {
return nil, err
return nil, err, currentConfigName
}
state, err := getNodeAnnotation(dn.node, constants.MachineConfigDaemonStateAnnotationKey)
if err != nil {
return nil, err
return nil, err, ""
}

odc, err := dn.getCurrentConfigOnDisk()
if err != nil && !os.IsNotExist(err) {
return nil, err
return nil, err, ""
}

desiredImage, err := getNodeAnnotationExt(dn.node, constants.DesiredImageAnnotationKey, true)
if err != nil {
return nil, err
return nil, err, ""
}

if odc != nil && odc.currentConfig.GetName() != currentConfig.GetName() {
@@ -2208,13 +2234,13 @@ func (dn *Daemon) prepUpdateFromCluster() (*updateFromCluster, error) {
desiredConfig: desiredConfig,
currentImage: odc.currentImage,
desiredImage: desiredImage,
}, nil
}, nil, ""
}

if odc == nil {
odc, err = dn.getCurrentConfigFromNode()
if err != nil {
return nil, err
return nil, err, ""
}
}

@@ -2224,7 +2250,7 @@ func (dn *Daemon) prepUpdateFromCluster() (*updateFromCluster, error) {
if state == constants.MachineConfigDaemonStateDone {
// No actual update to the config
klog.V(2).Info("No updating is required")
return nil, nil
return nil, nil, ""
}
// This seems like it shouldn't happen...let's just warn for now.
klog.Warningf("current+desiredConfig is %s but state is %s", currentConfigName, state)
@@ -2234,7 +2260,7 @@ func (dn *Daemon) prepUpdateFromCluster() (*updateFromCluster, error) {
if state == constants.MachineConfigDaemonStateDone {
// No actual update to the config
klog.V(2).Info("No updating is required")
return nil, nil
return nil, nil, ""
}
// This seems like it shouldn't happen...let's just warn for now.
klog.Warningf("current+desiredConfig is %s, current+desiredImage is %s but state is %s", currentConfigName, odc.currentImage, state)
@@ -2246,7 +2272,7 @@ func (dn *Daemon) prepUpdateFromCluster() (*updateFromCluster, error) {
desiredConfig: desiredConfig,
currentImage: odc.currentImage,
desiredImage: desiredImage,
}, nil
}, nil, ""
}

// completeUpdate marks the node as schedulable again, then deletes the
@@ -2284,7 +2310,7 @@ func (dn *Daemon) completeUpdate(desiredConfigName string) error {
return nil
}

func (dn *Daemon) triggerUpdate(currentConfig, desiredConfig *mcfgv1.MachineConfig, currentImage, desiredImage string) error {
func (dn *Daemon) triggerUpdate(currentConfig, desiredConfig *mcfgv1.MachineConfig, currentImage, desiredImage string) (error, string) {
// If both of the image annotations are empty, this is a regular MachineConfig update.
if desiredImage == "" && currentImage == "" {
return dn.triggerUpdateWithMachineConfig(currentConfig, desiredConfig, true)
@@ -2297,39 +2323,39 @@ func (dn *Daemon) triggerUpdate(currentConfig, desiredConfig *mcfgv1.MachineConf
// non-layered config without admin intervention; so we should emit an error
// for now.
if desiredImage == "" && currentImage != "" {
return fmt.Errorf("rolling back from a layered to non-layered configuration is not currently supported")
return fmt.Errorf("rolling back from a layered to non-layered configuration is not currently supported"), ""
}

// Shut down the Config Drift Monitor since we'll be performing an update
// and the config will "drift" while the update is occurring.
dn.stopConfigDriftMonitor()

klog.Infof("Performing layered OS update")
return dn.updateOnClusterBuild(currentConfig, desiredConfig, currentImage, desiredImage, true)
return dn.updateOnClusterBuild(currentConfig, desiredConfig, currentImage, desiredImage, true), ""
}

// triggerUpdateWithMachineConfig starts the update. It queries the cluster for
// the current and desired config if they weren't passed.
func (dn *Daemon) triggerUpdateWithMachineConfig(currentConfig, desiredConfig *mcfgv1.MachineConfig, skipCertificateWrite bool) error {
func (dn *Daemon) triggerUpdateWithMachineConfig(currentConfig, desiredConfig *mcfgv1.MachineConfig, skipCertificateWrite bool) (error, string) {
if currentConfig == nil {
ccAnnotation, err := getNodeAnnotation(dn.node, constants.CurrentMachineConfigAnnotationKey)
if err != nil {
return err
return err, ""
}
currentConfig, err = dn.mcLister.Get(ccAnnotation)
if err != nil {
return err
return err, ccAnnotation
}
}

if desiredConfig == nil {
dcAnnotation, err := getNodeAnnotation(dn.node, constants.DesiredMachineConfigAnnotationKey)
if err != nil {
return err
return err, ""
}
desiredConfig, err = dn.mcLister.Get(dcAnnotation)
if err != nil {
return err
return err, dcAnnotation
}
}

@@ -2338,7 +2364,7 @@ func (dn *Daemon) triggerUpdateWithMachineConfig(currentConfig, desiredConfig *m
dn.stopConfigDriftMonitor()

// run the update process. this function doesn't currently return.
return dn.update(currentConfig, desiredConfig, skipCertificateWrite)
return dn.update(currentConfig, desiredConfig, skipCertificateWrite), ""
}

// validateKernelArguments checks that the current boot has all arguments specified
2 changes: 1 addition & 1 deletion pkg/daemon/daemon_test.go
@@ -528,7 +528,7 @@ func TestPrepUpdateFromClusterOnDiskDrift(t *testing.T) {
dn.node = node
dn.currentConfigPath = currentConfigPath
dn.currentImagePath = currentImagePath
ufc, err := dn.prepUpdateFromCluster()
ufc, err, _ := dn.prepUpdateFromCluster()
test.verify(t, ufc, err)
})
}
6 changes: 6 additions & 0 deletions pkg/daemon/metrics.go
@@ -57,6 +57,12 @@ var (
Name: "mcd_config_drift",
Help: "timestamp for config drift",
})
// mcdMissingMC counts, per MachineConfig name, how often that config was reported missing in-cluster
mcdMissingMC = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "mcd_missing_mc",
Help: "total number of times a MC was reported missing",
}, []string{"mc"})
)

// Updates metric with new labels & timestamp, deletes any existing
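Because the new gauge is a package-level variable, its behavior is easy to cover in a unit test next to the existing ones in pkg/daemon. A sketch using client_golang's testutil helpers, assuming the test lives in the daemon package; the config name below is a made-up example.

package daemon

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestMCDMissingMCMetric(t *testing.T) {
	// Hypothetical rendered config name, used only as a label value here.
	const mc = "rendered-worker-deadbeef"

	before := testutil.ToFloat64(mcdMissingMC.WithLabelValues(mc))
	mcdMissingMC.WithLabelValues(mc).Inc()

	if got := testutil.ToFloat64(mcdMissingMC.WithLabelValues(mc)); got != before+1 {
		t.Fatalf("expected mcd_missing_mc{mc=%q} to increase by 1, got %v (was %v)", mc, got, before)
	}
}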
3 changes: 2 additions & 1 deletion pkg/daemon/update.go
@@ -193,7 +193,8 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string
}

// currentConfig != desiredConfig, kick off an update
return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig, true)
err, _ = dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig, true)
return err
}

func setRunningKargsWithCmdline(config *mcfgv1.MachineConfig, requestedKargs []string, cmdline []byte) error {