Tighten the rules for modifying Tuned Profiles
NTO operands allow updating Tuned Profiles.  This is intentional and by design,
as some host information needs to be communicated back to the NTO operator,
but it also allows a successful local attacker to potentially affect the
node-level configuration of other cluster nodes.

This change addresses the situation in two ways.  First, scoped RBAC
permissions on the Profile.status subresource are used to disallow node-level
write access to Profile.spec.  Second, status is reported back to NTO via the
Node resource: the operand uses the kubelet credential to write an annotation
on its Node object.
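
Below is a minimal, hypothetical sketch of the second mechanism: an operand
publishing its calculated kernel command line by annotating its own Node with
the kubelet credential.  The kubeconfig path, the NODE_NAME variable, the
helper name and the sample value are assumptions; only the annotation key
matches the tuned.openshift.io/bootcmdline key introduced by this commit.

    // Illustrative only, not the actual operand code.
    package main

    import (
        "context"
        "encoding/json"
        "fmt"
        "os"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/apimachinery/pkg/types"
        "k8s.io/client-go/kubernetes"
        "k8s.io/client-go/tools/clientcmd"
    )

    const bootcmdlineAnnotation = "tuned.openshift.io/bootcmdline"

    func annotateNodeBootcmdline(nodeName, bootcmdline string) error {
        // The operand mounts /var/lib/kubelet (see ds-tuned.yaml below); the
        // exact kubeconfig path used here is an assumption.
        cfg, err := clientcmd.BuildConfigFromFlags("", "/var/lib/kubelet/kubeconfig")
        if err != nil {
            return fmt.Errorf("failed to load kubelet kubeconfig: %v", err)
        }
        client, err := kubernetes.NewForConfig(cfg)
        if err != nil {
            return err
        }

        // Merge-patch only the annotation; the kubelet credential may update
        // its own Node object, so the bootcmdline no longer needs to flow
        // through Profile.status.
        patch, err := json.Marshal(map[string]interface{}{
            "metadata": map[string]interface{}{
                "annotations": map[string]string{
                    bootcmdlineAnnotation: bootcmdline,
                },
            },
        })
        if err != nil {
            return err
        }
        _, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeName,
            types.MergePatchType, patch, metav1.PatchOptions{})
        return err
    }

    func main() {
        // NODE_NAME is assumed to be injected into the operand pod.
        if err := annotateNodeBootcmdline(os.Getenv("NODE_NAME"), "skew_tick=1"); err != nil {
            fmt.Fprintln(os.Stderr, err)
            os.Exit(1)
        }
    }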

This change also simplifies the mechanism for accepting kernel
command-line parameters as calculated by the NTO operands.  Now, all NTO
operands must agree on the calculated kernel command-line parameters.
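
The agreement rule can be illustrated with a small, assumed helper (the actual
check added by this commit is allNodesAgreeOnBootcmdline in
pkg/operator/controller.go below): a MachineConfig is synced only when every
node in the pool reports the same calculated bootcmdline.

    // bootcmdlinesAgree is an illustrative helper (name assumed); it returns
    // true only when all nodes report an identical bootcmdline value.
    func bootcmdlinesAgree(bootcmdlineByNode map[string]string) bool {
        first := ""
        seen := false
        for _, cmdline := range bootcmdlineByNode {
            if !seen {
                first, seen = cmdline, true
                continue
            }
            if cmdline != first {
                return false
            }
        }
        return true
    }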

ClusterOperator/node-tuning now also reports the operand version.  The
operand version changes only when all operand replicas have successfully
upgraded and are ready.  This information is used to block the PPC
reconciliation loop until the operator and operand RELEASE_VERSION agree.
This is a short-term measure to prevent multiple node reboots during
upgrades.
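
For illustration only, a sketch of such a version gate; the package and helper
names are assumptions, while the operand name matches the TunedOperandName
constant ("openshift-tuned") added by this commit.

    // Hypothetical sketch, not the actual PPC code: reconciliation is blocked
    // (requeued) until the operand version reported in ClusterOperator/node-tuning
    // matches the operator's own RELEASE_VERSION.
    package versiongate

    import (
        "os"

        configv1 "github.com/openshift/api/config/v1"
    )

    // operandVersionMatches returns true only when the "openshift-tuned" operand
    // entry in ClusterOperator status carries the operator's RELEASE_VERSION.
    func operandVersionMatches(co *configv1.ClusterOperator) bool {
        release := os.Getenv("RELEASE_VERSION")
        for _, v := range co.Status.Versions {
            if v.Name == "openshift-tuned" { // TunedOperandName in this commit
                return v.Version == release
            }
        }
        return false
    }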

Other fixes:
  - Disallow MachineConfig updates during upgrades when the kernel
    command-line parameters of nodes within an MCP do not match.
  - The ClusterOperator/node-tuning object sometimes reported incorrect
    status based on a stale Metadata.Generation.

Resolves: OCPBUGS-17585
jmencak committed Aug 30, 2023
1 parent 5517330 commit c756bbd
Showing 14 changed files with 317 additions and 131 deletions.
8 changes: 8 additions & 0 deletions assets/tuned/manifests/ds-tuned.yaml
@@ -65,6 +65,10 @@ spec:
name: lib-modules
mountPropagation: HostToContainer
readOnly: true
- mountPath: /var/lib/kubelet
name: var-lib-kubelet
mountPropagation: HostToContainer
readOnly: true
- mountPath: /host
name: host
mountPropagation: HostToContainer
@@ -116,6 +120,10 @@ spec:
path: /lib/modules
type: Directory
name: lib-modules
- hostPath:
path: /var/lib/kubelet
type: Directory
name: var-lib-kubelet
- name: host
hostPath:
path: /
6 changes: 2 additions & 4 deletions manifests/20-profile.crd.yaml
@@ -81,10 +81,6 @@ spec:
is for internal use only and its fields may be changed/removed in the
future.
properties:
bootcmdline:
description: kernel parameters calculated by tuned for the active
Tuned profile
type: string
conditions:
description: conditions represents the state of the per-node Profile
application
@@ -126,3 +122,5 @@ spec:
type: object
served: true
storage: true
subresources:
status: {}
5 changes: 4 additions & 1 deletion manifests/40-rbac.yaml
@@ -154,7 +154,10 @@ rules:
verbs: ["get","list","watch"]
- apiGroups: ["tuned.openshift.io"]
resources: ["profiles"]
verbs: ["get","list","update","watch","patch"]
verbs: ["get","list","watch"]
- apiGroups: ["tuned.openshift.io"]
resources: ["profiles/status"]
verbs: ["update"]
- apiGroups: ["security.openshift.io"]
resources: ["securitycontextconstraints"]
verbs: ["use"]
23 changes: 7 additions & 16 deletions pkg/apis/tuned/v1/tuned_types.go
@@ -19,18 +19,12 @@ const (
// that reflects the node tuning operator status.
TunedClusterOperatorResourceName = "node-tuning"

// Annotation on Profiles to denote the operand version responsible for calculating and reporting
// the Profile status.
GeneratedByOperandVersionAnnotationKey string = "tuned.openshift.io/generated-by-operand-version"

// Tuned 'TunedRenderedResourceName' CR's .metadata.generation. This annotation is used on resources
// to note the Tuned 'TunedRenderedResourceName' generation based on which the resources with this
// annotation were created/updated.
RendredTunedGenerationAnnotationKey string = "tuned.openshift.io/rendered-tuned-generation"

// The value of this annotation is the TuneD profile based on which the resource with this annotation was
// created/updated.
TunedProfileAnnotationKey string = "tuned.openshift.io/tuned-profile"
// Name of the NTO operand for versioning in ClusterOperator.
TunedOperandName = "openshift-tuned"

// TunedBootcmdlineAnnotationKey is a Node-specific annotation denoting kernel command-line parameters
// calculated by TuneD for the current profile applied to that Node.
TunedBootcmdlineAnnotationKey string = "tuned.openshift.io/bootcmdline"
)

/////////////////////////////////////////////////////////////////////////////////
@@ -146,6 +140,7 @@ type TunedList struct {
/////////////////////////////////////////////////////////////////////////////////
// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// +kubebuilder:subresource:status

// Profile is a specification for a Profile resource.
type Profile struct {
@@ -176,10 +171,6 @@ type ProfileConfig struct {
// ProfileStatus is the status for a Profile resource; the status is for internal use only
// and its fields may be changed/removed in the future.
type ProfileStatus struct {
// kernel parameters calculated by tuned for the active Tuned profile
// +optional
Bootcmdline string `json:"bootcmdline"`

// the current profile in use by the Tuned daemon
TunedProfile string `json:"tunedProfile"`

119 changes: 58 additions & 61 deletions pkg/operator/controller.go
@@ -655,6 +655,10 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {

metrics.ProfileCalculated(profileMf.Name, tunedProfileName)

if err != nil {
return fmt.Errorf("failed to get Tuned %s: %v", tunedv1.TunedRenderedResourceName, err)
}

profile, err := c.listers.TunedProfiles.Get(profileMf.Name)
if err != nil {
if errors.IsNotFound(err) {
@@ -672,7 +676,6 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
profileMf.Spec.Config.Debug = operand.Debug
profileMf.Spec.Config.TuneDConfig = operand.TuneDConfig
profileMf.Status.Conditions = tunedpkg.InitializeStatusConditions()
c.pc.state.profileLastUpdatedGeneration[profileMf.Name] = 1
_, err = c.clients.Tuned.TunedV1().Profiles(ntoconfig.WatchNamespace()).Create(context.TODO(), profileMf, metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("failed to create Profile %s: %v", profileMf.Name, err)
@@ -697,27 +700,18 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
return fmt.Errorf("failed to get ProviderName: %v", err)
}

// profileLastUpdatedGeneration is set for each Profile when it is created or updated.
// In the case where it is not already set due to an operator restart, set it and
// clear profile.Status.Bootcmdline instead of syncing MachineConfigs.
_, lastUpdateGenSet := c.pc.state.profileLastUpdatedGeneration[profile.Name]

if ntoconfig.InHyperShift() {
// nodePoolName is the name of the NodePool which the Node corresponding to this Profile
// is a part of. If nodePoolName is the empty string, it either means that Node label
// based matching was used, or we don't know the NodePool, so we should not sync the
// MachineConfigs.
if nodePoolName != "" && lastUpdateGenSet {
if profile.Generation > c.pc.state.profileLastUpdatedGeneration[profile.Name]+1 {
klog.Infof("refusing to sync MachineConfig because Profile %s is more than 1 generation ahead of last update by operator", profile.Name)
} else {
if profile.Status.TunedProfile == tunedProfileName && profileApplied(profile) {
// Synchronize MachineConfig only once the (calculated) TuneD profile 'tunedProfileName'
// has been successfully applied.
err := c.syncMachineConfigHyperShift(nodePoolName, profile)
if err != nil {
return fmt.Errorf("failed to update Profile %s: %v", profile.Name, err)
}
if nodePoolName != "" {
if profile.Status.TunedProfile == tunedProfileName && profileApplied(profile) {
// Synchronize MachineConfig only once the (calculated) TuneD profile 'tunedProfileName'
// has been successfully applied.
err := c.syncMachineConfigHyperShift(nodePoolName, profile)
if err != nil {
return fmt.Errorf("failed to update Profile %s: %v", profile.Name, err)
}
}
}
@@ -729,21 +723,19 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
if profile.Status.TunedProfile == tunedProfileName && profileApplied(profile) {
// Synchronize MachineConfig only once the (calculated) TuneD profile 'tunedProfileName'
// has been successfully applied.
err := c.syncMachineConfig(getMachineConfigNameForPools(pools), mcLabels, profile)
err := c.syncMachineConfig(pools, mcLabels, profile)
if err != nil {
return fmt.Errorf("failed to update Profile %s: %v", profile.Name, err)
}
}
}
}

// Only update Profile if the spec needs to be changed OR if !lastUpdateGenSet.
// Update, triggering the operand to update the Profile status.
// Only update Profile if the spec needs to be changed.
if profile.Spec.Config.TunedProfile == tunedProfileName &&
profile.Spec.Config.Debug == operand.Debug &&
reflect.DeepEqual(profile.Spec.Config.TuneDConfig, operand.TuneDConfig) &&
profile.Spec.Config.ProviderName == providerName &&
lastUpdateGenSet {
profile.Spec.Config.ProviderName == providerName {
klog.V(2).Infof("syncProfile(): no need to update Profile %s", nodeName)
return nil
}
@@ -754,9 +746,7 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
profile.Spec.Config.ProviderName = providerName
profile.Status.Conditions = tunedpkg.InitializeStatusConditions()

c.pc.state.profileLastUpdatedGeneration[profile.Name] = profile.Generation + 1 // after this update

klog.V(2).Infof("syncProfile(): updating Profile %s [%s]. Last operator updated generation: %d", profile.Name, tunedProfileName, c.pc.state.profileLastUpdatedGeneration[profile.Name])
klog.V(2).Infof("syncProfile(): updating Profile %s [%s]", profile.Name, tunedProfileName)
_, err = c.clients.Tuned.TunedV1().Profiles(ntoconfig.WatchNamespace()).Update(context.TODO(), profile, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("failed to update Profile %s: %v", profile.Name, err)
@@ -775,25 +765,30 @@ func (c *Controller) getProviderName(nodeName string) (string, error) {
return util.GetProviderName(node.Spec.ProviderID), nil
}

func (c *Controller) syncMachineConfig(name string, labels map[string]string, profile *tunedv1.Profile) error {
func (c *Controller) syncMachineConfig(pools []*mcfgv1.MachineConfigPool, labels map[string]string, profile *tunedv1.Profile) error {
var (
kernelArguments []string
)

if v := profile.ObjectMeta.Annotations[tunedv1.GeneratedByOperandVersionAnnotationKey]; v != os.Getenv("RELEASE_VERSION") {
// This looks like an update triggered by an old (not-yet-upgraded) operand. Ignore it.
klog.Infof("refusing to sync MachineConfig %q due to Profile %q change generated by operand version %q", name, profile.Name, v)
name := getMachineConfigNameForPools(pools)
nodes, err := c.pc.getNodesForPool(pools[0])
if err != nil {
return err
}

bootcmdline, bootcmdlineSet := c.pc.state.bootcmdline[profile.Name]
if !bootcmdlineSet {
klog.V(2).Infof("syncMachineConfig(): bootcmdline for %s not cached, sync cancelled", profile.Name)
return nil
}

bootcmdline := profile.Status.Bootcmdline
if ok := c.allNodesAgreeOnBootcmdline(nodes); !ok {
return fmt.Errorf("not all %d Nodes in MCP %v agree on bootcmdline: %s", len(nodes), pools[0].ObjectMeta.Name, bootcmdline)
}

kernelArguments = util.SplitKernelArguments(bootcmdline)

annotations := map[string]string{
GeneratedByControllerVersionAnnotationKey: version.Version,
tunedv1.RendredTunedGenerationAnnotationKey: profile.ObjectMeta.Annotations[tunedv1.RendredTunedGenerationAnnotationKey],
tunedv1.TunedProfileAnnotationKey: profile.Status.TunedProfile,
}
annotations := map[string]string{GeneratedByControllerVersionAnnotationKey: version.Version}

mc, err := c.listers.MachineConfigs.Get(name)
if err != nil {
@@ -823,17 +818,6 @@ func (c *Controller) syncMachineConfig(name string, labels map[string]string, pr
// No update needed
klog.V(2).Infof("syncMachineConfig(): MachineConfig %s doesn't need updating", mc.ObjectMeta.Name)
return nil
} else if mcAnnotationsMatch(mc.ObjectMeta.Annotations, annotations) {
// Kernel arguments differ and they were generated based on the same TuneD profile and Tuned/rendered CR.
machineCount, err := c.pc.getMachineCountForMachineConfigPool(mc.Name[len(MachineConfigPrefix)+1:])
if err != nil {
return err
} else if machineCount > 1 {
klog.Warningf("refusing to update MachineConfig %s for %s due to kernel arguments change with unchanged input configuration (%s/%s). Node(s) with different (CPU) topology in the same MCP?",
mc.Name, profile.Name, annotations[tunedv1.RendredTunedGenerationAnnotationKey], annotations[tunedv1.TunedProfileAnnotationKey])
c.bootcmdlineConflict[profile.Name] = true
return nil
}
}
mc = mc.DeepCopy() // never update the objects from cache
mc.ObjectMeta.Annotations = mcNew.ObjectMeta.Annotations
@@ -852,6 +836,26 @@ func (c *Controller) syncMachineConfig(name string, labels map[string]string, pr
return nil
}

// allNodesAgreeOnBootcmdline returns true if the current cached annotation 'TunedBootcmdlineAnnotationKey'
// of all Nodes in slice 'nodes' has the same value.
func (c *Controller) allNodesAgreeOnBootcmdline(nodes []*corev1.Node) bool {
if len(nodes) == 0 {
return true
}

match := true
bootcmdline := c.pc.state.bootcmdline[nodes[0].ObjectMeta.Name]
for _, node := range nodes[1:] {
if bootcmdline != c.pc.state.bootcmdline[node.ObjectMeta.Name] {
klog.V(2).Infof("found a conflicting bootcmdline %q for Node %q", c.pc.state.bootcmdline[node.ObjectMeta.Name], node.ObjectMeta.Name)
c.bootcmdlineConflict[node.ObjectMeta.Name] = true
match = false
}
}

return match
}

func (c *Controller) syncMachineConfigHyperShift(nodePoolName string, profile *tunedv1.Profile) error {
var (
kernelArguments []string
@@ -860,13 +864,16 @@ func (c *Controller) syncMachineConfigHyperShift(nodePoolName string, profile *t
mcName := MachineConfigPrefix + "-" + nodePoolName
configMapName := mcConfigMapName(nodePoolName)

if v := profile.ObjectMeta.Annotations[tunedv1.GeneratedByOperandVersionAnnotationKey]; v != os.Getenv("RELEASE_VERSION") {
// This looks like an update triggered by an old (not-yet-upgraded) operand. Ignore it.
klog.Infof("refusing to sync MachineConfig ConfigMap %q due to Profile %q change generated by operand version %q", configMapName, profile.Name, v)
return nil
nodes, err := c.getNodesForNodePool(nodePoolName)
if err != nil {
return fmt.Errorf("could not fetch a list of Nodes for NodePool %s: %v", nodePoolName, err)
}

bootcmdline := c.pc.state.bootcmdline[profile.Name]
if ok := c.allNodesAgreeOnBootcmdline(nodes); !ok {
return fmt.Errorf("not all %d Nodes in NodePool %v agree on bootcmdline: %s", len(nodes), nodePoolName, bootcmdline)
}

bootcmdline := profile.Status.Bootcmdline
kernelArguments = util.SplitKernelArguments(bootcmdline)

annotations := map[string]string{GeneratedByControllerVersionAnnotationKey: version.Version}
@@ -1401,16 +1408,6 @@ func (c *Controller) NeedLeaderElection() bool {
return true
}

func mcAnnotationsMatch(old, new map[string]string) bool {
for _, key := range []string{tunedv1.RendredTunedGenerationAnnotationKey, tunedv1.TunedProfileAnnotationKey} {
if old[key] != new[key] {
return false
}
}

return true
}

func tunedMapFromList(tuneds []tunedv1.Tuned) map[string]tunedv1.Tuned {
ret := map[string]tunedv1.Tuned{}
for _, tuned := range tuneds {
18 changes: 18 additions & 0 deletions pkg/operator/hypershift.go
@@ -16,6 +16,7 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
serializer "k8s.io/apimachinery/pkg/runtime/serializer/json"
yamlutil "k8s.io/apimachinery/pkg/util/yaml"
@@ -172,6 +173,23 @@ func (c *Controller) getObjFromTunedConfigMap() ([]tunedv1.Tuned, error) {
return cmTuneds, nil
}

// getNodesForNodePool uses 'hypershiftNodePoolLabel' to return all Nodes which are in
// the NodePool 'nodePoolName'.
func (c *Controller) getNodesForNodePool(nodePoolName string) ([]*corev1.Node, error) {
nodes := []*corev1.Node{}

selector := labels.SelectorFromValidatedSet(
map[string]string{
hypershiftNodePoolLabel: nodePoolName,
})
nodes, err := c.pc.listers.Nodes.List(selector)
if err != nil {
return nodes, err
}

return nodes, err
}

// parseManifests parses a YAML or JSON document that may contain one or more
// kubernetes resources.
func parseTunedManifests(data []byte, nodePoolName string) ([]tunedv1.Tuned, error) {
30 changes: 30 additions & 0 deletions pkg/operator/mc.go
@@ -259,6 +259,36 @@ func (pc *ProfileCalculator) getPoolsForNode(node *corev1.Node) ([]*mcfgv1.Machi
return []*mcfgv1.MachineConfigPool{worker}, nil
}

// getNodesForPool returns a list of Nodes for MachineConfigPool 'pool'.
func (pc *ProfileCalculator) getNodesForPool(pool *mcfgv1.MachineConfigPool) ([]*corev1.Node, error) {
selector, err := metav1.LabelSelectorAsSelector(pool.Spec.NodeSelector)
if err != nil {
return nil, fmt.Errorf("invalid label selector %s in MachineConfigPool %s: %v", util.ObjectInfo(selector), pool.ObjectMeta.Name, err)
}

initialNodes, err := pc.listers.Nodes.List(selector)
if err != nil {
return nil, err
}

nodes := []*corev1.Node{}
for _, n := range initialNodes {
p, err := pc.getPrimaryPoolForNode(n)
if err != nil {
klog.Warningf("cannot get pool for node %q: %v", n.Name, err)
continue
}
if p == nil {
continue
}
if p.Name != pool.Name {
continue
}
nodes = append(nodes, n)
}
return nodes, nil
}

// getPrimaryPoolForNode uses getPoolsForNode and returns the first one which is the one the node targets
func (pc *ProfileCalculator) getPrimaryPoolForNode(node *corev1.Node) (*mcfgv1.MachineConfigPool, error) {
pools, err := pc.getPoolsForNode(node)
