Support instance-manager pod for v2 data engine on selected nodes
When a Kubernetes node is labeled with node.longhorn.io/disable-v2-data-engine: "true",
the v2 data engine is not supported on that node.

Longhorn 7015

Signed-off-by: Derek Su <derek.su@suse.com>
derekbit authored and David Ko committed Jan 3, 2024
1 parent 420b779 commit 91bc9cb
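For reference, this change lets an operator opt a node out of the v2 data engine with a standard kubectl label, e.g. kubectl label node <node-name> node.longhorn.io/disable-v2-data-engine=true; the label key and the required value "true" are defined as constants in types/types.go below, and removing the label makes the node eligible for the v2 data engine again.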
Showing 4 changed files with 62 additions and 7 deletions.
43 changes: 36 additions & 7 deletions controller/node_controller.go
@@ -1061,22 +1061,41 @@ func (nc *NodeController) syncInstanceManagers(node *longhorn.Node) error {
return fmt.Errorf("instance manager %v nodeID %v is not consistent with the label %v=%v",
im.Name, im.Spec.NodeID, types.GetLonghornLabelKey(types.LonghornLabelNode), im.Labels[types.GetLonghornLabelKey(types.LonghornLabelNode)])
}

runningOrStartingInstanceFound := false
if im.Status.CurrentState == longhorn.InstanceManagerStateRunning && im.DeletionTimestamp == nil {
// nolint:all
for _, instance := range types.ConsolidateInstances(im.Status.InstanceEngines, im.Status.InstanceReplicas, im.Status.Instances) {
if instance.Status.State == longhorn.InstanceStateRunning || instance.Status.State == longhorn.InstanceStateStarting {
runningOrStartingInstanceFound = true
break
}
}
}

cleanupRequired := true

if im.Spec.Image == defaultInstanceManagerImage && im.Spec.DataEngine == dataEngine {
// Create default instance manager if needed.
defaultInstanceManagerCreated = true
cleanupRequired = false

if datastore.IsDataEngineV2(dataEngine) {
disabled, err := nc.ds.IsV2DataEngineDisabledForNode(node.Name)
if err != nil {
return errors.Wrapf(err, "failed to check if v2 data engine is disabled on node %v", node.Name)
}
if disabled && !runningOrStartingInstanceFound {
log.Infof("Cleaning up instance manager %v since v2 data engine is disabled for node %v", im.Name, node.Name)
cleanupRequired = true
}
}
} else {
// Clean up old instance managers if there is no running instance.
if runningOrStartingInstanceFound {
cleanupRequired = false
}

if im.Status.CurrentState == longhorn.InstanceManagerStateUnknown && im.DeletionTimestamp == nil {
cleanupRequired = false
log.Debugf("Skipping cleaning up non-default unknown instance manager %s", im.Name)
@@ -1094,6 +1113,16 @@ func (nc *NodeController) syncInstanceManagers(node *longhorn.Node) error {
if err != nil {
return err
}
if datastore.IsDataEngineV2(dataEngine) {
disabled, err := nc.ds.IsV2DataEngineDisabledForNode(node.Name)
if err != nil {
return errors.Wrapf(err, "failed to check if v2 data engine is disabled on node %v", node.Name)
}
if disabled {
continue
}
}

log.Infof("Creating default instance manager %v, image: %v, dataEngine: %v", imName, defaultInstanceManagerImage, dataEngine)
if _, err := nc.createInstanceManager(node, imName, defaultInstanceManagerImage, imType, dataEngine); err != nil {
return err
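The controller-side rule above can be summarized as: an existing default v2 instance-manager pod is removed only when the node is labeled as disabled and no engine or replica instance on it is still running or starting, and no new default pod is created for a disabled node in the first place. A hypothetical condensed predicate, not part of the commit, capturing the cleanup decision:

// Hypothetical helper (not in the commit): when the node controller should
// clean up an existing default instance manager for the v2 data engine.
func shouldCleanUpDefaultV2InstanceManager(v2DisabledOnNode, runningOrStartingInstanceFound bool) bool {
	// Tear the pod down only once the node opts out of the v2 data engine
	// and no instance on it is running or starting.
	return v2DisabledOnNode && !runningOrStartingInstanceFound
}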
13 changes: 13 additions & 0 deletions datastore/longhorn.go
@@ -4916,3 +4916,16 @@ func (s *DataStore) GetDataEngines() map[longhorn.DataEngineType]struct{} {

return dataEngines
}

// IsV2DataEngineDisabledForNode returns true if the v2 data engine is disabled on the node
func (s *DataStore) IsV2DataEngineDisabledForNode(nodeName string) (bool, error) {
kubeNode, err := s.GetKubernetesNodeRO(nodeName)
if err != nil {
return false, err
}
val, ok := kubeNode.Labels[types.NodeDisableV2DataEngineLabelKey]
if ok && val == types.NodeDisableV2DataEngineLabelKeyTrue {
return true, nil
}
return false, nil
}
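For readers outside the Longhorn codebase, a minimal standalone sketch of the same label check against plain client-go follows; it assumes GetKubernetesNodeRO is roughly a cached read of the corev1 Node, and the package, function, and constant names are local to the sketch:

package sketch

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

const nodeDisableV2DataEngineLabelKey = "node.longhorn.io/disable-v2-data-engine"

// isV2DataEngineDisabledForNode reports whether the node carries the disable
// label with the exact value "true".
func isV2DataEngineDisabledForNode(ctx context.Context, clientset kubernetes.Interface, nodeName string) (bool, error) {
	node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return false, err
	}
	// A missing label or any value other than "true" leaves the v2 data
	// engine enabled on the node.
	return node.Labels[nodeDisableV2DataEngineLabelKey] == "true", nil
}

In a unit test this can be exercised with a fake clientset from k8s.io/client-go/kubernetes/fake seeded with a labeled corev1.Node.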
11 changes: 11 additions & 0 deletions scheduler/replica_scheduler.go
@@ -118,6 +118,17 @@ func (rcs *ReplicaScheduler) getNodeCandidates(nodesInfo map[string]*longhorn.Node

nodeCandidates = map[string]*longhorn.Node{}
for _, node := range nodesInfo {
if datastore.IsDataEngineV2(schedulingReplica.Spec.DataEngine) {
disabled, err := rcs.ds.IsV2DataEngineDisabledForNode(node.Name)
if err != nil {
logrus.WithError(err).Errorf("Failed to check if v2 data engine is disabled on node %v", node.Name)
return nil, util.NewMultiError(longhorn.ErrorReplicaScheduleSchedulingFailed)
}
if disabled {
continue
}
}

if isReady, _ := rcs.ds.CheckDataEngineImageReadiness(schedulingReplica.Spec.Image, schedulingReplica.Spec.DataEngine, node.Name); isReady {
nodeCandidates[node.Name] = node
}
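On the scheduling side, the same check removes opted-out nodes from the candidate set for v2 replicas before image readiness is even evaluated, so no v2 replica can land on a labeled node. A hypothetical generic restatement of the filter (names local to this sketch; the real code operates on *longhorn.Node via the ReplicaScheduler's datastore):

// Hypothetical sketch of the candidate filter added above.
func filterCandidates[N any](nodes map[string]N, replicaIsV2 bool, v2DisabledOnNode func(nodeName string) (bool, error)) (map[string]N, error) {
	candidates := map[string]N{}
	for name, node := range nodes {
		if replicaIsV2 {
			disabled, err := v2DisabledOnNode(name)
			if err != nil {
				return nil, err // the real code converts this into a scheduling MultiError
			}
			if disabled {
				continue // labeled nodes never receive v2 replicas
			}
		}
		candidates[name] = node
	}
	return candidates, nil
}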
2 changes: 2 additions & 0 deletions types/types.go
@@ -124,6 +124,8 @@ const (
NodeCreateDefaultDiskLabelKey = "node.longhorn.io/create-default-disk"
NodeCreateDefaultDiskLabelValueTrue = "true"
NodeCreateDefaultDiskLabelValueConfig = "config"
NodeDisableV2DataEngineLabelKey = "node.longhorn.io/disable-v2-data-engine"
NodeDisableV2DataEngineLabelKeyTrue = "true"
KubeNodeDefaultDiskConfigAnnotationKey = "node.longhorn.io/default-disks-config"
KubeNodeDefaultNodeTagConfigAnnotationKey = "node.longhorn.io/default-node-tags"
