From 38a506432fc81ed212acbfecbe17a0e859e333cf Mon Sep 17 00:00:00 2001 From: Chin2691 Date: Tue, 19 May 2026 14:30:15 +0530 Subject: [PATCH] fix: OCPBUGS-86073: validate duplicate failureDomain names and topology (Nutanix, vSphere) The installer does not detect when multiple failure domains point to the same underlying infrastructure. A user can configure two failure domains with different names but identical topology (same Prism Element and subnets on Nutanix, or same server/datacenter/computeCluster/ datastore/networks/resourcePool on vSphere). The installer accepts this without any warning, giving users a false sense of zone-level fault tolerance when none exists. Additionally, Nutanix failure domain validation did not check for duplicate names, unlike vSphere which already had this check. This commit adds: - Nutanix: reject failure domains with duplicate names - Nutanix: reject failure domains with identical topology (same prismElement UUID and subnet UUIDs) - vSphere: reject failure domains with identical topology (same server, datacenter, computeCluster, datastore, networks, and resourcePool) All three scenarios were confirmed on live 4.22 RC3 clusters before this fix. Co-authored-by: Cursor --- pkg/types/nutanix/validation/platform.go | 30 +++++++++- pkg/types/nutanix/validation/platform_test.go | 59 +++++++++++++++++++ pkg/types/vsphere/validation/platform.go | 26 ++++++++ pkg/types/vsphere/validation/platform_test.go | 12 +++- 4 files changed, 125 insertions(+), 2 deletions(-) diff --git a/pkg/types/nutanix/validation/platform.go b/pkg/types/nutanix/validation/platform.go index b981d3555a..73cdad526e 100644 --- a/pkg/types/nutanix/validation/platform.go +++ b/pkg/types/nutanix/validation/platform.go @@ -3,6 +3,8 @@ package validation import ( "fmt" "regexp" + "sort" + "strings" "k8s.io/apimachinery/pkg/util/validation/field" @@ -103,11 +105,28 @@ func ValidatePlatform(p *nutanix.Platform, fldPath *field.Path, c *types.Install if err != nil { allErrs = append(allErrs, field.InternalError(fldPath.Child("failureDomain", "name"), fmt.Errorf("fail to compile the pattern %q: %w", pattern, err))) } else { - for _, fd := range p.FailureDomains { + fdNames := make(map[string]int) + fdTopologies := make(map[string]string) + + for idx, fd := range p.FailureDomains { if !rexp.MatchString(fd.Name) { allErrs = append(allErrs, field.Invalid(fldPath.Child("failureDomain", "name"), fd.Name, fmt.Sprintf("failureDomain name should match the pattern %q.", pattern))) } + if prevIdx, exists := fdNames[fd.Name]; exists { + allErrs = append(allErrs, field.Duplicate(fldPath.Child("failureDomains").Index(idx).Child("name"), fmt.Sprintf("failure domain name %q is already used by failureDomains[%d]", fd.Name, prevIdx))) + } else { + fdNames[fd.Name] = idx + } + + topoKey := nutanixFailureDomainTopologyKey(fd) + if prevName, exists := fdTopologies[topoKey]; exists { + allErrs = append(allErrs, field.Invalid(fldPath.Child("failureDomains").Index(idx), fd.Name, + fmt.Sprintf("failure domain %q has identical topology (same prismElement and subnets) as %q; this provides no additional fault tolerance", fd.Name, prevName))) + } else { + fdTopologies[topoKey] = fd.Name + } + if fd.PrismElement.UUID == "" { allErrs = append(allErrs, field.Required(fldPath.Child("failureDomain", "prismElement", "uuid"), "failureDomain prismElement uuid cannot be empty")) } @@ -153,6 +172,15 @@ func validateLoadBalancer(lbType configv1.PlatformLoadBalancerType) bool { } } +// nutanixFailureDomainTopologyKey builds a comparable key from the infrastructure-defining +// fields of a failure domain: Prism Element UUID and sorted subnet UUIDs. +func nutanixFailureDomainTopologyKey(fd nutanix.FailureDomain) string { + subnets := make([]string, len(fd.SubnetUUIDs)) + copy(subnets, fd.SubnetUUIDs) + sort.Strings(subnets) + return fmt.Sprintf("pe=%s;subnets=%s", fd.PrismElement.UUID, strings.Join(subnets, ",")) +} + // validateSubnets validates the input subnetUUIDs meet the configuration requirements. func validateSubnets(fldPath *field.Path, subnetUUIDs []string) field.ErrorList { var errs field.ErrorList diff --git a/pkg/types/nutanix/validation/platform_test.go b/pkg/types/nutanix/validation/platform_test.go index 7ac9019697..966897af1d 100644 --- a/pkg/types/nutanix/validation/platform_test.go +++ b/pkg/types/nutanix/validation/platform_test.go @@ -389,6 +389,65 @@ func TestValidatePlatform(t *testing.T) { }(), expectedError: `^test-path\.failureDomain\.dataSourceImages: Required value: failureDomain "fd-1": both the dataSourceImage's uuid and name are empty, you need to configure one\.$`, }, + { + name: "failureDomain with duplicate name", + platform: func() *nutanix.Platform { + p := validPlatform() + p.FailureDomains = []nutanix.FailureDomain{ + { + Name: "fd-1", + PrismElement: nutanix.PrismElement{UUID: "fd-pe-uuid-1", Endpoint: nutanix.PrismEndpoint{Address: "fd-pe-1", Port: 9440}}, + SubnetUUIDs: []string{"fd-subnet-uuid-1"}, + }, + { + Name: "fd-1", + PrismElement: nutanix.PrismElement{UUID: "fd-pe-uuid-2", Endpoint: nutanix.PrismEndpoint{Address: "fd-pe-2", Port: 9440}}, + SubnetUUIDs: []string{"fd-subnet-uuid-2"}, + }, + } + return p + }(), + expectedError: `test-path\.failureDomains\[1\]\.name: Duplicate value: "failure domain name \\"fd-1\\" is already used by failureDomains\[0\]"`, + }, + { + name: "failureDomain with duplicate topology same prismElement and subnet", + platform: func() *nutanix.Platform { + p := validPlatform() + p.FailureDomains = []nutanix.FailureDomain{ + { + Name: "fd-1", + PrismElement: nutanix.PrismElement{UUID: "fd-pe-uuid", Endpoint: nutanix.PrismEndpoint{Address: "fd-pe", Port: 9440}}, + SubnetUUIDs: []string{"fd-subnet-uuid"}, + }, + { + Name: "fd-2", + PrismElement: nutanix.PrismElement{UUID: "fd-pe-uuid", Endpoint: nutanix.PrismEndpoint{Address: "fd-pe", Port: 9440}}, + SubnetUUIDs: []string{"fd-subnet-uuid"}, + }, + } + return p + }(), + expectedError: `test-path\.failureDomains\[1\]: Invalid value: "fd-2": failure domain "fd-2" has identical topology \(same prismElement and subnets\) as "fd-1"; this provides no additional fault tolerance`, + }, + { + name: "valid failureDomain with different prismElements", + platform: func() *nutanix.Platform { + p := validPlatform() + p.FailureDomains = []nutanix.FailureDomain{ + { + Name: "fd-1", + PrismElement: nutanix.PrismElement{UUID: "fd-pe-uuid-1", Endpoint: nutanix.PrismEndpoint{Address: "fd-pe-1", Port: 9440}}, + SubnetUUIDs: []string{"fd-subnet-uuid"}, + }, + { + Name: "fd-2", + PrismElement: nutanix.PrismElement{UUID: "fd-pe-uuid-2", Endpoint: nutanix.PrismEndpoint{Address: "fd-pe-2", Port: 9440}}, + SubnetUUIDs: []string{"fd-subnet-uuid"}, + }, + } + return p + }(), + }, { name: "valid failureDomain", platform: func() *nutanix.Platform { diff --git a/pkg/types/vsphere/validation/platform.go b/pkg/types/vsphere/validation/platform.go index 75209d8bc0..77b24b0009 100644 --- a/pkg/types/vsphere/validation/platform.go +++ b/pkg/types/vsphere/validation/platform.go @@ -5,6 +5,7 @@ import ( "net" "path/filepath" "regexp" + "sort" "strings" "github.com/sirupsen/logrus" @@ -163,6 +164,7 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl var associatedVCenter *vsphere.VCenter zoneNames := make(map[string]string) + fdTopologies := make(map[string]string) for index, failureDomain := range p.FailureDomains { if regionName, ok := zoneNames[failureDomain.Zone]; !ok { @@ -171,6 +173,14 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl allErrs = append(allErrs, field.Invalid(fldPath.Child("zone"), failureDomain.Zone, fmt.Sprintf("cannot be used more than once for the failure domain region %q", failureDomain.Region))) } + topoKey := vsphereFailureDomainTopologyKey(failureDomain) + if prevName, exists := fdTopologies[topoKey]; exists { + allErrs = append(allErrs, field.Invalid(fldPath.Index(index), failureDomain.Name, + fmt.Sprintf("failure domain %q has identical topology (same server, datacenter, computeCluster, datastore, networks, resourcePool) as %q; this provides no additional fault tolerance", failureDomain.Name, prevName))) + } else { + fdTopologies[topoKey] = failureDomain.Name + } + if failureDomain.ZoneType == "" && failureDomain.RegionType == "" { logrus.Debug("using the defaults regionType is Datacenter and zoneType is ComputeCluster") } @@ -340,6 +350,22 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl return allErrs } +// vsphereFailureDomainTopologyKey builds a comparable key from the infrastructure-defining +// fields of a failure domain topology: server, datacenter, compute cluster, datastore, +// networks, and resource pool. +func vsphereFailureDomainTopologyKey(fd vsphere.FailureDomain) string { + networks := make([]string, len(fd.Topology.Networks)) + copy(networks, fd.Topology.Networks) + sort.Strings(networks) + return fmt.Sprintf("server=%s;dc=%s;cluster=%s;ds=%s;nets=%s;rp=%s", + fd.Server, + fd.Topology.Datacenter, + fd.Topology.ComputeCluster, + fd.Topology.Datastore, + strings.Join(networks, ","), + fd.Topology.ResourcePool) +} + // validateDiskType checks that the specified diskType is valid. func validateDiskType(p *vsphere.Platform, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} diff --git a/pkg/types/vsphere/validation/platform_test.go b/pkg/types/vsphere/validation/platform_test.go index 099188dd99..767977d59c 100644 --- a/pkg/types/vsphere/validation/platform_test.go +++ b/pkg/types/vsphere/validation/platform_test.go @@ -378,7 +378,7 @@ func TestValidatePlatform(t *testing.T) { for i := range p.FailureDomains { p.FailureDomains[i].Topology.ComputeCluster = "/test-datacenter/host/HostClusterFolder/test-cluster" - p.FailureDomains[i].Topology.ResourcePool = "/test-datacenter/host/HostClusterFolder/test-cluster/Resources/test-resourcepool" + p.FailureDomains[i].Topology.ResourcePool = fmt.Sprintf("/test-datacenter/host/HostClusterFolder/test-cluster/Resources/test-resourcepool-%d", i) p.FailureDomains[i].Topology.Datastore = "/test-datacenter/datastore/StorageFolder/test-datastore" p.FailureDomains[i].Topology.Folder = "/test-datacenter/vm/VMFolder/test-folder" } @@ -552,6 +552,16 @@ func TestValidatePlatform(t *testing.T) { }(), expectedError: `test-path\.failureDomains\.name\[1\]: Duplicate value: "test-east-1a"`, }, + { + name: "Multi-zone platform failureDomain duplicate topology", + platform: func() *vsphere.Platform { + p := validPlatform() + p.FailureDomains[1].Topology = p.FailureDomains[0].Topology + p.FailureDomains[1].Server = p.FailureDomains[0].Server + return p + }(), + expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`, + }, { name: "Multi-zone platform failureDomain zone missing tag category", platform: func() *vsphere.Platform {