diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index 93976190..93e85403 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "net/http" + "strings" "time" lb "github.com/oracle/cluster-api-provider-oci/cloud/services/loadbalancer" @@ -172,6 +173,40 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string { return tags } +// NOTE: Currently we only key off the documented "Out of host capacity" error. +// If OCI starts surfacing additional codes/messages we can expand this list. +// Reference: https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/troubleshooting-out-of-host-capacity.htm +var outOfCapacityErrorCodes = map[string]struct{}{ + "OUTOFHOSTCAPACITY": {}, +} + +var outOfCapacityErrorMessages = []string{ + "out of host capacity", +} + +// IsOutOfHostCapacityError returns true when the OCI service error indicates that the fault domain ran out of capacity. +func IsOutOfHostCapacityError(err error) bool { + if err == nil { + return false + } + err = errors.Cause(err) + serviceErr, ok := common.IsServiceError(err) + if !ok { + return false + } + code := serviceErr.GetCode() + if _, found := outOfCapacityErrorCodes[strings.ToUpper(code)]; found { + return true + } + message := strings.ToLower(serviceErr.GetMessage()) + for _, fragment := range outOfCapacityErrorMessages { + if strings.Contains(message, fragment) { + return true + } + } + return false +} + // DerefString returns the string value if the pointer isn't nil, otherwise returns empty string func DerefString(s *string) string { if s != nil { diff --git a/cloud/ociutil/ociutil_test.go b/cloud/ociutil/ociutil_test.go index 37449c0b..dee7f5a5 100644 --- a/cloud/ociutil/ociutil_test.go +++ b/cloud/ociutil/ociutil_test.go @@ -83,3 +83,64 @@ func TestAddToDefaultClusterTags(t *testing.T) { } } } + +func TestIsOutOfHostCapacityError(t *testing.T) { + testCases := []struct { + name string + err error + expected bool + }{ + { + name: "matches by code", + err: fakeServiceError{code: "OutOfHostCapacity", message: "any"}, + expected: true, + }, + { + name: "matches by message", + err: fakeServiceError{code: "Other", message: "Instance launch failed due to out of host capacity"}, + expected: true, + }, + { + name: "non service error", + err: fmt.Errorf("boom"), + expected: false, + }, + { + name: "nil error", + err: nil, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual := IsOutOfHostCapacityError(tc.err) + if actual != tc.expected { + t.Fatalf("expected %t but got %t for test %s", tc.expected, actual, tc.name) + } + }) + } +} + +type fakeServiceError struct { + code string + message string +} + +func (f fakeServiceError) Error() string { + return f.message +} + +func (f fakeServiceError) GetHTTPStatusCode() int { + return 400 +} + +func (f fakeServiceError) GetMessage() string { + return f.message +} + +func (f fakeServiceError) GetCode() string { + return f.code +} + +func (f fakeServiceError) GetOpcRequestID() string { return "" } diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 64d58163..96fa3578 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -23,6 +23,7 @@ import ( "fmt" "math/big" "net/url" + "sort" "strconv" "github.com/go-logr/logr" @@ -82,6 +83,11 @@ type MachineScope struct { WorkRequestsClient wr.Client } +type faultDomainAttempt struct { + AvailabilityDomain string + FaultDomain string +} + // NewMachineScope creates a MachineScope given the MachineScopeParams func NewMachineScope(params MachineScopeParams) (*MachineScope, error) { if params.Machine == nil { @@ -232,6 +238,7 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, // the random number generated is between zero and two, whereas we need a number between one and three failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) availabilityDomain = m.OCIClusterAccessor.GetFailureDomains()[*failureDomain].Attributes[AvailabilityDomain] + faultDomain = m.OCIClusterAccessor.GetFailureDomains()[*failureDomain].Attributes[FaultDomain] } metadata := m.OCIMachine.Spec.Metadata @@ -287,14 +294,112 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig() launchDetails.PlatformConfig = m.getPlatformConfig() launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() - req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails, - OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))} - resp, err := m.ComputeClient.LaunchInstance(ctx, req) - if err != nil { - return nil, err - } else { - return &resp.Instance, nil + // Build the list of availability/fault domain combinations we will try + // when launching the instance (primary FD first, then fallbacks). + faultDomains := m.buildFaultDomainLaunchAttempts(availabilityDomain, faultDomain) + return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) +} + +func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, attempts []faultDomainAttempt) (*core.Instance, error) { + if len(attempts) == 0 { + attempts = append(attempts, faultDomainAttempt{ + AvailabilityDomain: ociutil.DerefString(baseDetails.AvailabilityDomain), + }) + } + + opcRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) + var lastErr error + for idx, attempt := range attempts { + details := baseDetails + if attempt.AvailabilityDomain != "" { + details.AvailabilityDomain = common.String(attempt.AvailabilityDomain) + } + if attempt.FaultDomain != "" { + details.FaultDomain = common.String(attempt.FaultDomain) + } else { + details.FaultDomain = nil + } + + resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ + LaunchInstanceDetails: details, + OpcRetryToken: opcRetryToken, + }) + if err == nil { + return &resp.Instance, nil + } + lastAttempt := idx == len(attempts)-1 + if !ociutil.IsOutOfHostCapacityError(err) || lastAttempt { + return nil, err + } + lastErr = err + m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "faultDomain", attempt.FaultDomain) } + return nil, lastErr +} + +const defaultFaultDomainKey = "__no_fault_domain__" + +func (m *MachineScope) buildFaultDomainLaunchAttempts(availabilityDomain, initialFaultDomain string) []faultDomainAttempt { + var attempts []faultDomainAttempt + seen := make(map[string]bool) + addAttempt := func(fd string) { + key := fd + if fd == "" { + key = defaultFaultDomainKey + } + if seen[key] { + return + } + seen[key] = true + attempts = append(attempts, faultDomainAttempt{ + AvailabilityDomain: availabilityDomain, + FaultDomain: fd, + }) + } + + addAttempt(initialFaultDomain) + if availabilityDomain == "" { + return attempts + } + + // Prefer fault domains exposed via the Cluster status. This respects + // Cluster API's scheduling decisions before falling back to raw OCI data. + failureDomains := m.OCIClusterAccessor.GetFailureDomains() + if len(failureDomains) > 0 { + keys := make([]string, 0, len(failureDomains)) + for key := range failureDomains { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + spec := failureDomains[key] + if spec.Attributes[AvailabilityDomain] != availabilityDomain { + continue + } + fd := spec.Attributes[FaultDomain] + if fd == "" { + continue + } + addAttempt(fd) + } + } + + if len(attempts) > 1 { + return attempts + } + + // If the cluster status didn't enumerate any additional fault domains, + // fall back to the cached availability-domain data gathered from OCI so we + // can still iterate through every physical fault domain in that AD. + if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { + if adEntry, ok := adMap[availabilityDomain]; ok { + for _, fd := range adEntry.FaultDomains { + addAttempt(fd) + } + } + } + + return attempts } func (m *MachineScope) getFreeFormTags() map[string]string { diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index c7086394..1593386f 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -320,6 +320,130 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.LaunchInstanceResponse{}, nil) }, }, + { + name: "retries alternate fault domains on capacity errors", + errorExpected: false, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{ + Instance: core.Instance{ + Id: common.String("instance-id"), + }, + }, nil) + + gomock.InOrder(first, second) + }, + }, + { + name: "returns error after exhausting all fault domains", + errorExpected: true, + errorSubStringMatch: true, + matchError: errors.New("out of host capacity"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd2")) + + gomock.InOrder(first, second) + }, + }, + { + name: "retries fault domains discovered from availability domain cache", + errorExpected: false, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + }, + }, + } + ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ + "ad1": { + Name: "ad1", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2"}, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + third := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{ + Instance: core.Instance{ + Id: common.String("instance-id"), + }, + }, nil) + + gomock.InOrder(first, second, third) + }, + }, { name: "check all params together", errorExpected: false, @@ -1487,6 +1611,26 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error { return nil } +func faultDomainRequestMatcher(request interface{}, matchStr string) error { + r, ok := request.(core.LaunchInstanceRequest) + if !ok { + return errors.New("expecting LaunchInstanceRequest type") + } + if matchStr == "" { + if r.LaunchInstanceDetails.FaultDomain != nil { + return errors.New("expected fault domain to be nil") + } + return nil + } + if r.LaunchInstanceDetails.FaultDomain == nil { + return errors.New("fault domain was nil") + } + if *r.LaunchInstanceDetails.FaultDomain != matchStr { + return errors.New(fmt.Sprintf("expecting fault domain %s but got %s", matchStr, *r.LaunchInstanceDetails.FaultDomain)) + } + return nil +} + func platformConfigMatcher(actual interface{}, expected core.PlatformConfig) error { r, ok := actual.(core.LaunchInstanceRequest) if !ok { @@ -2925,3 +3069,35 @@ func setupAllParams(ms *MachineScope) { ociCluster.Spec.OCIResourceIdentifier = "resource_uid" ms.OCIMachine.UID = "machineuid" } + +type testServiceError struct { + code string + message string +} + +func newOutOfCapacityServiceError(message string) error { + return testServiceError{ + code: "OutOfHostCapacity", + message: message, + } +} + +func (t testServiceError) Error() string { + return t.message +} + +func (t testServiceError) GetHTTPStatusCode() int { + return 400 +} + +func (t testServiceError) GetMessage() string { + return t.message +} + +func (t testServiceError) GetCode() string { + return t.code +} + +func (t testServiceError) GetOpcRequestID() string { + return "" +}