Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions cloud/ociutil/ociutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"net/http"
"strings"
"time"

lb "github.com/oracle/cluster-api-provider-oci/cloud/services/loadbalancer"
Expand Down Expand Up @@ -172,6 +173,40 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string {
return tags
}

// Capacity-exhaustion signatures recognised by IsOutOfHostCapacityError.
//
// Only the documented "Out of host capacity" failure is matched today; extend
// these collections if OCI begins returning further codes or messages.
// Reference: https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/troubleshooting-out-of-host-capacity.htm
var (
	// outOfCapacityErrorCodes is a set of upper-cased service error codes.
	// Callers upper-case the incoming code before the lookup.
	outOfCapacityErrorCodes = map[string]struct{}{
		"OUTOFHOSTCAPACITY": {},
	}

	// outOfCapacityErrorMessages lists lower-cased fragments that are
	// searched for inside the lower-cased service error message.
	outOfCapacityErrorMessages = []string{
		"out of host capacity",
	}
)

// IsOutOfHostCapacityError returns true when the OCI service error indicates that the fault domain ran out of capacity.
func IsOutOfHostCapacityError(err error) bool {
if err == nil {
return false
}
err = errors.Cause(err)
serviceErr, ok := common.IsServiceError(err)
if !ok {
return false
}
code := serviceErr.GetCode()
if _, found := outOfCapacityErrorCodes[strings.ToUpper(code)]; found {
return true
}
message := strings.ToLower(serviceErr.GetMessage())
for _, fragment := range outOfCapacityErrorMessages {
if strings.Contains(message, fragment) {
return true
}
}
return false
}

// DerefString returns the string value if the pointer isn't nil, otherwise returns empty string
func DerefString(s *string) string {
if s != nil {
Expand Down
61 changes: 61 additions & 0 deletions cloud/ociutil/ociutil_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,64 @@ func TestAddToDefaultClusterTags(t *testing.T) {
}
}
}

// TestIsOutOfHostCapacityError exercises IsOutOfHostCapacityError with
// service errors that match by code, match by message, and do not match at
// all, as well as with a plain (non-service) error and a nil error.
func TestIsOutOfHostCapacityError(t *testing.T) {
	testCases := []struct {
		name     string
		err      error
		expected bool
	}{
		{
			name:     "matches by code",
			err:      fakeServiceError{code: "OutOfHostCapacity", message: "any"},
			expected: true,
		},
		{
			name:     "matches by message",
			err:      fakeServiceError{code: "Other", message: "Instance launch failed due to out of host capacity"},
			expected: true,
		},
		{
			// Guards against an implementation that treats every service
			// error as a capacity error.
			name:     "unrelated service error",
			err:      fakeServiceError{code: "NotAuthorizedOrNotFound", message: "resource not found"},
			expected: false,
		},
		{
			name:     "non service error",
			err:      fmt.Errorf("boom"),
			expected: false,
		},
		{
			name:     "nil error",
			err:      nil,
			expected: false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			actual := IsOutOfHostCapacityError(tc.err)
			if actual != tc.expected {
				t.Fatalf("expected %t but got %t for test %s", tc.expected, actual, tc.name)
			}
		})
	}
}

// fakeServiceError is a test double carrying a fixed error code and message.
// It exposes the accessor methods the tests need to pass it where a service
// error is expected.
type fakeServiceError struct {
	code, message string
}

// Error returns the configured message.
func (e fakeServiceError) Error() string { return e.message }

// GetHTTPStatusCode always reports 400.
func (e fakeServiceError) GetHTTPStatusCode() int { return 400 }

// GetMessage returns the configured message.
func (e fakeServiceError) GetMessage() string { return e.message }

// GetCode returns the configured error code.
func (e fakeServiceError) GetCode() string { return e.code }

// GetOpcRequestID always returns an empty request ID.
func (e fakeServiceError) GetOpcRequestID() string { return "" }
119 changes: 112 additions & 7 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"math/big"
"net/url"
"sort"
"strconv"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -82,6 +83,11 @@ type MachineScope struct {
WorkRequestsClient wr.Client
}

// faultDomainAttempt describes one placement to try when launching an
// instance: an availability domain paired with a fault domain. An empty
// FaultDomain means the launch request is sent without a fault domain.
type faultDomainAttempt struct {
	AvailabilityDomain, FaultDomain string
}

// NewMachineScope creates a MachineScope given the MachineScopeParams
func NewMachineScope(params MachineScopeParams) (*MachineScope, error) {
if params.Machine == nil {
Expand Down Expand Up @@ -232,6 +238,7 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
// the random number generated is between zero and two, whereas we need a number between one and three
failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1))
availabilityDomain = m.OCIClusterAccessor.GetFailureDomains()[*failureDomain].Attributes[AvailabilityDomain]
faultDomain = m.OCIClusterAccessor.GetFailureDomains()[*failureDomain].Attributes[FaultDomain]
}

metadata := m.OCIMachine.Spec.Metadata
Expand Down Expand Up @@ -287,14 +294,112 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig()
launchDetails.PlatformConfig = m.getPlatformConfig()
launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments()
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
resp, err := m.ComputeClient.LaunchInstance(ctx, req)
if err != nil {
return nil, err
} else {
return &resp.Instance, nil
// Build the list of availability/fault domain combinations we will try
// when launching the instance (primary FD first, then fallbacks).
faultDomains := m.buildFaultDomainLaunchAttempts(availabilityDomain, faultDomain)
return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains)
}

// launchInstanceWithFaultDomainRetry launches an instance, walking through
// attempts in order until one succeeds.
//
// Each attempt sends a copy of baseDetails whose AvailabilityDomain is
// overridden when the attempt names one, and whose FaultDomain is set to the
// attempt's fault domain (or cleared when the attempt has none). When attempts
// is empty a single attempt using baseDetails' availability domain and no
// fault domain is made.
//
// Only an out-of-host-capacity error (see ociutil.IsOutOfHostCapacityError)
// moves on to the next attempt; any other error, or a capacity error on the
// final attempt, is returned to the caller unchanged.
//
// NOTE(review): the same OPC retry token is sent with every attempt even
// though the placement differs between attempts — confirm OCI accepts a reused
// token with a changed request body.
func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, attempts []faultDomainAttempt) (*core.Instance, error) {
	candidates := attempts
	if len(candidates) == 0 {
		candidates = append(candidates, faultDomainAttempt{
			AvailabilityDomain: ociutil.DerefString(baseDetails.AvailabilityDomain),
		})
	}

	retryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))
	finalIdx := len(candidates) - 1

	var capacityErr error
	for i, candidate := range candidates {
		// Struct assignment copies baseDetails, so per-attempt overrides
		// never leak into the caller's value or into later attempts.
		request := core.LaunchInstanceRequest{
			LaunchInstanceDetails: baseDetails,
			OpcRetryToken:         retryToken,
		}
		if candidate.AvailabilityDomain != "" {
			request.LaunchInstanceDetails.AvailabilityDomain = common.String(candidate.AvailabilityDomain)
		}
		request.LaunchInstanceDetails.FaultDomain = nil
		if candidate.FaultDomain != "" {
			request.LaunchInstanceDetails.FaultDomain = common.String(candidate.FaultDomain)
		}

		resp, err := m.ComputeClient.LaunchInstance(ctx, request)
		switch {
		case err == nil:
			return &resp.Instance, nil
		case !ociutil.IsOutOfHostCapacityError(err), i == finalIdx:
			return nil, err
		}

		capacityErr = err
		m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "faultDomain", candidate.FaultDomain)
	}
	return nil, capacityErr
}

// defaultFaultDomainKey is the de-duplication key used for an attempt that
// carries no fault domain (empty string).
const defaultFaultDomainKey = "__no_fault_domain__"

// buildFaultDomainLaunchAttempts returns the ordered, de-duplicated list of
// placements to try in availabilityDomain.
//
// The first entry always uses initialFaultDomain (which may be empty). When
// availabilityDomain is empty, that single entry is the whole result.
// Otherwise the cluster's failure domains are visited in sorted key order and
// every non-empty fault domain belonging to availabilityDomain is appended.
// Only if that adds nothing are the fault domains recorded for
// availabilityDomain in the cluster's availability-domain map appended
// instead.
func (m *MachineScope) buildFaultDomainLaunchAttempts(availabilityDomain, initialFaultDomain string) []faultDomainAttempt {
	var ordered []faultDomainAttempt
	visited := make(map[string]struct{})

	// enqueue appends a placement for faultDomain unless one was already
	// recorded for it.
	enqueue := func(faultDomain string) {
		dedupeKey := faultDomain
		if dedupeKey == "" {
			dedupeKey = defaultFaultDomainKey
		}
		if _, dup := visited[dedupeKey]; dup {
			return
		}
		visited[dedupeKey] = struct{}{}
		ordered = append(ordered, faultDomainAttempt{
			AvailabilityDomain: availabilityDomain,
			FaultDomain:        faultDomain,
		})
	}

	enqueue(initialFaultDomain)
	if availabilityDomain == "" {
		return ordered
	}

	// First source: failure domains reported by the cluster accessor. Keys
	// are sorted so the resulting order is deterministic.
	clusterFDs := m.OCIClusterAccessor.GetFailureDomains()
	names := make([]string, 0, len(clusterFDs))
	for name := range clusterFDs {
		names = append(names, name)
	}
	sort.Strings(names)
	for _, name := range names {
		attrs := clusterFDs[name].Attributes
		if candidate := attrs[FaultDomain]; candidate != "" && attrs[AvailabilityDomain] == availabilityDomain {
			enqueue(candidate)
		}
	}
	if len(ordered) > 1 {
		return ordered
	}

	// Second source: the fault domains listed for this availability domain
	// in the accessor's availability-domain map. Indexing a nil map is safe
	// and simply reports "not found".
	if ad, found := m.OCIClusterAccessor.GetAvailabilityDomains()[availabilityDomain]; found {
		for _, fd := range ad.FaultDomains {
			enqueue(fd)
		}
	}
	return ordered
}

func (m *MachineScope) getFreeFormTags() map[string]string {
Expand Down
Loading