api/v1/lib/extras/scheduler/controller/rules.go

package controller

import (
	"context"
	"fmt"
	"log"
	"time"

	. "github.com/mesos/mesos-go/api/v1/lib/extras/scheduler/eventrules"
	"github.com/mesos/mesos-go/api/v1/lib/extras/store"
	"github.com/mesos/mesos-go/api/v1/lib/scheduler"
	"github.com/mesos/mesos-go/api/v1/lib/scheduler/calls"
)

// ErrEvent is the error type that LiftErrors produces when Mesos delivers an ERROR event;
// its value is the message carried by that event.
type ErrEvent string

// Error implements the error interface by returning the message verbatim.
func (e ErrEvent) Error() string { return string(e) }

// LiftErrors returns a Rule that extracts the message of a scheduler ERROR event and passes it
// to the rest of the chain as an ErrEvent, so that downstream rules/handlers may continue
// processing. An error already present in the chain is forwarded untouched, and every other
// event type passes through with a nil error.
func LiftErrors() Rule {
	return func(ctx context.Context, e *scheduler.Event, err error, chain Chain) (context.Context, *scheduler.Event, error) {
		// Only lift when there is no pre-existing error to preserve.
		if err == nil && e.GetType() == scheduler.Event_ERROR {
			// It's recommended that we abort and re-try subscribing; surfacing an
			// error here is intended to make the event loop terminate so that the
			// connection is reset.
			err = ErrEvent(e.GetError().GetMessage())
		}
		return chain(ctx, e, err)
	}
}

// StateError reports a state transition problem that the system cannot resolve; a program that
// receives one should most likely exit.
type StateError string

// Error implements the error interface by returning the underlying message.
func (e StateError) Error() string {
	return string(e)
}

// TrackSubscription returns a Rule that reconciles the framework ID announced by a SUBSCRIBED
// event with the one persisted in frameworkIDStore.
//
// Behavior for a SUBSCRIBED event (all other events, and any pre-existing chain error, are
// forwarded unchanged):
//   - an error from frameworkIDStore.Get other than store.ErrNotFound is forwarded to the chain;
//   - an empty framework ID in the event yields a StateError;
//   - if a non-empty stored ID differs from the event's ID while failoverTimeout > 0, a
//     StateError is produced and the store is left untouched;
//   - otherwise, when the IDs differ, the event's ID is written to the store, and a failure
//     of that write is forwarded to the chain instead of being silently discarded.
func TrackSubscription(frameworkIDStore store.Singleton, failoverTimeout time.Duration) Rule {
	return func(ctx context.Context, e *scheduler.Event, err error, chain Chain) (context.Context, *scheduler.Event, error) {
		if err != nil {
			return chain(ctx, e, err)
		}
		if e.GetType() != scheduler.Event_SUBSCRIBED {
			return chain(ctx, e, nil)
		}

		// A missing stored ID is expected on first subscription, so ErrNotFound is tolerated.
		storedFrameworkID, getErr := frameworkIDStore.Get()
		if getErr != nil && getErr != store.ErrNotFound {
			return chain(ctx, e, getErr)
		}
		frameworkID := e.GetSubscribed().GetFrameworkID().GetValue()

		// order of `if` statements are important: tread carefully w/ respect to future changes
		if frameworkID == "" {
			// sanity check, should **never** happen
			return chain(ctx, e, StateError("mesos sent an empty frameworkID?!"))
		}
		if storedFrameworkID != "" && storedFrameworkID != frameworkID && failoverTimeout > 0 {
			return chain(ctx, e, StateError(fmt.Sprintf(
				"frameworkID changed unexpectedly; failover exceeded timeout? (%s).", failoverTimeout)))
		}
		if storedFrameworkID != frameworkID {
			// Persisting the ID can fail; surface that to downstream rules rather than
			// continuing as if the ID had been recorded.
			if setErr := frameworkIDStore.Set(frameworkID); setErr != nil {
				return chain(ctx, e, setErr)
			}
		}
		return chain(ctx, e, nil)
	}
}

// AckStatusUpdates returns a Rule that acknowledges task status updates back to Mesos using the
// given caller. Only updates tagged with a UUID are acknowledged. If sending the ack fails the
// event is dropped (not passed to the rest of the chain); otherwise the incoming err param
// (if any) is forwarded. It delegates to AckStatusUpdatesF with a lookup func that always
// yields caller.
func AckStatusUpdates(caller calls.Caller) Rule {
	fixedCaller := func() calls.Caller { return caller }
	return AckStatusUpdatesF(fixedCaller)
}

// AckStatusUpdatesF is a functional adapter for AckStatusUpdates, useful for cases where the caller
// may change over time: callerLookup is consulted each time an ack is about to be sent. A failure
// to send the ack is wrapped in a *calls.AckError, merged with any incoming error via Error2, and
// returned without invoking the rest of the chain.
func AckStatusUpdatesF(callerLookup func() calls.Caller) Rule {
	return func(ctx context.Context, e *scheduler.Event, err error, chain Chain) (context.Context, *scheduler.Event, error) {
		// Acks are attempted aggressively: a pre-existing error in the chain does not
		// prevent us from trying to acknowledge the status update.
		if e.GetType() != scheduler.Event_UPDATE {
			return chain(ctx, e, err)
		}

		status := e.GetUpdate().GetStatus()
		statusUUID := status.GetUUID()
		if len(statusUUID) == 0 {
			// only non-empty UUID's are ACK'd, as per mesos scheduler spec
			return chain(ctx, e, err)
		}

		ack := calls.Acknowledge(
			status.GetAgentID().GetValue(),
			status.TaskID.Value,
			statusUUID,
		)
		if callErr := calls.CallNoData(ctx, callerLookup(), ack); callErr != nil {
			// TODO(jdef): not sure how important this is; if there's an error ack'ing
			// because we became disconnected, then we'll just reconnect later and
			// Mesos will ask us to ACK anyway -- why pay special attention to these
			// call failures vs others?
			ackErr := &calls.AckError{Ack: ack, Cause: callErr}
			return ctx, e, Error2(err, ackErr) // drop (do not propagate to chain)
		}
		return chain(ctx, e, err)
	}
}

// DefaultEventLabel is the label that LogEvents hands to DefaultEventLogger when no logging
// func is supplied; the resulting logger prints it as the first argument of each log line.
const DefaultEventLabel = "event"

// DefaultEventLogger returns a func that writes scheduler events via the standard `log` package.
// A non-empty eventLabel is printed ahead of the event; with an empty label only the event
// itself is printed.
func DefaultEventLogger(eventLabel string) func(*scheduler.Event) {
	if eventLabel != "" {
		return func(e *scheduler.Event) {
			log.Println(eventLabel, e)
		}
	}
	return func(e *scheduler.Event) {
		log.Println(e)
	}
}

// LogEvents returns a Rule that hands every scheduler event to f and then continues the chain
// with the event and error unchanged. When f is nil, DefaultEventLogger(DefaultEventLabel) is
// used in its place.
func LogEvents(f func(*scheduler.Event)) Rule {
	logEvent := f
	if logEvent == nil {
		logEvent = DefaultEventLogger(DefaultEventLabel)
	}
	return func(ctx context.Context, e *scheduler.Event, err error, chain Chain) (context.Context, *scheduler.Event, error) {
		// Logging happens unconditionally, including when err is non-nil.
		logEvent(e)
		return chain(ctx, e, err)
	}
}

// AckOperationUpdates returns a Rule that acknowledges offer operation status updates sent to the
// scheduler by the master, using the given caller. The AgentID isn't part of the event reported by
// the master, so it cannot be included in the generated ACK. It delegates to AckOperationUpdatesF
// with a lookup func that always yields caller.
func AckOperationUpdates(caller calls.Caller) Rule {
	fixedCaller := func() calls.Caller { return caller }
	return AckOperationUpdatesF(fixedCaller)
}

// AckOperationUpdatesF is a functional adapter for AckOperationUpdates, useful for cases where the
// caller may change over time: callerLookup is consulted each time an ack is about to be sent.
// A failure to send the ack is wrapped in a *calls.AckError, merged with any incoming error via
// Error2, and returned without invoking the rest of the chain. The rule panics if an update
// carries a UUID but an empty operation ID.
func AckOperationUpdatesF(callerLookup func() calls.Caller) Rule {
	return func(ctx context.Context, e *scheduler.Event, err error, chain Chain) (context.Context, *scheduler.Event, error) {
		// Acks are attempted aggressively: a pre-existing error in the chain does not
		// prevent us from trying to acknowledge the offer operation status update.
		if e.GetType() != scheduler.Event_UPDATE_OPERATION_STATUS {
			return chain(ctx, e, err)
		}

		status := e.GetUpdateOperationStatus().GetStatus()
		statusUUID := status.GetUUID().GetValue()
		if len(statusUUID) == 0 {
			// only non-empty UUID's are ACK'd, as per mesos scheduler spec.
			return chain(ctx, e, err)
		}

		// Receiving this offer operation status update means that the framework supplied
		// an operation_id to the master when executing the offer operation, therefore the
		// operation_id included in the status object here should be non-empty.
		operationID := status.GetOperationID().GetValue()
		if operationID == "" {
			panic("expected non-empty offer operation ID for offer operation status update")
		}

		// Try to extract a resource provider ID; we can safely assume that all converted
		// resources are for the same provider ID (including a non-specified one), so the
		// first non-empty one wins.
		var providerID string
		converted := status.GetConvertedResources()
		for i := range converted {
			if candidate := converted[i].GetProviderID().GetValue(); candidate != "" {
				providerID = candidate
				break
			}
		}

		ack := calls.AcknowledgeOperationStatus(
			"",         // agentID: optional
			providerID, // optional
			statusUUID,
			operationID,
		)
		if callErr := calls.CallNoData(ctx, callerLookup(), ack); callErr != nil {
			// TODO(jdef): not sure how important this is; if there's an error ack'ing
			// because we became disconnected, then we'll just reconnect later and
			// Mesos will ask us to ACK anyway -- why pay special attention to these
			// call failures vs others?
			ackErr := &calls.AckError{Ack: ack, Cause: callErr}
			return ctx, e, Error2(err, ackErr) // drop (do not propagate to chain)
		}
		return chain(ctx, e, err)
	}
}