This repository has been archived by the owner on Oct 9, 2023. It is now read-only.
/
handler.go
261 lines (236 loc) · 10.3 KB
/
handler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
package controller
import (
"context"
"fmt"
"reflect"
"runtime/debug"
"time"
"github.com/lyft/flytestdlib/contextutils"
"github.com/lyft/flytestdlib/promutils/labeled"
"github.com/lyft/flytepropeller/pkg/apis/flyteworkflow/v1alpha1"
"github.com/lyft/flytepropeller/pkg/controller/config"
"github.com/lyft/flytepropeller/pkg/controller/workflowstore"
"github.com/lyft/flytestdlib/logger"
"github.com/lyft/flytestdlib/promutils"
"github.com/prometheus/client_golang/prometheus"
"github.com/lyft/flytepropeller/pkg/controller/executors"
)
// TODO Lets move everything to use controller runtime
type propellerMetrics struct {
	// Scope is the parent metric scope all round metrics were registered under.
	Scope promutils.Scope
	// DeepCopyTime measures how long it takes to deep-copy the workflow CRD each round.
	DeepCopyTime promutils.StopWatch
	// RawWorkflowTraversalTime measures a single traversal of the workflow graph.
	RawWorkflowTraversalTime labeled.StopWatch
	// SystemError counts failed reconciliation rounds attributed to system errors.
	SystemError labeled.Counter
	// AbortError counts failures while aborting a workflow.
	AbortError labeled.Counter
	// PanicObserved counts recovered panics during handling or aborting a workflow.
	PanicObserved labeled.Counter
	// RoundSkipped counts rounds skipped because the cached workflow was stale.
	RoundSkipped prometheus.Counter
	// WorkflowNotFound counts lookups where the workflow was absent from the cache.
	WorkflowNotFound prometheus.Counter
	// StreakLength records the number of consecutive fast-follow rounds per Handle call.
	StreakLength labeled.Counter
	// RoundTime measures one full round: traversal, copy, and store of a workflow.
	RoundTime labeled.StopWatch
}
// newPropellerMetrics registers and returns the full set of propeller metrics
// under a "round" sub-scope of the supplied parent scope.
func newPropellerMetrics(scope promutils.Scope) *propellerMetrics {
	sub := scope.NewSubScope("round")
	m := &propellerMetrics{Scope: scope}
	m.DeepCopyTime = sub.MustNewStopWatch("deepcopy", "Total time to deep copy wf object", time.Millisecond)
	m.RawWorkflowTraversalTime = labeled.NewStopWatch("raw", "Total time to traverse the workflow", time.Millisecond, sub, labeled.EmitUnlabeledMetric)
	m.SystemError = labeled.NewCounter("system_error", "Failure to reconcile a workflow, system error", sub, labeled.EmitUnlabeledMetric)
	m.AbortError = labeled.NewCounter("abort_error", "Failure to abort a workflow, system error", sub, labeled.EmitUnlabeledMetric)
	m.PanicObserved = labeled.NewCounter("panic", "Panic during handling or aborting workflow", sub, labeled.EmitUnlabeledMetric)
	m.RoundSkipped = sub.MustNewCounter("skipped", "Round Skipped because of stale workflow")
	m.WorkflowNotFound = sub.MustNewCounter("not_found", "workflow not found in the cache")
	m.StreakLength = labeled.NewCounter("streak_length", "Number of consecutive rounds used in fast follow mode", sub, labeled.EmitUnlabeledMetric)
	m.RoundTime = labeled.NewStopWatch("round_time", "Total time taken by one round traversing, copying and storing a workflow", time.Millisecond, sub, labeled.EmitUnlabeledMetric)
	return m
}
// RecordSystemError returns a deep copy of w whose failed-attempt counter is
// incremented and whose status message is set to err's text. The copy discards
// any partial mutations made to the CRD during the failed round; the input
// workflow itself is never modified.
func RecordSystemError(w *v1alpha1.FlyteWorkflow, err error) *v1alpha1.FlyteWorkflow {
	copied := w.DeepCopy()
	status := copied.GetExecutionStatus()
	status.IncFailedAttempts()
	status.SetMessage(err.Error())
	return copied
}
// Propeller is the top-level handler that reconciles FlyteWorkflow CRDs:
// it fetches workflows from the store, drives them through the workflow
// executor, and writes status updates back.
type Propeller struct {
	// wfStore fetches and updates FlyteWorkflow resources.
	wfStore workflowstore.FlyteWorkflow
	// workflowExecutor performs the actual reconciliation/abort of a workflow.
	workflowExecutor executors.Workflow
	// metrics holds all round-level prometheus metrics.
	metrics *propellerMetrics
	// cfg provides tunables such as MaxWorkflowRetries and MaxStreakLength.
	cfg *config.Config
}
// Initialize prepares the underlying workflow executor; it must be called
// before Handle is invoked.
func (p *Propeller) Initialize(ctx context.Context) error {
	return p.workflowExecutor.Initialize(ctx)
}
// TryMutateWorkflow runs one reconciliation pass over a deep copy of
// originalW and returns the mutated copy. The input workflow is never
// modified. Deleted workflows and workflows that exceeded the retry budget
// are aborted instead of reconciled. Panics raised by the executor are
// recovered and surfaced as errors so a single bad workflow cannot kill the
// controller.
func (p *Propeller) TryMutateWorkflow(ctx context.Context, originalW *v1alpha1.FlyteWorkflow) (*v1alpha1.FlyteWorkflow, error) {

	t := p.metrics.DeepCopyTime.Start()
	mutableW := originalW.DeepCopy()
	t.Stop()

	// Enrich the context so all downstream logs/metrics carry workflow identity.
	ctx = contextutils.WithWorkflowID(ctx, mutableW.GetID())
	if execID := mutableW.GetExecutionID(); execID.WorkflowExecutionIdentifier != nil {
		ctx = contextutils.WithProjectDomain(ctx, mutableW.GetExecutionID().Project, mutableW.GetExecutionID().Domain)
	}
	ctx = contextutils.WithResourceVersion(ctx, mutableW.GetResourceVersion())

	maxRetries := uint32(p.cfg.MaxWorkflowRetries)
	if IsDeleted(mutableW) || (mutableW.Status.FailedAttempts > maxRetries) {
		var err error
		func() {
			defer func() {
				if r := recover(); r != nil {
					stack := debug.Stack()
					err = fmt.Errorf("panic when aborting workflow, Stack: [%s]", string(stack))
					// Use an explicit format verb: the stack trace may itself
					// contain '%' characters that would corrupt the log line.
					logger.Errorf(ctx, "%v", err)
					p.metrics.PanicObserved.Inc(ctx)
				}
			}()
			err = p.workflowExecutor.HandleAbortedWorkflow(ctx, mutableW, maxRetries)
		}()
		if err != nil {
			p.metrics.AbortError.Inc(ctx)
			return nil, err
		}
		return mutableW, nil
	}

	if !mutableW.GetExecutionStatus().IsTerminated() {
		var err error
		SetFinalizerIfEmpty(mutableW, FinalizerKey)

		func() {
			t := p.metrics.RawWorkflowTraversalTime.Start(ctx)
			defer func() {
				t.Stop()
				if r := recover(); r != nil {
					stack := debug.Stack()
					err = fmt.Errorf("panic when reconciling workflow, Stack: [%s]", string(stack))
					logger.Errorf(ctx, "%v", err)
					p.metrics.PanicObserved.Inc(ctx)
				}
			}()
			err = p.workflowExecutor.HandleFlyteWorkflow(ctx, mutableW)
		}()

		if err != nil {
			// BUGFIX: the original format string had three %v verbs
			// ("... Is nill [%v]") but only two arguments, producing
			// %!v(MISSING) in logs; the dangling verb is removed.
			logger.Errorf(ctx, "Error when trying to reconcile workflow. Error [%v]. Error Type[%v]",
				err, reflect.TypeOf(err))
			p.metrics.SystemError.Inc(ctx)
			return nil, err
		}
	} else {
		logger.Warn(ctx, "Workflow is marked as terminated but doesn't have the completed label, marking it as completed.")
	}
	return mutableW, nil
}
// reconciler compares the actual state with the desired, and attempts to
// converge the two. It then updates the GetExecutionStatus block of the FlyteWorkflow resource
// with the current status of the resource.
// Every FlyteWorkflow transitions through the following
//
// The Workflow to be worked on is identified for the given namespace and executionID (which is the name of the workflow)
// The return value should be an error only in the case where we wish to retry this workflow
// <pre>
//
// +--------+ +---------+ +------------+ +---------+
// | | | | | | | |
// | Ready +--------> Running +--------> Succeeding +-----> Success |
// | | | | | | | |
// +--------+ +---------+ +------------+ +---------+
// | |
// | |
// | +----v----+ +---------------------+ +--------+
// | | | | (optional) | | |
// +-------------> Failing +--------> HandlingFailureNode +--------> Failed |
// | | | | | |
// +---------+ +---------------------+ +--------+
// </pre>
// Handle is the reconciliation entry point for one workflow, identified by
// namespace/name. It fetches the workflow from the store, runs up to
// cfg.MaxStreakLength mutation rounds (fast-follow mode), and persists status
// changes back through the store. Not-found and stale workflows are silently
// skipped; a returned error signals the caller to retry with back-off.
func (p *Propeller) Handle(ctx context.Context, namespace, name string) error {
	logger.Infof(ctx, "Processing Workflow.")
	defer logger.Infof(ctx, "Completed processing workflow.")

	// Get the FlyteWorkflow resource with this namespace/name
	w, fetchErr := p.wfStore.Get(ctx, namespace, name)
	if fetchErr != nil {
		if workflowstore.IsNotFound(fetchErr) {
			p.metrics.WorkflowNotFound.Inc()
			logger.Warningf(ctx, "Workflow namespace[%v]/name[%v] not found, may be deleted.", namespace, name)
			return nil
		}
		if workflowstore.IsWorkflowStale(fetchErr) {
			p.metrics.RoundSkipped.Inc()
			logger.Warningf(ctx, "Workflow namespace[%v]/name[%v] Stale.", namespace, name)
			return nil
		}
		// BUGFIX: fetchErr was previously passed without a matching format verb.
		logger.Warningf(ctx, "Failed to GetWorkflow, retrying with back-off. Error: [%v]", fetchErr)
		return fetchErr
	}

	if w.GetExecutionStatus().IsTerminated() {
		if HasCompletedLabel(w) && !HasFinalizer(w) {
			logger.Debugf(ctx, "Workflow is terminated.")
			// This workflow had previously completed, let us ignore it
			return nil
		}
	}

	streak := 0
	// BUGFIX: a direct `defer p.metrics.StreakLength.Add(ctx, float64(streak))`
	// evaluates float64(streak) at defer time (always 0). Wrapping in a closure
	// defers the read of streak until the function returns.
	defer func() {
		p.metrics.StreakLength.Add(ctx, float64(streak))
	}()

	maxLength := p.cfg.MaxStreakLength
	if maxLength <= 0 {
		maxLength = 1
	}

	for streak = 0; streak < maxLength; streak++ {
		t := p.metrics.RoundTime.Start(ctx)
		mutatedWf, err := p.TryMutateWorkflow(ctx, w)
		if err != nil {
			// NOTE We are overriding the deepcopy here, as we are essentially ignoring all mutations
			// We only want to increase failed attempts and discard any other partial changes to the CRD.
			mutatedWf = RecordSystemError(w, err)
			p.metrics.SystemError.Inc(ctx)
		} else if mutatedWf == nil {
			logger.Errorf(ctx, "Should not happen! Mutation resulted in a nil workflow!")
			return nil
		} else {
			if !w.GetExecutionStatus().IsTerminated() {
				// No updates in the status we detected, we will skip writing to KubeAPI
				if mutatedWf.Status.Equals(&w.Status) {
					logger.Info(ctx, "WF hasn't been updated in this round.")
					t.Stop()
					return nil
				}
			}
			if mutatedWf.GetExecutionStatus().IsTerminated() {
				// If the end result is a terminated workflow, we remove the labels
				// We add a completed label so that we can avoid polling for this workflow
				SetCompletedLabel(mutatedWf, time.Now())
				ResetFinalizers(mutatedWf)
			}
		}
		// TODO we will need to call updatestatus when it is supported. But to preserve metadata like (label/finalizer) we will need to use update

		// update the GetExecutionStatus block of the FlyteWorkflow resource. UpdateStatus will not
		// allow changes to the Spec of the resource, which is ideal for ensuring
		// nothing other than resource status has been updated.
		newWf, updateErr := p.wfStore.Update(ctx, mutatedWf, workflowstore.PriorityClassCritical)
		if updateErr != nil {
			t.Stop()
			return updateErr
		}
		if err != nil {
			t.Stop()
			// An error was encountered during the round. Let us return, so that we can back-off gracefully
			return err
		}
		if mutatedWf.GetExecutionStatus().IsTerminated() || newWf.ResourceVersion == mutatedWf.ResourceVersion {
			// Workflow is terminated (no need to continue) or no status was changed, we can wait
			logger.Infof(ctx, "Will not fast follow, Reason: Wf terminated? %v, Version matched? %v",
				mutatedWf.GetExecutionStatus().IsTerminated(), newWf.ResourceVersion == mutatedWf.ResourceVersion)
			t.Stop()
			return nil
		}
		logger.Infof(ctx, "FastFollow Enabled. Detected State change, we will try another round. StreakLength [%d]", streak)
		w = newWf
		t.Stop()
	}
	logger.Infof(ctx, "Streak ended at [%d]/Max: [%d]", streak, maxLength)
	return nil
}
// NewPropellerHandler constructs a Propeller wired to the given workflow
// store, executor and configuration, registering its metrics under the
// supplied scope. The context parameter is currently unused.
func NewPropellerHandler(_ context.Context, cfg *config.Config, wfStore workflowstore.FlyteWorkflow, executor executors.Workflow, scope promutils.Scope) *Propeller {
	p := &Propeller{
		wfStore:          wfStore,
		workflowExecutor: executor,
		cfg:              cfg,
	}
	p.metrics = newPropellerMetrics(scope)
	return p
}