diff --git a/README.md b/README.md
index d4bded64..e26a03f5 100644
--- a/README.md
+++ b/README.md
@@ -54,10 +54,11 @@ A Framework represents an application with a set of Tasks:
2. Partitioned to different heterogeneous TaskRoles which share the same lifecycle
3. Ordered in the same homogeneous TaskRole by TaskIndex
4. With consistent identity {FrameworkName}-{TaskRoleName}-{TaskIndex} as PodName
-5. With fine grained [RetryPolicy](doc/user-manual.md#RetryPolicy) for each Task and the whole Framework
-6. With fine grained [FrameworkAttemptCompletionPolicy](doc/user-manual.md#FrameworkAttemptCompletionPolicy) for each TaskRole
-7. With PodGracefulDeletionTimeoutSec for each Task to [tune Consistency vs Availability](doc/user-manual.md#FrameworkConsistencyAvailability)
-8. With fine grained [Status](pkg/apis/frameworkcontroller/v1/types.go) for each TaskAttempt/Task, each TaskRole and the whole FrameworkAttempt/Framework
+5. With fine grained [ExecutionType](doc/user-manual.md#FrameworkExecutionType) to Start/Stop the whole Framework
+6. With fine grained [RetryPolicy](doc/user-manual.md#RetryPolicy) for each Task and the whole Framework
+7. With fine grained [FrameworkAttemptCompletionPolicy](doc/user-manual.md#FrameworkAttemptCompletionPolicy) for each TaskRole
+8. With PodGracefulDeletionTimeoutSec for each Task to [tune Consistency vs Availability](doc/user-manual.md#FrameworkConsistencyAvailability)
+9. With fine grained [Status](pkg/apis/frameworkcontroller/v1/types.go) for each TaskAttempt/Task, each TaskRole and the whole FrameworkAttempt/Framework
### Controller Feature
1. Highly generalized as it is built for all kinds of applications
diff --git a/doc/user-manual.md b/doc/user-manual.md
index f2daec3e..6538a23b 100644
--- a/doc/user-manual.md
+++ b/doc/user-manual.md
@@ -2,6 +2,7 @@
## Index
- [Framework Interop](#FrameworkInterop)
+ - [Framework ExecutionType](#FrameworkExecutionType)
- [Container EnvironmentVariable](#ContainerEnvironmentVariable)
- [Pod Failure Classification](#PodFailureClassification)
- [Predefined CompletionCode](#PredefinedCompletionCode)
@@ -38,7 +39,7 @@ As Framework is actually a [Kubernetes CRD](https://kubernetes.io/docs/concepts/
### Supported Interoperation
| API Kind | Operations |
|:---- |:---- |
-| Framework | [CREATE](#CREATE_Framework) [DELETE](#DELETE_Framework) [GET](#GET_Framework) [LIST](#LIST_Frameworks) [WATCH](#WATCH_Framework) [WATCH_LIST](#WATCH_LIST_Frameworks)
[PATCH](#PATCH_Framework) ([Stop](#Stop_Framework), [Add TaskRole](#Add_TaskRole), [Delete TaskRole](#Delete_TaskRole), [Add/Delete Task](#Add_Delete_Task)) |
+| Framework | [CREATE](#CREATE_Framework) [DELETE](#DELETE_Framework) [GET](#GET_Framework) [LIST](#LIST_Frameworks) [WATCH](#WATCH_Framework) [WATCH_LIST](#WATCH_LIST_Frameworks)
[PATCH](#PATCH_Framework) ([Start](#Start_Framework), [Stop](#Stop_Framework), [Add TaskRole](#Add_TaskRole), [Delete TaskRole](#Delete_TaskRole), [Add/Delete Task](#Add_Delete_Task)) |
| [ConfigMap](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#configmap-v1-core) | All operations except for [CREATE](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#create-configmap-v1-core) [PUT](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#replace-configmap-v1-core) [PATCH](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#patch-configmap-v1-core) |
| [Pod](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#pod-v1-core) | All operations except for [CREATE](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#create-pod-v1-core) [PUT](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#replace-pod-v1-core) [PATCH](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#patch-pod-v1-core) |
@@ -55,6 +56,8 @@ Type: application/json or application/yaml
Create the specified Framework.
+Any [ExecutionType](#FrameworkExecutionType) can be specified to create the Framework.
+
**Response**
| Code | Body | Description |
@@ -65,6 +68,38 @@ Create the specified Framework.
| Conflict(409) | [Status](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#status-v1-meta) | The specified Framework already exists. |
#### PATCH Framework
+##### Start Framework
+**Request**
+
+ PATCH /apis/frameworkcontroller.microsoft.com/v1/namespaces/{FrameworkNamespace}/frameworks/{FrameworkName}
+
+Body:
+
+```json
+[
+ {
+ "op": "replace",
+ "path": "/spec/executionType",
+ "value": "Start"
+ }
+]
+```
+
+Type: application/json-patch+json
+
+**Description**
+
+Start the specified Framework whose [ExecutionType](#FrameworkExecutionType) should be `Create`.
+
+Before the Start, the Framework will not start to run or complete, but the object of the Framework is created, see [Framework PreStart Example](#FrameworkExecutionTypePreStartExample).
+
+**Response**
+
+| Code | Body | Description |
+|:---- |:---- |:---- |
+| OK(200) | [Framework](../pkg/apis/frameworkcontroller/v1/types.go) | Return current Framework. |
+| NotFound(404) | [Status](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#status-v1-meta) | The specified Framework is not found. |
+
##### Stop Framework
**Request**
@@ -86,9 +121,9 @@ Type: application/json-patch+json
**Description**
-Stop the specified Framework:
+Stop the specified Framework whose [ExecutionType](#FrameworkExecutionType) should be `Create` or `Start`.
-All running containers of the Framework will be stopped while the object of the Framework is still kept.
+After the Stop, the Framework will start to complete, but the object of the Framework will not be deleted, see [Framework PostStop Example](#FrameworkExecutionTypePostStopExample).
**Response**
@@ -346,6 +381,100 @@ Watch the change events of all Frameworks (in the specified FrameworkNamespace).
|:---- |:---- |:---- |
| OK(200) | [WatchEvent](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#watchevent-v1-meta) | Streaming the change events of all Frameworks (in the specified FrameworkNamespace). |
+## Framework ExecutionType
+Framework [ExecutionType](../pkg/apis/frameworkcontroller/v1/types.go) can be specified to control the execution of the Framework:
+1. You can just [Create Framework](#CREATE_Framework) with `Create` ExecutionType, which does not also start it at the same time.
+ - This is useful when you need to do some PreStart actions that depend on the Framework object, see [Framework PreStart Example](#FrameworkExecutionTypePreStartExample). Once these actions are done, you can safely [Start Framework](#Start_Framework).
+2. You can just [Stop Framework](#Stop_Framework), which does not also delete it at the same time.
+ - This is useful when you need to do some PostStop actions that depend on the Framework object, see [Framework PostStop Example](#FrameworkExecutionTypePostStopExample). Once these actions are done, you can safely [Delete Framework](#DELETE_Framework).
+
+### Example
+#### Framework PreStart Example
+In this example, you need to run a [Framework which depends on a ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account), but the ServiceAccount in turn needs the Framework object to exist first so that it can reference it in its [OwnerReferences](https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/#owners-and-dependents), so you cannot directly [Create Framework](#CREATE_Framework) with ExecutionType `Start`.
+1. [Create Framework](#CREATE_Framework) with `Create` ExecutionType and a ServiceAccount reference as below, then the Framework will stay as AttemptCreationPending:
+```yaml
+apiVersion: frameworkcontroller.microsoft.com/v1
+kind: Framework
+metadata:
+ name: prestart
+spec:
+ executionType: Create
+ retryPolicy:
+ fancyRetryPolicy: false
+ maxRetryCount: 0
+ taskRoles:
+ - name: a
+ taskNumber: 4
+ frameworkAttemptCompletionPolicy:
+ minFailedTaskCount: 4
+ minSucceededTaskCount: 1
+ task:
+ retryPolicy:
+ fancyRetryPolicy: false
+ maxRetryCount: 0
+ podGracefulDeletionTimeoutSec: 600
+ pod:
+ spec:
+ restartPolicy: Never
+ serviceAccountName: prestart
+ containers:
+ - name: ubuntu
+ image: ubuntu:trusty
+ command: ["sh", "-c", "printenv && sleep infinity"]
+```
+2. Use the `metadata.uid` from the above creation response to replace the {{FrameworkUID}} below, and [Create ServiceAccount](https://v1-14.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#create-serviceaccount-v1-core) with a reference to the above Framework as below:
+```yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: prestart
+ ownerReferences:
+ - apiVersion: frameworkcontroller.microsoft.com/v1
+ kind: Framework
+ name: prestart
+ uid: {{FrameworkUID}}
+ controller: true
+ blockOwnerDeletion: true
+```
+3. [Start Framework](#Start_Framework), then the Framework will start to run successfully.
+4. [Delete Framework](#DELETE_Framework), then both the Framework and above ServiceAccount will be deleted.
+
+#### Framework PostStop Example
+In this example, you need to stop a Framework whose final stopped Framework object needs to be [pushed to/pulled by external systems](#FrameworkPodHistory), so you cannot directly [Delete Framework](#DELETE_Framework).
+1. [Create Framework](#CREATE_Framework) as below:
+```yaml
+apiVersion: frameworkcontroller.microsoft.com/v1
+kind: Framework
+metadata:
+ name: poststop
+spec:
+ executionType: Start
+ retryPolicy:
+ fancyRetryPolicy: false
+ maxRetryCount: 0
+ taskRoles:
+ - name: a
+ taskNumber: 4
+ frameworkAttemptCompletionPolicy:
+ minFailedTaskCount: 4
+ minSucceededTaskCount: 1
+ task:
+ retryPolicy:
+ fancyRetryPolicy: false
+ maxRetryCount: 0
+ podGracefulDeletionTimeoutSec: 600
+ pod:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: ubuntu
+ image: ubuntu:trusty
+ command: ["sh", "-c", "printenv && sleep infinity"]
+```
+2. [Stop Framework](#Stop_Framework), then the Framework will be stopped, i.e. FrameworkCompleted.
+3. [Get Framework](#GET_Framework), and archive it into a DataBase first.
+4. [Delete Framework](#DELETE_Framework), then the Framework will be deleted.
+
## Container EnvironmentVariable
[Container EnvironmentVariable](../pkg/apis/frameworkcontroller/v1/constants.go)
@@ -713,7 +842,7 @@ Besides these general [Framework ConsistencyGuarantees](#ConsistencyGuarantees),
To safely run large scale Framework, i.e. the total task number in a single Framework is greater than 300, you just need to enable the [LargeFrameworkCompression](../pkg/apis/frameworkcontroller/v1/config.go). However, you may also need to decompress the Framework by yourself.
## Framework and Pod History
-By leveraging the [LogObjectSnapshot](../pkg/apis/frameworkcontroller/v1/config.go), external systems, such as [Fluentd](https://www.fluentd.org) and [ElasticSearch](https://www.elastic.co/products/elasticsearch), can collect and process Framework and Pod history snapshots even if it was retried or deleted, such as persistence, metrics conversion, visualization, alerting, acting, analysis, etc.
+By leveraging the [LogObjectSnapshot](../pkg/apis/frameworkcontroller/v1/config.go), external systems, such as [Fluentd](https://www.fluentd.org) and [ElasticSearch](https://www.elastic.co/products/elasticsearch), can collect and process Framework and Pod history snapshots even if they were retried or deleted, such as for persistence, metrics conversion, visualization, alerting, acting, analysis, etc.
## Framework and Task State Machine
### Framework State Machine
diff --git a/example/framework/extension/frameworkbarrier.yaml b/example/framework/extension/frameworkbarrier.yaml
index 33b96c2e..6926a6d0 100644
--- a/example/framework/extension/frameworkbarrier.yaml
+++ b/example/framework/extension/frameworkbarrier.yaml
@@ -59,7 +59,7 @@ spec:
- name: frameworkbarrier-volume
mountPath: /mnt/frameworkbarrier
# [PREREQUISITE]
- # User needs to create a service account in the same namespace of this
+ # User needs to create a ServiceAccount in the same namespace of this
# Framework with granted permission for frameworkbarrier, if the k8s
# cluster enforces authorization.
# For example, if the cluster enforces RBAC:
diff --git a/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml b/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml
index e10e000e..f8197c0a 100644
--- a/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml
+++ b/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml
@@ -69,7 +69,7 @@ spec:
- name: data-volume
mountPath: /mnt/data
# [PREREQUISITE]
- # User needs to create a service account for frameworkbarrier, if the
+ # User needs to create a ServiceAccount for frameworkbarrier, if the
# k8s cluster enforces authorization.
# See more in ./example/framework/extension/frameworkbarrier.yaml
serviceAccountName: frameworkbarrier
diff --git a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
index b5a0e50c..8d20d594 100644
--- a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
+++ b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
@@ -75,7 +75,7 @@ spec:
- name: data-volume
mountPath: /mnt/data
# [PREREQUISITE]
- # User needs to create a service account for frameworkbarrier, if the
+ # User needs to create a ServiceAccount for frameworkbarrier, if the
# k8s cluster enforces authorization.
# See more in ./example/framework/extension/frameworkbarrier.yaml
serviceAccountName: frameworkbarrier
diff --git a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
index 2bb76d6e..9749168d 100644
--- a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
+++ b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
@@ -95,7 +95,7 @@ spec:
- name: data-volume
mountPath: /mnt/data
# [PREREQUISITE]
- # User needs to create a service account for frameworkbarrier, if the
+ # User needs to create a ServiceAccount for frameworkbarrier, if the
# k8s cluster enforces authorization.
# See more in ./example/framework/extension/frameworkbarrier.yaml
serviceAccountName: frameworkbarrier
diff --git a/example/run/README.md b/example/run/README.md
index 3af894d5..c6b58d67 100644
--- a/example/run/README.md
+++ b/example/run/README.md
@@ -16,7 +16,7 @@ Notes:
### Prerequisite
-If the k8s cluster enforces [Authorization](https://kubernetes.io/docs/reference/access-authn-authz/authorization/#using-flags-for-your-authorization-module), you need to first create a [Service Account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account) with granted permission for FrameworkController. For example, if the cluster enforces [RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/#kubectl-create-clusterrolebinding):
+If the k8s cluster enforces [Authorization](https://kubernetes.io/docs/reference/access-authn-authz/authorization/#using-flags-for-your-authorization-module), you need to first create a [ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account) with granted permission for FrameworkController. For example, if the cluster enforces [RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/#kubectl-create-clusterrolebinding):
```shell
kubectl create serviceaccount frameworkcontroller --namespace default
kubectl create clusterrolebinding frameworkcontroller \
@@ -26,7 +26,7 @@ kubectl create clusterrolebinding frameworkcontroller \
### Run
-Run FrameworkController with above Service Account and the [k8s inClusterConfig](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod):
+Run FrameworkController with above ServiceAccount and the [k8s inClusterConfig](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod):
#### Run with [default config](../../example/config/default/frameworkcontroller.yaml)
```shell
@@ -51,7 +51,7 @@ spec:
labels:
app: frameworkcontroller
spec:
- # Using the service account with granted permission
+ # Using the ServiceAccount with granted permission
# if the k8s cluster enforces authorization.
serviceAccountName: frameworkcontroller
containers:
@@ -115,7 +115,7 @@ spec:
labels:
app: frameworkcontroller
spec:
- # Using the service account with granted permission
+ # Using the ServiceAccount with granted permission
# if the k8s cluster enforces authorization.
serviceAccountName: frameworkcontroller
containers:
@@ -133,8 +133,8 @@ spec:
"cp /frameworkcontroller-config/frameworkcontroller.yaml . &&
./start.sh"]
volumeMounts:
- - name: frameworkcontroller-config
- mountPath: /frameworkcontroller-config
+ - name: frameworkcontroller-config
+ mountPath: /frameworkcontroller-config
volumes:
- name: frameworkcontroller-config
configMap:
diff --git a/pkg/apis/frameworkcontroller/v1/crd.go b/pkg/apis/frameworkcontroller/v1/crd.go
index 0a71edb1..0b2d3dee 100644
--- a/pkg/apis/frameworkcontroller/v1/crd.go
+++ b/pkg/apis/frameworkcontroller/v1/crd.go
@@ -74,6 +74,7 @@ func buildFrameworkValidation() *apiExtensions.CustomResourceValidation {
Properties: map[string]apiExtensions.JSONSchemaProps{
"executionType": {
Enum: []apiExtensions.JSON{
+ {Raw: []byte(common.Quote(string(ExecutionCreate)))},
{Raw: []byte(common.Quote(string(ExecutionStart)))},
{Raw: []byte(common.Quote(string(ExecutionStop)))},
},
diff --git a/pkg/apis/frameworkcontroller/v1/types.go b/pkg/apis/frameworkcontroller/v1/types.go
index e5adf44e..e4df46c6 100644
--- a/pkg/apis/frameworkcontroller/v1/types.go
+++ b/pkg/apis/frameworkcontroller/v1/types.go
@@ -70,8 +70,7 @@ type Framework struct {
// Spec
//////////////////////////////////////////////////////////////////////////////////////////////////
type FrameworkSpec struct {
- Description string `json:"description"`
- // Only support to update from ExecutionStart to ExecutionStop
+ Description string `json:"description"`
ExecutionType ExecutionType `json:"executionType"`
RetryPolicy RetryPolicySpec `json:"retryPolicy"`
TaskRoles []*TaskRoleSpec `json:"taskRoles"`
@@ -115,11 +114,23 @@ type TaskSpec struct {
Pod core.PodTemplateSpec `json:"pod"`
}
+// User can set any ExecutionType when creating a Framework, and can then choose
+// to change the ExecutionType or not.
+// However, only below changes are supported:
+// 1. ExecutionCreate -> ExecutionStart/ExecutionStop
+// 2. ExecutionStart -> ExecutionStop
type ExecutionType string
const (
+ // The Framework will be kept in FrameworkAttemptCreationPending.
+ // So it will never start to run or complete.
+ ExecutionCreate ExecutionType = "Create"
+ // The Framework will be transitioned from FrameworkAttemptCreationPending.
+ // So it will immediately start to run.
ExecutionStart ExecutionType = "Start"
- ExecutionStop ExecutionType = "Stop"
+ // The Framework will be transitioned to FrameworkCompleted.
+ // So it will immediately start to complete.
+ ExecutionStop ExecutionType = "Stop"
)
// RetryPolicySpec can be configured for the whole Framework and each TaskRole
diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go
index 170c72dc..bd35a0ac 100644
--- a/pkg/controller/controller.go
+++ b/pkg/controller/controller.go
@@ -1214,6 +1214,12 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) {
return nil
}
+ if f.Spec.ExecutionType == ci.ExecutionCreate {
+ klog.Infof(logPfx + "Skip to createFrameworkAttempt: " +
+ "User has requested to just create the Framework without starting it")
+ return nil
+ }
+
if f.Spec.ExecutionType == ci.ExecutionStop {
diag := "User has requested to stop the Framework"
klog.Info(logPfx + diag)