From 79001343ea51df77b5351c2eee89d43ea583dc65 Mon Sep 17 00:00:00 2001 From: Yuqi Wang Date: Fri, 7 Aug 2020 16:58:34 +0800 Subject: [PATCH] Support Create ExecutionType: Just create without start --- README.md | 9 +- doc/user-manual.md | 137 +++++++++++++++++- .../framework/extension/frameworkbarrier.yaml | 2 +- .../tensorflowdistributedtrainingwithcpu.yaml | 2 +- ...ibutedtrainingwithdefaultscheduledgpu.yaml | 2 +- ...tributedtrainingwithhivedscheduledgpu.yaml | 2 +- example/run/README.md | 12 +- pkg/apis/frameworkcontroller/v1/crd.go | 1 + pkg/apis/frameworkcontroller/v1/types.go | 17 ++- pkg/controller/controller.go | 6 + 10 files changed, 169 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index d4bded64..e26a03f5 100644 --- a/README.md +++ b/README.md @@ -54,10 +54,11 @@ A Framework represents an application with a set of Tasks: 2. Partitioned to different heterogeneous TaskRoles which share the same lifecycle 3. Ordered in the same homogeneous TaskRole by TaskIndex 4. With consistent identity {FrameworkName}-{TaskRoleName}-{TaskIndex} as PodName -5. With fine grained [RetryPolicy](doc/user-manual.md#RetryPolicy) for each Task and the whole Framework -6. With fine grained [FrameworkAttemptCompletionPolicy](doc/user-manual.md#FrameworkAttemptCompletionPolicy) for each TaskRole -7. With PodGracefulDeletionTimeoutSec for each Task to [tune Consistency vs Availability](doc/user-manual.md#FrameworkConsistencyAvailability) -8. With fine grained [Status](pkg/apis/frameworkcontroller/v1/types.go) for each TaskAttempt/Task, each TaskRole and the whole FrameworkAttempt/Framework +5. With fine grained [ExecutionType](doc/user-manual.md#FrameworkExecutionType) to Start/Stop the whole Framework +6. With fine grained [RetryPolicy](doc/user-manual.md#RetryPolicy) for each Task and the whole Framework +7. With fine grained [FrameworkAttemptCompletionPolicy](doc/user-manual.md#FrameworkAttemptCompletionPolicy) for each TaskRole +8. 
With PodGracefulDeletionTimeoutSec for each Task to [tune Consistency vs Availability](doc/user-manual.md#FrameworkConsistencyAvailability) +9. With fine grained [Status](pkg/apis/frameworkcontroller/v1/types.go) for each TaskAttempt/Task, each TaskRole and the whole FrameworkAttempt/Framework ### Controller Feature 1. Highly generalized as it is built for all kinds of applications diff --git a/doc/user-manual.md b/doc/user-manual.md index f2daec3e..6538a23b 100644 --- a/doc/user-manual.md +++ b/doc/user-manual.md @@ -2,6 +2,7 @@ ## Index - [Framework Interop](#FrameworkInterop) + - [Framework ExecutionType](#FrameworkExecutionType) - [Container EnvironmentVariable](#ContainerEnvironmentVariable) - [Pod Failure Classification](#PodFailureClassification) - [Predefined CompletionCode](#PredefinedCompletionCode) @@ -38,7 +39,7 @@ As Framework is actually a [Kubernetes CRD](https://kubernetes.io/docs/concepts/ ### Supported Interoperation | API Kind | Operations | |:---- |:---- | -| Framework | [CREATE](#CREATE_Framework) [DELETE](#DELETE_Framework) [GET](#GET_Framework) [LIST](#LIST_Frameworks) [WATCH](#WATCH_Framework) [WATCH_LIST](#WATCH_LIST_Frameworks)
[PATCH](#PATCH_Framework) ([Stop](#Stop_Framework), [Add TaskRole](#Add_TaskRole), [Delete TaskRole](#Delete_TaskRole), [Add/Delete Task](#Add_Delete_Task)) | +| Framework | [CREATE](#CREATE_Framework) [DELETE](#DELETE_Framework) [GET](#GET_Framework) [LIST](#LIST_Frameworks) [WATCH](#WATCH_Framework) [WATCH_LIST](#WATCH_LIST_Frameworks)
[PATCH](#PATCH_Framework) ([Start](#Start_Framework), [Stop](#Stop_Framework), [Add TaskRole](#Add_TaskRole), [Delete TaskRole](#Delete_TaskRole), [Add/Delete Task](#Add_Delete_Task)) | | [ConfigMap](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#configmap-v1-core) | All operations except for [CREATE](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#create-configmap-v1-core) [PUT](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#replace-configmap-v1-core) [PATCH](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#patch-configmap-v1-core) | | [Pod](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#pod-v1-core) | All operations except for [CREATE](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#create-pod-v1-core) [PUT](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#replace-pod-v1-core) [PATCH](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#patch-pod-v1-core) | @@ -55,6 +56,8 @@ Type: application/json or application/yaml Create the specified Framework. +Any [ExecutionType](#FrameworkExecutionType) can be specified to create the Framework. + **Response** | Code | Body | Description | @@ -65,6 +68,38 @@ Create the specified Framework. | Conflict(409) | [Status](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#status-v1-meta) | The specified Framework already exists. | #### PATCH Framework +##### Start Framework +**Request** + + PATCH /apis/frameworkcontroller.microsoft.com/v1/namespaces/{FrameworkNamespace}/frameworks/{FrameworkName} + +Body: + +```json +[ + { + "op": "replace", + "path": "/spec/executionType", + "value": "Start" + } +] +``` + +Type: application/json-patch+json + +**Description** + +Start the specified Framework whose [ExecutionType](#FrameworkExecutionType) should be `Create`. 
+ +Before the Start, the Framework will not start to run or complete, but the object of the Framework is created; see [Framework PreStart Example](#FrameworkExecutionTypePreStartExample). + +**Response** + +| Code | Body | Description | +|:---- |:---- |:---- | +| OK(200) | [Framework](../pkg/apis/frameworkcontroller/v1/types.go) | Return current Framework. | +| NotFound(404) | [Status](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#status-v1-meta) | The specified Framework is not found. | + ##### Stop Framework **Request** @@ -86,9 +121,9 @@ Type: application/json-patch+json **Description** -Stop the specified Framework: +Stop the specified Framework whose [ExecutionType](#FrameworkExecutionType) should be `Create` or `Start`. -All running containers of the Framework will be stopped while the object of the Framework is still kept. +After the Stop, the Framework will start to complete, but the object of the Framework will not be deleted; see [Framework PostStop Example](#FrameworkExecutionTypePostStopExample). **Response** @@ -346,6 +381,100 @@ Watch the change events of all Frameworks (in the specified FrameworkNamespace). |:---- |:---- |:---- | | OK(200) | [WatchEvent](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#watchevent-v1-meta) | Streaming the change events of all Frameworks (in the specified FrameworkNamespace). | +## Framework ExecutionType +Framework [ExecutionType](../pkg/apis/frameworkcontroller/v1/types.go) can be specified to control the execution of the Framework: +1. You can just [Create Framework](#CREATE_Framework) with the `Create` ExecutionType, which creates it without starting it at the same time. + - This is useful when you need to do some PreStart actions that depend on the Framework object; see [Framework PreStart Example](#FrameworkExecutionTypePreStartExample). Once these actions are done, you can safely [Start Framework](#Start_Framework). +2. 
You can just [Stop Framework](#Stop_Framework), which stops it without deleting it at the same time. + - This is useful when you need to do some PostStop actions that depend on the Framework object; see [Framework PostStop Example](#FrameworkExecutionTypePostStopExample). Once these actions are done, you can safely [Delete Framework](#DELETE_Framework). + +### Example +#### Framework PreStart Example +In this example, you need to run a [Framework which depends on a ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account), but the ServiceAccount also depends on the Framework object to be its [OwnerReferences](https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/#owners-and-dependents), so you cannot directly [Create Framework](#CREATE_Framework) with ExecutionType `Start`. +1. [Create Framework](#CREATE_Framework) with the `Create` ExecutionType and a ServiceAccount reference as below; the Framework will then stay as AttemptCreationPending: +```yaml +apiVersion: frameworkcontroller.microsoft.com/v1 +kind: Framework +metadata: + name: prestart +spec: + executionType: Create + retryPolicy: + fancyRetryPolicy: false + maxRetryCount: 0 + taskRoles: + - name: a + taskNumber: 4 + frameworkAttemptCompletionPolicy: + minFailedTaskCount: 4 + minSucceededTaskCount: 1 + task: + retryPolicy: + fancyRetryPolicy: false + maxRetryCount: 0 + podGracefulDeletionTimeoutSec: 600 + pod: + spec: + restartPolicy: Never + serviceAccountName: prestart + containers: + - name: ubuntu + image: ubuntu:trusty + command: ["sh", "-c", "printenv && sleep infinity"] +``` +2. 
Use the `metadata.uid` from the above creation response to replace the {{FrameworkUID}} below, and [Create ServiceAccount](https://v1-14.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.14/#create-serviceaccount-v1-core) with the above Framework reference as below: +```yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prestart + ownerReferences: + - apiVersion: frameworkcontroller.microsoft.com/v1 + kind: Framework + name: prestart + uid: {{FrameworkUID}} + controller: true + blockOwnerDeletion: true +``` +3. [Start Framework](#Start_Framework), then the Framework will start to run successfully. +4. [Delete Framework](#DELETE_Framework), then both the Framework and the above ServiceAccount will be deleted. + +#### Framework PostStop Example +In this example, you need to stop a Framework whose final stopped Framework object needs to be [pushed to/pulled by external systems](#FrameworkPodHistory), so you cannot directly [Delete Framework](#DELETE_Framework). +1. [Create Framework](#CREATE_Framework) as below: +```yaml +apiVersion: frameworkcontroller.microsoft.com/v1 +kind: Framework +metadata: + name: poststop +spec: + executionType: Start + retryPolicy: + fancyRetryPolicy: false + maxRetryCount: 0 + taskRoles: + - name: a + taskNumber: 4 + frameworkAttemptCompletionPolicy: + minFailedTaskCount: 4 + minSucceededTaskCount: 1 + task: + retryPolicy: + fancyRetryPolicy: false + maxRetryCount: 0 + podGracefulDeletionTimeoutSec: 600 + pod: + spec: + restartPolicy: Never + containers: + - name: ubuntu + image: ubuntu:trusty + command: ["sh", "-c", "printenv && sleep infinity"] +``` +2. [Stop Framework](#Stop_Framework), then the Framework will be stopped, i.e. it becomes FrameworkCompleted. +3. [Get Framework](#GET_Framework), and archive it into a database first. +4. [Delete Framework](#DELETE_Framework), then the Framework will be deleted. 
+ ## Container EnvironmentVariable [Container EnvironmentVariable](../pkg/apis/frameworkcontroller/v1/constants.go) @@ -713,7 +842,7 @@ Besides these general [Framework ConsistencyGuarantees](#ConsistencyGuarantees), To safely run large scale Framework, i.e. the total task number in a single Framework is greater than 300, you just need to enable the [LargeFrameworkCompression](../pkg/apis/frameworkcontroller/v1/config.go). However, you may also need to decompress the Framework by yourself. ## Framework and Pod History -By leveraging the [LogObjectSnapshot](../pkg/apis/frameworkcontroller/v1/config.go), external systems, such as [Fluentd](https://www.fluentd.org) and [ElasticSearch](https://www.elastic.co/products/elasticsearch), can collect and process Framework and Pod history snapshots even if it was retried or deleted, such as persistence, metrics conversion, visualization, alerting, acting, analysis, etc. +By leveraging the [LogObjectSnapshot](../pkg/apis/frameworkcontroller/v1/config.go), external systems, such as [Fluentd](https://www.fluentd.org) and [ElasticSearch](https://www.elastic.co/products/elasticsearch), can collect and process Framework and Pod history snapshots even if they were retried or deleted, such as for persistence, metrics conversion, visualization, alerting, acting, analysis, etc. 
## Framework and Task State Machine ### Framework State Machine diff --git a/example/framework/extension/frameworkbarrier.yaml b/example/framework/extension/frameworkbarrier.yaml index 33b96c2e..6926a6d0 100644 --- a/example/framework/extension/frameworkbarrier.yaml +++ b/example/framework/extension/frameworkbarrier.yaml @@ -59,7 +59,7 @@ spec: - name: frameworkbarrier-volume mountPath: /mnt/frameworkbarrier # [PREREQUISITE] - # User needs to create a service account in the same namespace of this + # User needs to create a ServiceAccount in the same namespace of this # Framework with granted permission for frameworkbarrier, if the k8s # cluster enforces authorization. # For example, if the cluster enforces RBAC: diff --git a/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml b/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml index e10e000e..f8197c0a 100644 --- a/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml +++ b/example/framework/scenario/tensorflow/ps/cpu/tensorflowdistributedtrainingwithcpu.yaml @@ -69,7 +69,7 @@ spec: - name: data-volume mountPath: /mnt/data # [PREREQUISITE] - # User needs to create a service account for frameworkbarrier, if the + # User needs to create a ServiceAccount for frameworkbarrier, if the # k8s cluster enforces authorization. 
# See more in ./example/framework/extension/frameworkbarrier.yaml serviceAccountName: frameworkbarrier diff --git a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml index b5a0e50c..8d20d594 100644 --- a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml +++ b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml @@ -75,7 +75,7 @@ spec: - name: data-volume mountPath: /mnt/data # [PREREQUISITE] - # User needs to create a service account for frameworkbarrier, if the + # User needs to create a ServiceAccount for frameworkbarrier, if the # k8s cluster enforces authorization. # See more in ./example/framework/extension/frameworkbarrier.yaml serviceAccountName: frameworkbarrier diff --git a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml index 2bb76d6e..9749168d 100644 --- a/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml +++ b/example/framework/scenario/tensorflow/ps/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml @@ -95,7 +95,7 @@ spec: - name: data-volume mountPath: /mnt/data # [PREREQUISITE] - # User needs to create a service account for frameworkbarrier, if the + # User needs to create a ServiceAccount for frameworkbarrier, if the # k8s cluster enforces authorization. 
# See more in ./example/framework/extension/frameworkbarrier.yaml serviceAccountName: frameworkbarrier diff --git a/example/run/README.md b/example/run/README.md index 3af894d5..c6b58d67 100644 --- a/example/run/README.md +++ b/example/run/README.md @@ -16,7 +16,7 @@ Notes: ### Prerequisite -If the k8s cluster enforces [Authorization](https://kubernetes.io/docs/reference/access-authn-authz/authorization/#using-flags-for-your-authorization-module), you need to first create a [Service Account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account) with granted permission for FrameworkController. For example, if the cluster enforces [RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/#kubectl-create-clusterrolebinding): +If the k8s cluster enforces [Authorization](https://kubernetes.io/docs/reference/access-authn-authz/authorization/#using-flags-for-your-authorization-module), you need to first create a [ServiceAccount](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account) with granted permission for FrameworkController. 
For example, if the cluster enforces [RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/#kubectl-create-clusterrolebinding): ```shell kubectl create serviceaccount frameworkcontroller --namespace default kubectl create clusterrolebinding frameworkcontroller \ @@ -26,7 +26,7 @@ kubectl create clusterrolebinding frameworkcontroller \ ### Run -Run FrameworkController with above Service Account and the [k8s inClusterConfig](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod): +Run FrameworkController with above ServiceAccount and the [k8s inClusterConfig](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#accessing-the-api-from-a-pod): #### Run with [default config](../../example/config/default/frameworkcontroller.yaml) ```shell @@ -51,7 +51,7 @@ spec: labels: app: frameworkcontroller spec: - # Using the service account with granted permission + # Using the ServiceAccount with granted permission # if the k8s cluster enforces authorization. serviceAccountName: frameworkcontroller containers: @@ -115,7 +115,7 @@ spec: labels: app: frameworkcontroller spec: - # Using the service account with granted permission + # Using the ServiceAccount with granted permission # if the k8s cluster enforces authorization. serviceAccountName: frameworkcontroller containers: @@ -133,8 +133,8 @@ spec: "cp /frameworkcontroller-config/frameworkcontroller.yaml . 
&& ./start.sh"] volumeMounts: - - name: frameworkcontroller-config - mountPath: /frameworkcontroller-config + - name: frameworkcontroller-config + mountPath: /frameworkcontroller-config volumes: - name: frameworkcontroller-config configMap: diff --git a/pkg/apis/frameworkcontroller/v1/crd.go b/pkg/apis/frameworkcontroller/v1/crd.go index 0a71edb1..0b2d3dee 100644 --- a/pkg/apis/frameworkcontroller/v1/crd.go +++ b/pkg/apis/frameworkcontroller/v1/crd.go @@ -74,6 +74,7 @@ func buildFrameworkValidation() *apiExtensions.CustomResourceValidation { Properties: map[string]apiExtensions.JSONSchemaProps{ "executionType": { Enum: []apiExtensions.JSON{ + {Raw: []byte(common.Quote(string(ExecutionCreate)))}, {Raw: []byte(common.Quote(string(ExecutionStart)))}, {Raw: []byte(common.Quote(string(ExecutionStop)))}, }, diff --git a/pkg/apis/frameworkcontroller/v1/types.go b/pkg/apis/frameworkcontroller/v1/types.go index e5adf44e..e4df46c6 100644 --- a/pkg/apis/frameworkcontroller/v1/types.go +++ b/pkg/apis/frameworkcontroller/v1/types.go @@ -70,8 +70,7 @@ type Framework struct { // Spec ////////////////////////////////////////////////////////////////////////////////////////////////// type FrameworkSpec struct { - Description string `json:"description"` - // Only support to update from ExecutionStart to ExecutionStop + Description string `json:"description"` ExecutionType ExecutionType `json:"executionType"` RetryPolicy RetryPolicySpec `json:"retryPolicy"` TaskRoles []*TaskRoleSpec `json:"taskRoles"` @@ -115,11 +114,23 @@ type TaskSpec struct { Pod core.PodTemplateSpec `json:"pod"` } +// User can set any ExecutionType when create a Framework, and then he can choose +// to change the ExecutionType or not. +// However, only below changes are supported: +// 1. ExecutionCreate -> ExecutionStart/ExecutionStop +// 2. ExecutionStart -> ExecutionStop type ExecutionType string const ( + // The Framework will be kept in FrameworkAttemptCreationPending. 
+ // So it will never start to run or complete. + ExecutionCreate ExecutionType = "Create" + // The Framework will be transitioned from FrameworkAttemptCreationPending. + // So it will immediately start to run. ExecutionStart ExecutionType = "Start" - ExecutionStop ExecutionType = "Stop" + // The Framework will be transitioned to FrameworkCompleted. + // So it will immediately start to complete. + ExecutionStop ExecutionType = "Stop" ) // RetryPolicySpec can be configured for the whole Framework and each TaskRole diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 170c72dc..bd35a0ac 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -1214,6 +1214,12 @@ func (c *FrameworkController) syncFrameworkState(f *ci.Framework) (err error) { return nil } + if f.Spec.ExecutionType == ci.ExecutionCreate { + klog.Infof(logPfx + "Skip to createFrameworkAttempt: " + + "User has requested to just create the Framework without starting it") + return nil + } + if f.Spec.ExecutionType == ci.ExecutionStop { diag := "User has requested to stop the Framework" klog.Info(logPfx + diag)