Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

[BREAKING CHANGE]: Refine AnnotationKey, LabelKey and EnvName #6

Merged
merged 1 commit into from
Jan 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions example/framework/basic/batchstatefulfailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,19 @@ spec:
# To locate a specific Task during its whole lifecycle regardless of
# any retry:
# Consistent Identity:
# PodName = {FrameworkName}-{TaskRoleName}-{TaskIndex}
# PodNamespace = {FrameworkNamespace}
# PodName = {FrameworkName}-{TaskRoleName}-{TaskIndex}
# Consistent Environment Variable Value:
# ${FRAMEWORK_NAME}, ${TASKROLE_NAME}, ${TASK_INDEX}
# ${CONFIGMAP_NAME}, ${POD_NAME}, ${POD_NAMESPACE}
# ${FC_FRAMEWORK_NAMESPACE},
# ${FC_FRAMEWORK_NAME}, ${FC_TASKROLE_NAME}, ${FC_TASK_INDEX},
# ${FC_CONFIGMAP_NAME}, ${FC_POD_NAME}
#
# To locate a specific execution attempt of a specific Task:
# Attempt Specific Environment Variable Value:
# ${FRAMEWORK_ATTEMPT_ID}, ${TASK_ATTEMPT_ID}
# ${FC_FRAMEWORK_ATTEMPT_ID}, ${FC_TASK_ATTEMPT_ID}
#
# To locate a specific execution attempt instance of a specific Task:
# Attempt Instance Specific Environment Variable Value:
# ${FRAMEWORK_ATTEMPT_INSTANCE_UID}, ${CONFIGMAP_UID}
# ${TASK_ATTEMPT_INSTANCE_UID}, ${POD_UID}
# ${FC_FRAMEWORK_ATTEMPT_INSTANCE_UID}, ${FC_CONFIGMAP_UID}
# ${FC_TASK_ATTEMPT_INSTANCE_UID}, ${FC_POD_UID}
command: ["sh", "-c", "printenv && sleep 60 && exit 1"]
4 changes: 2 additions & 2 deletions example/framework/basic/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ metadata:
spec:
selector:
# Using predefined labels
FRAMEWORK_NAME: service
TASKROLE_NAME: server
FC_FRAMEWORK_NAME: service
FC_TASKROLE_NAME: server
# Customized labels can also be used
#app: server
ports:
Expand Down
4 changes: 2 additions & 2 deletions example/framework/basic/servicestateful.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ metadata:
spec:
selector:
# See comments in service.yaml
FRAMEWORK_NAME: servicestateful
TASKROLE_NAME: serverstateful
FC_FRAMEWORK_NAME: servicestateful
FC_TASKROLE_NAME: serverstateful
ports:
- port: 80
type: NodePort
22 changes: 11 additions & 11 deletions example/framework/extension/frameworkbarrier.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ spec:
containers:
- name: ubuntu
image: ubuntu:trusty
# Using /mnt/frameworkbarrier/injector.sh to inject environment
# variables, such as:
# {TaskRoleName}_ips=
# Using /mnt/frameworkbarrier/injector.sh to inject environment variables,
# such as:
# FB_{UpperCase({TaskRoleName})}_IPS=
# {Task[0].PodIP},...,
# {Task[TaskRole.TaskNumber-1].PodIP}
# {TaskRoleName}_addresses=
# {Task[0].PodIP}:${{TaskRoleName}_port},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${{TaskRoleName}_port}
# Note, the environment variable {TaskRoleName}_port should be
# FB_{UpperCase({TaskRoleName})}_ADDRESSES=
# {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
# Note, the environment variable FB_{UpperCase({TaskRoleName})}_PORT should be
# provided by the caller in advance.
#
# User may need to tweak these environment variables to its own
Expand All @@ -48,8 +48,8 @@ spec:
# /mnt/frameworkbarrier/framework.json.
command: [
"sh", "-c",
"server_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
server_port=4002 worker_port=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
"FB_SERVER_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
FB_SERVER_PORT=4002 FB_WORKER_PORT=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
sleep 60"]
ports:
- containerPort: 4001
Expand Down Expand Up @@ -104,8 +104,8 @@ spec:
image: ubuntu:trusty
command: [
"sh", "-c",
"server_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
server_port=4002 worker_port=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
"FB_SERVER_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
FB_SERVER_PORT=4002 FB_WORKER_PORT=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
sleep 60"]
ports:
- containerPort: 5001
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,22 @@ spec:
# For the tf_cnn_benchmarks usage, see
# https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
# Using /mnt/frameworkbarrier/injector.sh to inject environment
# variables without the need for image invasion and k8s DNS:
# {TaskRoleName}_addresses=
# {Task[0].PodIP}:${{TaskRoleName}_port},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${{TaskRoleName}_port}
# Using /mnt/frameworkbarrier/injector.sh to inject environment variables
# without the need for image invasion and k8s DNS:
# FB_{UpperCase({TaskRoleName})}_ADDRESSES=
# {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
# See more in ./example/framework/extension/frameworkbarrier.yaml
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=cpu --local_parameter_device=cpu --data_format=NHWC
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 4001
volumeMounts:
Expand Down Expand Up @@ -129,14 +129,14 @@ spec:
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=cpu --local_parameter_device=cpu --data_format=NHWC
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 5001
volumeMounts:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,22 @@ spec:
# For the tf_cnn_benchmarks usage, see
# https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
# Using /mnt/frameworkbarrier/injector.sh to inject environment
# variables without the need for image invasion and k8s DNS:
# {TaskRoleName}_addresses=
# {Task[0].PodIP}:${{TaskRoleName}_port},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${{TaskRoleName}_port}
# Using /mnt/frameworkbarrier/injector.sh to inject environment variables
# without the need for image invasion and k8s DNS:
# FB_{UpperCase({TaskRoleName})}_ADDRESSES=
# {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
# See more in ./example/framework/extension/frameworkbarrier.yaml
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 4001
resources:
Expand Down Expand Up @@ -135,14 +135,14 @@ spec:
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 5001
resources:
Expand Down
45 changes: 25 additions & 20 deletions pkg/apis/frameworkcontroller/v1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,35 +46,40 @@ const (
ExtendedUnlimitedValue = -2

// For all managed objects
AnnotationKeyFrameworkName = "FRAMEWORK_NAME"
AnnotationKeyTaskRoleName = "TASKROLE_NAME"
AnnotationKeyTaskIndex = "TASK_INDEX"
AnnotationKeyConfigMapName = "CONFIGMAP_NAME"
AnnotationKeyPodName = "POD_NAME"
AnnotationKeyPodNamespace = "POD_NAMESPACE"

AnnotationKeyFrameworkAttemptID = "FRAMEWORK_ATTEMPT_ID"
AnnotationKeyFrameworkAttemptInstanceUID = "FRAMEWORK_ATTEMPT_INSTANCE_UID"
AnnotationKeyConfigMapUID = "CONFIGMAP_UID"
AnnotationKeyTaskAttemptID = "TASK_ATTEMPT_ID"

// Predefined Annotations
AnnotationKeyFrameworkNamespace = "FC_FRAMEWORK_NAMESPACE"
AnnotationKeyFrameworkName = "FC_FRAMEWORK_NAME"
AnnotationKeyTaskRoleName = "FC_TASKROLE_NAME"
AnnotationKeyTaskIndex = "FC_TASK_INDEX"
AnnotationKeyConfigMapName = "FC_CONFIGMAP_NAME"
AnnotationKeyPodName = "FC_POD_NAME"

AnnotationKeyFrameworkAttemptID = "FC_FRAMEWORK_ATTEMPT_ID"
AnnotationKeyFrameworkAttemptInstanceUID = "FC_FRAMEWORK_ATTEMPT_INSTANCE_UID"
AnnotationKeyConfigMapUID = "FC_CONFIGMAP_UID"
AnnotationKeyTaskAttemptID = "FC_TASK_ATTEMPT_ID"

// Predefined Labels
LabelKeyFrameworkName = AnnotationKeyFrameworkName
LabelKeyTaskRoleName = AnnotationKeyTaskRoleName

// For all managed containers
EnvNameFrameworkName = AnnotationKeyFrameworkName
EnvNameTaskRoleName = AnnotationKeyTaskRoleName
EnvNameTaskIndex = AnnotationKeyTaskIndex
EnvNameConfigMapName = AnnotationKeyConfigMapName
EnvNamePodName = AnnotationKeyPodName
EnvNamePodNamespace = AnnotationKeyPodNamespace
// Predefined Environment Variables
// They can be referenced by the environment variables specified in the spec,
// i.e. set an environment variable's value to include "$(AnyPredefinedEnvName)".
EnvNameFrameworkNamespace = AnnotationKeyFrameworkNamespace
EnvNameFrameworkName = AnnotationKeyFrameworkName
EnvNameTaskRoleName = AnnotationKeyTaskRoleName
EnvNameTaskIndex = AnnotationKeyTaskIndex
EnvNameConfigMapName = AnnotationKeyConfigMapName
EnvNamePodName = AnnotationKeyPodName

EnvNameFrameworkAttemptID = AnnotationKeyFrameworkAttemptID
EnvNameFrameworkAttemptInstanceUID = AnnotationKeyFrameworkAttemptInstanceUID
EnvNameConfigMapUID = AnnotationKeyConfigMapUID
EnvNameTaskAttemptID = AnnotationKeyTaskAttemptID
EnvNameTaskAttemptInstanceUID = "TASK_ATTEMPT_INSTANCE_UID"
EnvNamePodUID = "POD_UID"
EnvNameTaskAttemptInstanceUID = "FC_TASK_ATTEMPT_INSTANCE_UID"
EnvNamePodUID = "FC_POD_UID"
)

var FrameworkGroupVersionKind = SchemeGroupVersion.WithKind(FrameworkKind)
Expand Down
17 changes: 8 additions & 9 deletions pkg/apis/frameworkcontroller/v1/funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ func (f *Framework) NewConfigMap() *core.ConfigMap {
cm.Finalizers = []string{meta.FinalizerDeleteDependents}

cm.Annotations = map[string]string{}
cm.Annotations[AnnotationKeyFrameworkNamespace] = f.Namespace
cm.Annotations[AnnotationKeyFrameworkName] = f.Name
cm.Annotations[AnnotationKeyConfigMapName] = cm.Name
cm.Annotations[AnnotationKeyFrameworkAttemptID] = frameworkAttemptIDStr
Expand Down Expand Up @@ -305,12 +306,12 @@ func (f *Framework) NewPod(cm *core.ConfigMap, taskRoleName string, taskIndex in
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
pod.Annotations[AnnotationKeyFrameworkNamespace] = f.Namespace
pod.Annotations[AnnotationKeyFrameworkName] = f.Name
pod.Annotations[AnnotationKeyTaskRoleName] = taskRoleName
pod.Annotations[AnnotationKeyTaskIndex] = taskIndexStr
pod.Annotations[AnnotationKeyConfigMapName] = f.ConfigMapName()
pod.Annotations[AnnotationKeyPodName] = pod.Name
pod.Annotations[AnnotationKeyPodNamespace] = pod.Namespace
pod.Annotations[AnnotationKeyFrameworkAttemptID] = frameworkAttemptIDStr
pod.Annotations[AnnotationKeyFrameworkAttemptInstanceUID] = frameworkAttemptInstanceUIDStr
pod.Annotations[AnnotationKeyConfigMapUID] = configMapUIDStr
Expand All @@ -322,13 +323,13 @@ func (f *Framework) NewPod(cm *core.ConfigMap, taskRoleName string, taskIndex in
pod.Labels[LabelKeyFrameworkName] = f.Name
pod.Labels[LabelKeyTaskRoleName] = taskRoleName

exEnvs := []core.EnvVar{
predefinedEnvs := []core.EnvVar{
{Name: EnvNameFrameworkNamespace, Value: f.Namespace},
{Name: EnvNameFrameworkName, Value: f.Name},
{Name: EnvNameTaskRoleName, Value: taskRoleName},
{Name: EnvNameTaskIndex, Value: taskIndexStr},
{Name: EnvNameConfigMapName, Value: f.ConfigMapName()},
{Name: EnvNamePodName, Value: pod.Name},
{Name: EnvNamePodNamespace, Value: pod.Namespace},
{Name: EnvNameFrameworkAttemptID, Value: frameworkAttemptIDStr},
{Name: EnvNameFrameworkAttemptInstanceUID, Value: frameworkAttemptInstanceUIDStr},
{Name: EnvNameConfigMapUID, Value: configMapUIDStr},
Expand All @@ -337,24 +338,22 @@ func (f *Framework) NewPod(cm *core.ConfigMap, taskRoleName string, taskIndex in
{Name: EnvNameTaskAttemptInstanceUID, Value: taskAttemptInstanceUIDReferStr},
}

// Prepend predefinedEnvs so that they can be referenced by the environment
// variables specified in the spec.
// Change the default TerminationMessagePolicy to TerminationMessageFallbackToLogsOnError
// in case cluster-level logging has not been set up for the cluster.
// See https://kubernetes.io/docs/concepts/cluster-administration/logging
// It is safe to do so, since it will only fall back to the tail of the log if the
// container has failed and the termination message file specified by the
// terminationMessagePath is not found or is empty.
for i := range pod.Spec.Containers {
for _, exEnv := range exEnvs {
pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, exEnv)
}
pod.Spec.Containers[i].Env = append(predefinedEnvs, pod.Spec.Containers[i].Env...)
if len(pod.Spec.Containers[i].TerminationMessagePolicy) == 0 {
pod.Spec.Containers[i].TerminationMessagePolicy = core.TerminationMessageFallbackToLogsOnError
}
}
for i := range pod.Spec.InitContainers {
for _, exEnv := range exEnvs {
pod.Spec.InitContainers[i].Env = append(pod.Spec.InitContainers[i].Env, exEnv)
}
pod.Spec.InitContainers[i].Env = append(predefinedEnvs, pod.Spec.InitContainers[i].Env...)
if len(pod.Spec.InitContainers[i].TerminationMessagePolicy) == 0 {
pod.Spec.InitContainers[i].TerminationMessagePolicy = core.TerminationMessageFallbackToLogsOnError
}
Expand Down
Loading