Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

clean ${PAI_WORK_DIR} before mv content to this folder #3695

Merged
merged 5 commits into from Oct 15, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml
Expand Up @@ -840,6 +840,20 @@ spec:
runtimeContainerPatterns:
- userLogRegex: "(?msi)CUDA_ERROR_ECC_UNCORRECTABLE.*"

- code: 248
phrase: PAIRuntimeInitContainerUnkownError
issuer: PAI_RUNTIME
causer: PAI_RUNTIME
type: PLATFORM_FAILURE
stage: RUNNING
behavior: UNKNOWN
reaction: RETRY_TO_MAX
reason: "Pai runtime init container exit with unknown code"
repro:
- "Pai runtime init container script exit with code 1"
solution:
- "Contact PAI Dev to fix the PAI Runtime bug"

- code: 249
phrase: PAIRuntimeSSHBarrierTimeout
issuer: PAI_RUNTIME
Expand Down
2 changes: 1 addition & 1 deletion src/kube-runtime/build/kube-runtime.dockerfile
Expand Up @@ -44,4 +44,4 @@ RUN chmod -R +x ./
# This line should be removed after using k8s client to interact with api server
RUN apk update && apk add --no-cache curl

CMD ["/bin/sh", "-c", "set -o pipefail && LOG_DIR=/usr/local/pai/logs/${FC_POD_UID} && mkdir -p ${LOG_DIR} && /kube-runtime/src/init 2>&1 | tee ${LOG_DIR}/init.log"]
CMD ["/bin/sh", "-c", "set -o pipefail && LOG_DIR=/usr/local/pai/logs/${FC_POD_UID} && mkdir -p ${LOG_DIR} && /kube-runtime/src/init 2>&1 | tee -a ${LOG_DIR}/init.log"]
35 changes: 27 additions & 8 deletions src/kube-runtime/src/init
Expand Up @@ -20,9 +20,16 @@
set -o errexit
set -o nounset
set -o pipefail
set -x

CHILD_PROCESS="UNKNOWN"

# exit code map
# FRAMEWORK_BARRIER: 200 -> 250 FrameworkBarrierTransientFailed
# 201 -> 251 JobGangAllocationTimeout
# 210 -> 252 FrameworkBarrierPermanentFailed
# PORT_CONFLICT_CHECKER: 10 -> 253 ContainerPortConflict
# PAIRuntimeInitContainerUnkownError: 248
function exit_handler()
{
EXIT_CODE=$?
Expand Down Expand Up @@ -53,14 +60,15 @@ function exit_handler()
fi

# signal triggered, do not change exit code
if [[ $EXIT_CODE -eq 130 || $EXIT_CODE -eq 131 || $EXIT_CODE -eq 132 || \
$EXIT_CODE -eq 134 || $EXIT_CODE -eq 135 || $EXIT_CODE -eq 136 || \
$EXIT_CODE -eq 137 || $EXIT_CODE -eq 139 || $EXIT_CODE -eq 141 || $EXIT_CODE -eq 143 ]]; then
exit $EXIT_CODE
fi

echo "Unknown exit code, platform error"
exit 1
case $EXIT_CODE in
130|131|132|134|135|136|137|139|141|143)
exit $EXIT_CODE
;;
*)
echo "Unknown exit code, platform error"
exit 248
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
;;
esac
}

trap exit_handler EXIT
Expand All @@ -78,6 +86,17 @@ PAI_RUNTIME_DIR=${PAI_WORK_DIR}/runtime.d

PAI_LOG_DIR=${PAI_WORK_DIR}/logs/${FC_POD_UID}

yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
# Move log files left over from a previous run into a separate folder. Note: for init.log, some of the output will be appended to the previous log file.
LOG_FILES=$(find $PAI_LOG_DIR -maxdepth 1 -type f)
if [[ ! -z $LOG_FILES ]]; then
PRE_LOG_DIR=$PAI_LOG_DIR/prelog_$(date +%s)
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
mkdir $PRE_LOG_DIR
mv $LOG_FILES $PRE_LOG_DIR
fi

# Clean ${PAI_WORK_DIR}, since it may contain content from the last execution (this rarely happens, but it has been seen in the real world).
rm -rf ${PAI_WORK_DIR}/*
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved

# Move all runtime sources to PAI_WORK_DIR
mv ./* ${PAI_WORK_DIR}
cd ${PAI_WORK_DIR}
Expand Down