Skip to content
This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

testing: prow/gpu-operator.sh: use the GPU Operator must-gather image #294

Merged
merged 2 commits into from
Dec 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,9 @@ RUN echo -e '#!/usr/bin/env bash \n\
exec ${HOME}/testing/run "$@" \n\
' > /usr/local/bin/run; chmod ugo+x /usr/local/bin/run


# Prepare the image for must-gather support
# Creates /must-gather and makes it readable/writable/searchable by every user (mode 777).
# NOTE(review): presumably world-writable because the container may run under an
# arbitrary, non-root UID -- confirm against the CI runtime.
RUN mkdir /must-gather && chmod 777 /must-gather

# Prepare osde2e results folder
# Same treatment as above: create the directory and open it to every user.
RUN mkdir /test-run-results && chmod 777 /test-run-results

# Prepare the GPU-operator's gather script
# Exposes ${HOME}/toolbox/gpu-operator/must-gather.sh under the name
# /usr/bin/gpu-operator_gather through a symbolic link.
# NOTE(review): the same change set reports toolbox/gpu-operator/must-gather.sh as
# deleted; if this line is kept the link would point at a missing file -- confirm.
RUN ln -s ${HOME}/toolbox/gpu-operator/must-gather.sh /usr/bin/gpu-operator_gather

# Ensure directory permissions are properly set
# Creates ${HOME}/.ansible/tmp (including missing parents), then recursively sets
# mode 777 on everything under ${HOME}.
RUN mkdir -p ${HOME}/.ansible/tmp && chmod 777 ${HOME} -R

Expand Down
64 changes: 45 additions & 19 deletions testing/prow/gpu-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -109,30 +109,56 @@ prepare_cluster_for_gpu_operator() {

collect_must_gather() {
# run_in_sub_shell: gather GPU Operator diagnostics into $ARTIFACT_EXTRA_LOGS_DIR and
# write the operator version to ${ARTIFACT_DIR}/operator.version.
#
# Reads:   ARTIFACT_DIR (from the caller's environment), toolbox/_common.sh
# Writes:  TOOLBOX_SCRIPT_NAME (exported), COMMON_SH, ENV, OPERATOR_IMAGE, TMP_DIR,
#          img_dirname, and whatever `eval $ENV` assigns (ARTIFACT_EXTRA_LOGS_DIR).
# Outputs: progress messages on stdout.
#
# NOTE(review): this body appears to be a rendered diff in which removed and added
# lines are interleaved without +/- markers; several steps therefore show up twice
# (an older and a newer variant). Confirm which lines belong to the final file before
# relying on the statement order below.
run_in_sub_shell() {
# Variant 1: run the gather script installed at /usr/bin/gpu-operator_gather,
# discarding both its stdout and stderr.
echo "Running gpu-operator_gather ..."
/usr/bin/gpu-operator_gather &> /dev/null

# extract ARTIFACT_EXTRA_LOGS_DIR from 'source toolbox/_common.sh' without sourcing it directly

# The sourcing happens in a child `bash -c`, so nothing _common.sh defines reaches
# this shell directly. The child prints a marker line followed by the
# ARTIFACT_EXTRA_LOGS_DIR=... line from its environment; anything printed while
# sourcing lands before the marker and is dropped afterwards.
export TOOLBOX_SCRIPT_NAME=toolbox/gpu-operator/must-gather.sh
COMMON_SH=$(
bash -c 'source toolbox/_common.sh;
echo "8<--8<--8<--";
# only evaluate these variables from _common.sh
env | egrep "(^ARTIFACT_EXTRA_LOGS_DIR=)"'
)
# tac | sed '/marker/Q' | tac: reverse the lines, print up to (not including) the
# first marker and quit, reverse again -- i.e. keep the lines after the last marker.
# `Q` is a GNU sed extension.
ENV=$(echo "$COMMON_SH" | tac | sed '/8<--8<--8<--/Q' | tac) # keep only what's after the 8<--
# NOTE(review): $ENV is unquoted, so it is word-split before eval sees it; a value
# containing whitespace or shell metacharacters would not be assigned as written.
eval $ENV
# Variant 2: use the operator's own image as the must-gather image.
echo "Running the GPU Operator must-gather image ..."
# Image of the first container of the first pod labelled app=gpu-operator, searched
# in all namespaces. `|| true` keeps a failing `oc` from yielding a non-zero status;
# OPERATOR_IMAGE is simply left empty in that case.
OPERATOR_IMAGE=$(oc get pods -A -lapp=gpu-operator -o=jsonpath='{.items[0].spec.containers[0].image}' || true)

# Scratch directory receiving the must-gather output.
# NOTE(review): it is only removed on the success path below (and see the note on
# that rmdir); the "image not found" and "nothing gathered" paths leave it behind.
TMP_DIR="$(mktemp -d -t gpu-operator_XXXX)"

# Non-empty string test: proceed only when an operator image was found.
if [[ "$OPERATOR_IMAGE" ]]; then
echo "Operator image: $OPERATOR_IMAGE"

# Output of must-gather (stdout and stderr) is discarded.
oc adm must-gather --image="$OPERATOR_IMAGE" --dest-dir="${TMP_DIR}" &> /dev/null

# ${TMP_DIR}/<image>/ should contain the file generated by
# the must-gather script. If this is empty, there wasn't a
# must-gather script in the image!
# Counts the lines `ls` prints for entries two levels below TMP_DIR; `ls` errors
# (e.g. the glob matching nothing) are hidden, giving a count of 0.
if [[ "$(ls "${TMP_DIR}"/*/* 2>/dev/null | wc -l)" == 0 ]]; then
echo "GPU Operator image failed to must-gather anything ..."
else
# Flatten the layout: take the directory of the first listed entry, move its
# content up into TMP_DIR, then remove the now-empty per-image directory.
# NOTE(review): this parses `ls` output. When the glob matches directories, `ls`
# prints their contents (with "dir:" headers if there are several) rather than
# the matched paths, so `dirname` of the first line may not be the per-image
# directory -- verify with real must-gather output.
img_dirname=$(dirname "$(ls "${TMP_DIR}"/*/* | head -1)")
# NOTE(review): $TMP_DIR is unquoted here, unlike the other uses in this function.
mv "$img_dirname"/* $TMP_DIR
rmdir "$img_dirname"

# extract ARTIFACT_EXTRA_LOGS_DIR from 'source toolbox/_common.sh' without sourcing it directly
# Same idea as the earlier extraction, but _common.sh is sourced inside the
# command substitution itself, which runs in a subshell, so this shell's
# variables are still untouched.
export TOOLBOX_SCRIPT_NAME=toolbox/gpu-operator/must-gather.sh
COMMON_SH=$(source toolbox/_common.sh;
echo "8<--8<--8<--";
# only evaluate these variables from _common.sh
env | egrep "(^ARTIFACT_EXTRA_LOGS_DIR=)"
)
# sed '0,/marker/d' deletes everything from the start through the first marker
# line (the `0,/re/` address form is a GNU sed extension).
ENV=$(echo "$COMMON_SH" | sed '0,/8<--8<--8<--/d') # keep only what's after the 8<--
# NOTE(review): unquoted $ENV, same word-splitting caveat as the earlier eval.
eval $ENV

echo "Copying must-gather results to $ARTIFACT_EXTRA_LOGS_DIR ..."
cp -r "$TMP_DIR"/* "$ARTIFACT_EXTRA_LOGS_DIR"

# NOTE(review): `cp -r` copies and does not remove the sources, and rmdir only
# removes empty directories, so this is expected to fail and leave TMP_DIR in
# place; `rm -rf -- "$TMP_DIR"` looks like the intent -- confirm.
rmdir "$TMP_DIR"
fi
else
echo "Failed to find the GPU Operator image ..."
fi

echo "Running gpu-operator_gather ... copying results to $ARTIFACT_EXTRA_LOGS_DIR"
# Calling this until we're sure that the GPU Operator
# must-gather image captures all the information we need
# stdout is discarded and a failure is ignored (`|| true`): best-effort capture.
echo "Running gpu_operator capture_deployment_state ..."
./run_toolbox.py gpu_operator capture_deployment_state > /dev/null || true

# NOTE(review): copies from /must-gather, the directory associated with the
# gpu-operator_gather variant above; presumably obsolete once the image-based
# gathering is used -- confirm.
cp -r /must-gather/* "$ARTIFACT_EXTRA_LOGS_DIR"
echo "Running gpu_operator capture_deployment_state ... done."

echo "Running gpu-operator_gather ... finished."

# Both lines below write ${ARTIFACT_DIR}/operator.version, so the second overwrites
# the first. Each falls back to the literal text MISSING when no matching
# gpu_operator.version file can be read; only the second hides cat's error output.
# NOTE(review): ${ARTIFACT_DIR} is unquoted in both redirections.
(cat "$ARTIFACT_EXTRA_LOGS_DIR"/*__gpu_operator__get_csv_version/gpu_operator.version || echo MISSING) > ${ARTIFACT_DIR}/operator.version
(cat "$ARTIFACT_EXTRA_LOGS_DIR"/*__gpu_operator__wait_deployment/gpu_operator.version 2> /dev/null || echo MISSING) > ${ARTIFACT_DIR}/operator.version

echo "Versions collected."
echo "Operator versions collected."
}

# run the function above in a subshell to avoid polluting the local `env`.
Expand Down
88 changes: 0 additions & 88 deletions toolbox/gpu-operator/must-gather.sh

This file was deleted.