This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

Various refactoring and fixes #108

Merged: 7 commits (Apr 12, 2021)

2 changes: 1 addition & 1 deletion .github/workflows/sync-release-branches.yml
@@ -27,7 +27,7 @@ jobs:
set -x;
git show -s;
git fetch origin --unshallow;
- for branch in release-{4.6,4.7,4.8,4.9};
+ for branch in release-{4.5,4.6,4.7,4.8,4.9};
Contributor comment on this line:
Dear god.... we should drop 4.5 soon

do
git checkout $branch;
git show -s;
1 change: 0 additions & 1 deletion roles/capture_environment/tasks/main.yml
@@ -4,7 +4,6 @@
oc version -o json
| jq --raw-output '.openshiftVersion'
> {{ artifact_extra_logs_dir }}/ocp.version
- register: ocp_full_version

- name: Store OpenShift YAML version
shell:
3 changes: 1 addition & 2 deletions roles/cluster_scaleup/tasks/main.yml
@@ -11,8 +11,7 @@
when: cluster_has_machinetype.stdout != ""
block:
- name: Cluster has running machines with the requested machine type
- debug:
- msg: "Cluster has running machines with type '{{ cluster_scaleup_machine_instance_type }}', no need to scale-up"
+ debug: msg="Cluster has running machines with type '{{ cluster_scaleup_machine_instance_type }}', no need to scale-up"

- name: Cluster already scaled-up, end the play
# Warning: this will prevent using multiple roles side by side ...
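A note on the pattern used throughout this PR: Ansible accepts module arguments either as a nested YAML mapping or as a single key=value string, and the two spellings behave identically, so collapsing a two-line debug or fail task into one line is purely cosmetic. A minimal sketch, with task names and message that are illustrative and not taken from this repository:

# Both tasks print the same message; only the argument syntax differs.
- name: Report that no scale-up is needed (mapping form)
  debug:
    msg: "Cluster already has the requested machine type"

- name: Report that no scale-up is needed (inline key=value form)
  debug: msg="Cluster already has the requested machine type"
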
3 changes: 1 addition & 2 deletions roles/cluster_scaleup/tasks/scaleup.yml
@@ -83,5 +83,4 @@
failed_when: false

- name: Fail because of the cluster scale-up failed
- fail:
- msg: Failing because of cluster scale-up failed
+ fail: msg="Failing because of cluster scale-up failed"
3 changes: 1 addition & 2 deletions roles/entitlement_test_wait/tasks/main.yml
@@ -47,8 +47,7 @@
when: entitlement_inspect_on_failure == 'yes'

- name: Failed because the entitlement test Pod did not succeed.
- fail:
- msg: Failed because the entitlement test Pod did not succeed.
+ fail: msg="Failed because the entitlement test Pod did not succeed."

always:
- name: Get the test Pod logs
21 changes: 8 additions & 13 deletions roles/gpu_operator_deploy_custom_commit/tasks/build.yml
@@ -61,12 +61,11 @@
failed_when: false

- name: Suggest how to retry the build
- debug:
- msg: "Suggestion: retry with 'oc new-build bc/helper-image-builder -n gpu-operator-ci-utils'"
+ debug: msg="To retry the build, run 'oc new-build bc/helper-image-builder -n gpu-operator-ci-utils'"

- name: Fail because the custom operator image failed to build
- fail:
- msg: Fail because the custom operator image failed to build
+ fail: msg="Fail because the custom operator image failed to build"


- name: Apply the operator image builder script manifest
command: oc apply -f "{{ gpu_operator_image_builder_script }}"
@@ -77,8 +76,8 @@
register: builder_secret
rescue:
- name: Failed to find the builder-dockercfg secret
- fail:
- msg: "No builder-dockercfg secret in the 'gpu-operator-ci' namespace ..."
+ fail: msg="No builder-dockercfg secret in the 'gpu-operator-ci' namespace ..."


- name: Delete the operator image builder pod, if any
command: oc delete -f "{{ gpu_operator_image_builder_pod }}" --ignore-not-found=true
@@ -92,8 +91,7 @@
when: has_operator_image.rc != 0
block:
- name: The operator image does not exist, build it
- debug:
- msg: The operator image does not exist, build it
+ debug: msg="The operator image does not exist, build it"

- name: Apply the operator image builder pod manifest
shell:
@@ -119,9 +117,7 @@

- name: Fail if the operator image failed to be built
when: "'Failed' in wait_image_builder_pod.stdout or 'Error' in wait_image_builder_pod.stdout"
- fail:
- msg: The operator image failed to build
-
+ fail: msg="The operator image failed to build"

rescue:
- name: Get info about the operator image build failure (debug)
@@ -133,5 +129,4 @@
failed_when: false

- name: Fail because the operator image failed to build
- fail:
- msg: Fail because the operator image failed to build
+ fail: msg="Fail because the operator image failed to build"
14 changes: 12 additions & 2 deletions roles/gpu_operator_deploy_custom_commit/tasks/deploy.yml
@@ -17,6 +17,17 @@
"{{ gpu_operator_image_tag }}"

rescue:
- name: "Get the state of the GPU operator image (debug: ImagePullBackOff)"
command: oc get imagestreamtag/ci-artifacts:gpu-operator-ci-image -n gpu-operator-ci
failed_when: false

- name: "Get the name of the GPU Operator pod image (debug: ImagePullBackOff)"
command:
oc get pods
-n gpu-operator-ci
-o=jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{" "}{end}{end}'
failed_when: false

- name: Get the state of the GPU operator pod (debug)
command: oc get pods -n gpu-operator-ci
failed_when: false
@@ -27,5 +38,4 @@
failed_when: false

- name: Fail because the GPU Operator custom commit could not be deployed
- fail:
- msg: Fail because the GPU Operator custom commit could not be deployed
+ fail: msg="Fail because the GPU Operator custom commit could not be deployed"
12 changes: 4 additions & 8 deletions roles/gpu_operator_deploy_from_operatorhub/tasks/main.yml
@@ -34,8 +34,7 @@
when: gpu_operator_operatorhub_version != ''

- name: "Create the OperatorHub subscription for {{ gpu_operator_csv_name }}"
- debug:
- msg: "{{ gpu_operator_csv_name }}"
+ debug: msg="{{ gpu_operator_csv_name }}"

- name: "Create the OperatorHub subscription for {{ gpu_operator_csv_name }}"
shell:
@@ -85,12 +84,10 @@
failed_when: false

- name: "Indicate where the Catalog-operator logs have been saved"
- debug:
- msg: "The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log"
+ debug: msg="The logs of Catalog Operator have been saved in {{ artifact_extra_logs_dir }}/catalog_operator.log"

- name: Failed because the GPU Operator could not be install from the Catalog Operator
- fail:
- msg: Failed because the GPU Operator could not be install from the Catalog Operator
+ fail: msg="Failed because the GPU Operator could not be install from the Catalog Operator"

- name: Create a temporary file for the GPU Operator clusterpolicy
ansible.builtin.tempfile:
@@ -143,8 +140,7 @@
failed_when: false

- name: Failing because the ClusterPolicy CR cannot be created
- fail:
- msg: Failed because the ClusterPolicy CR cannot be created
+ fail: msg="Failed because the ClusterPolicy CR cannot be created"

- block:
- name: Check if the namespace has the openshift.io/cluster-monitoring label
9 changes: 3 additions & 6 deletions roles/gpu_operator_wait_deployment/tasks/main.yml
@@ -26,8 +26,7 @@
- name: Ensure that nvidia-device-plugin-validation Pod has ran successfully
command:
oc get pods
- --field-selector=metadata.name=nvidia-device-plugin-validation
- --field-selector=status.phase=Succeeded
+ --field-selector=metadata.name=nvidia-device-plugin-validation,status.phase=Succeeded
-n gpu-operator-resources
-oname --no-headers
register: has_deviceplugin_validation_pod
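Background on this change: --field-selector is a single string option, so passing it twice does not combine the two conditions (in practice only the last value is honored); joining the selectors with a comma applies both conditions in one query. A minimal sketch of the combined check, reusing the names from the task above:

# One selector expression, both conditions applied together.
- name: Check that the device-plugin validation Pod completed (sketch)
  command:
    oc get pods
    --field-selector=metadata.name=nvidia-device-plugin-validation,status.phase=Succeeded
    -n gpu-operator-resources
    -oname --no-headers
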
@@ -53,8 +52,7 @@
failed_when: false

- name: The GFD did not label the nodes
- fail:
- msg: The GFD did not label the nodes
+ fail: msg="The GFD did not label the nodes"

- block:
- name: Wait for the nvidia-dcgm-exporter Pod to respond appropriately
@@ -103,5 +101,4 @@
failed_when: false

- name: The DCGM does not correctly expose the GPU metrics
- fail:
- msg: The DCGM does not correctly expose the GPU metrics
+ fail: msg="The DCGM does not correctly expose the GPU metrics"
3 changes: 1 addition & 2 deletions roles/local-ci_deploy/tasks/main.yml
@@ -56,5 +56,4 @@
| oc apply -f-

- name: "Finished: the local CI execution has been launched."
- debug:
- msg: oc get pod/ci-artifacts -n ci-artifacts; oc logs -f pod/ci-artifacts -n ci-artifacts
+ debug: msg="oc get pod/ci-artifacts -n ci-artifacts; oc logs -f pod/ci-artifacts -n ci-artifacts"
3 changes: 1 addition & 2 deletions roles/nfd_deploy/tasks/main.yml
@@ -38,5 +38,4 @@
failed_when: false

- name: Failed when creating the NFD NodeFeatureDiscovery CR
- fail:
- msg: "{{ apply_nfd_cr }}"
+ fail: msg="{{ apply_nfd_cr }}"
7 changes: 2 additions & 5 deletions roles/nfd_has_labels/tasks/main.yml
@@ -8,10 +8,7 @@
rescue:
- name: Check if NFD CR exists (debug)
command: oc get NodeFeatureDiscovery -A
- register: has_nfd_cr
- failed_when: has_nfd_cr.stdout == ""
- ignore_errors: true
+ failed_when: false

- name: Fail because nodes do not have NFD labels
- fail:
- msg: Fail because nodes do not have NFD labels
+ fail: msg="Fail because nodes do not have NFD labels"
7 changes: 3 additions & 4 deletions roles/nfd_test_wait_gpu/tasks/main.yml
@@ -28,11 +28,10 @@
shell: oc get nodes --show-labels --selector='!node-role.kubernetes.io/master' | tr , '\n'

- name: Failing because no GPU node showed up
- fail:
- msg: Failed because no GPU node showed up
+ fail: msg="Failed because no GPU node showed up"

when: nfd_wait_gpu_nodes == 'yes'

- name: Failing because no GPU node is available
- fail:
- msg: Failed because no GPU node is available
+ fail: msg="Failed because no GPU node is available"
when: nfd_wait_gpu_nodes != 'yes'
2 changes: 1 addition & 1 deletion toolbox/_common.sh
@@ -44,7 +44,7 @@ mkdir -p "$(dirname "${ANSIBLE_LOG_PATH}")"
# Ansible caching directory

if [ -z "${ANSIBLE_CACHE_PLUGIN_CONNECTION:-}" ]; then
- export ANSIBLE_CACHE_PLUGIN_CONNECTION="${ARTIFACT_EXTRA_LOGS_DIR}/ansible_facts"
+ export ANSIBLE_CACHE_PLUGIN_CONNECTION="${ARTIFACT_DIR}/ansible_facts"
fi
echo "Using '${ANSIBLE_CACHE_PLUGIN_CONNECTION}' to store ansible facts."
mkdir -p "${ANSIBLE_CACHE_PLUGIN_CONNECTION}"
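For context on this last change: ANSIBLE_CACHE_PLUGIN_CONNECTION tells the configured fact-cache plugin where to store its data (for file-based plugins, a directory). A minimal shell sketch of an explicit setup, assuming the jsonfile cache plugin and a CI-provided ARTIFACT_DIR; the fallback path is only illustrative:

# Cache gathered facts as JSON files under the CI artifact directory.
export ANSIBLE_CACHE_PLUGIN=jsonfile
export ANSIBLE_CACHE_PLUGIN_CONNECTION="${ARTIFACT_DIR:-/tmp/ci-artifacts}/ansible_facts"
mkdir -p "${ANSIBLE_CACHE_PLUGIN_CONNECTION}"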