# ci-operator/templates/openshift/openshift-ansible/cluster-launch-e2e-openshift-ansible.yaml

# TODO: This file has the wrong name and needs to be refactored
# OpenShift Template that creates an image-puller RoleBinding and a single
# e2e Pod. All parameters are mandatory; references to them in the objects
# below are filled in when the template is processed.
kind: Template
apiVersion: template.openshift.io/v1

parameters:
# Used as the Pod name and as the prefix of the image-puller RoleBinding and
# cluster-profile secret names.
- name: JOB_NAME_SAFE
  required: true
# Combined with NAMESPACE to form INSTANCE_PREFIX in the setup and teardown
# containers.
- name: JOB_NAME_HASH
  required: true
# Namespace in which the RoleBinding and the Pod are created.
- name: NAMESPACE
  required: true
# Image used by the setup and teardown containers.
- name: IMAGE_ANSIBLE
  required: true
# NOTE(review): not referenced by any object visible in this template.
- name: IMAGE_FORMAT
  required: true
# Image used by the test container.
- name: IMAGE_TESTS
  required: true
# Passed to the launch playbook as openshift_test_repo.
- name: RPM_REPO_OPENSHIFT_ORIGIN
  required: true
# Selects the provider-specific environment in the test container (gcp, aws
# and openstack are handled) and is exported as TYPE to setup and teardown.
- name: CLUSTER_TYPE
  required: true
# Shell command executed as the last step of the test container script; the
# script defines a run-tests function that the command may call.
- name: TEST_COMMAND
  required: true

objects:

# We want the cluster to be able to access these images, so bind the
# system:image-puller role in the job namespace to both the authenticated and
# the unauthenticated system groups.
- kind: RoleBinding
  apiVersion: authorization.openshift.io/v1
  metadata:
    name: ${JOB_NAME_SAFE}-image-puller
    namespace: ${NAMESPACE}
  roleRef:
    name: system:image-puller
  subjects:
  - kind: SystemGroup
    name: system:unauthenticated
  - kind: SystemGroup
    name: system:authenticated

# The e2e pod spins up a cluster, runs e2e tests, and then cleans up the cluster.
# Its three containers (test, setup, teardown) run side by side and coordinate
# through marker files on the shared-tmp volume.
- kind: Pod
  apiVersion: v1
  metadata:
    name: ${JOB_NAME_SAFE}
    namespace: ${NAMESPACE}
    annotations:
      # we want to gather the teardown logs no matter what
      ci-operator.openshift.io/wait-for-container-artifacts: teardown
      ci-operator.openshift.io/save-container-logs: "true"
  spec:
    # Containers are never restarted; each script runs at most once.
    restartPolicy: Never
    # 21600 seconds = 6 hours for the whole pod.
    activeDeadlineSeconds: 21600
    # 600 seconds = 10 minutes between termination request and kill.
    terminationGracePeriodSeconds: 600
    volumes:
    # Mounted at /tmp/artifacts in every container; collected output goes here.
    - name: artifacts
      emptyDir: {}
    # Mounted at /tmp/shared in every container; carries admin.kubeconfig, the
    # copied oc binary and the "exit" marker file between containers.
    - name: shared-tmp
      emptyDir: {}
    # Mounted over the openshift-ansible test/ci/inventory directory in the
    # setup and teardown containers.
    - name: shared-inventory
      emptyDir: {}
    # Secret holding the cluster profile files (ssh key, cloud credentials,
    # vars files) read by all three containers.
    - name: cluster-profile
      secret:
        secretName: ${JOB_NAME_SAFE}-cluster-profile

    containers:
    # Once admin.kubeconfig exists, executes shared tests.
    # The script waits for the setup container to publish the kubeconfig on the
    # shared volume, prepares provider-specific environment variables, defines
    # the run-tests helper and finally runs TEST_COMMAND. If another container
    # signals exit before the kubeconfig appears, the script fails immediately
    # instead of waiting for the pod deadline.
    - name: test
      image: ${IMAGE_TESTS}
      terminationMessagePolicy: FallbackToLogsOnError
      resources:
        requests:
          cpu: 1
          memory: 300Mi
        limits:
          memory: 4Gi
      volumeMounts:
      - name: shared-tmp
        mountPath: /tmp/shared
      - name: cluster-profile
        mountPath: /tmp/cluster
      - name: artifacts
        mountPath: /tmp/artifacts
      env:
      - name: ARTIFACT_DIR
        value: /tmp/artifacts
      - name: HOME
        value: /tmp/home
      - name: KUBECONFIG
        value: /tmp/admin.kubeconfig
      command:
      - /bin/bash
      - -c
      - |
        #!/bin/bash
        set -euo pipefail

        export PATH=/usr/libexec/origin:$PATH

        # always leave the exit marker behind so the teardown container starts
        trap 'touch /tmp/shared/exit' EXIT
        # on TERM, stop any background job (the sleep below) and exit cleanly
        trap 'jobs -p | xargs -r kill || true; exit 0' TERM

        cp "$(which oc)" /tmp/shared/

        mkdir -p "${HOME}"

        # wait until the setup job creates admin.kubeconfig; give up as soon as
        # another container has left the exit marker, because a failed install
        # never produces the kubeconfig
        while [[ ! -f /tmp/shared/admin.kubeconfig ]]; do
          if [[ -f /tmp/shared/exit ]]; then
            echo "Another container exited before admin.kubeconfig was created" >&2
            exit 1
          fi
          sleep 15 & wait
        done
        echo "Found shared kubeconfig"

        # don't let clients impact the global kubeconfig
        cp /tmp/shared/admin.kubeconfig /tmp/admin.kubeconfig

        # set up cloud-provider-specific env vars
        if [[ "${CLUSTER_TYPE}" == "gcp" ]]; then
          export GOOGLE_APPLICATION_CREDENTIALS="/tmp/cluster/gce.json"
          export KUBE_SSH_USER=cloud-user
          mkdir -p ~/.ssh
          cp /tmp/cluster/ssh-privatekey ~/.ssh/google_compute_engine || true
          export PROVIDER_ARGS='-provider=gce -gce-zone=us-east1-c -gce-project=openshift-gce-devel-ci'
          export TEST_PROVIDER='{"type":"gce","zone":"us-east1-c","projectid":"openshift-gce-devel-ci"}'
        elif [[ "${CLUSTER_TYPE}" == "aws" ]]; then
          mkdir -p ~/.ssh
          cp /tmp/cluster/ssh-privatekey ~/.ssh/kube_aws_rsa || true
          export PROVIDER_ARGS="-provider=aws -gce-zone=us-east-1"
          export TEST_PROVIDER='{"type":"aws","zone":"us-east-1"}'
        elif [[ "${CLUSTER_TYPE}" == "openstack" ]]; then
          mkdir -p ~/.ssh
          cp /tmp/cluster/ssh-privatekey ~/.ssh/kube_openstack_rsa || true
        fi

        mkdir -p /tmp/output
        cd /tmp/output

        # Runs the suite named by TEST_SUITE through openshift-tests when that
        # binary is available; otherwise falls back to ginkgo + extended.test
        # driven by TEST_FOCUS / TEST_FOCUS_SERIAL. Returns non-zero when a
        # ginkgo run fails or when neither runner can be used.
        function run-tests() {
          # keep the failure code local so repeated calls do not see a stale value
          local rc=0
          set -x
          if which openshift-tests && [[ -n "${TEST_SUITE-}" ]]; then
            openshift-tests run "${TEST_SUITE}" --provider "${TEST_PROVIDER:-}" -o "${ARTIFACT_DIR}/e2e.log" --junit-dir "${ARTIFACT_DIR}/junit"
            return 0
          fi
          # TODO: remove everything after this point once we fork templates by release - starting with 4.0
          if ! which extended.test; then
            echo "must provide TEST_SUITE variable"
            return 1
          fi
          # PROVIDER_ARGS is intentionally unquoted: it holds several flags
          if [[ -n "${TEST_FOCUS:-}" ]]; then
            ginkgo -v -noColor -nodes="${TEST_PARALLELISM:-30}" "$( which extended.test )" -- \
              -ginkgo.focus="${TEST_FOCUS}" -ginkgo.skip="${TEST_SKIP:-"\\[local\\]"}" \
              -e2e-output-dir "${ARTIFACT_DIR}" -report-dir "${ARTIFACT_DIR}/junit" \
              -test.timeout=2h ${PROVIDER_ARGS-} || rc=$?
          fi
          if [[ -n "${TEST_FOCUS_SERIAL:-}" ]]; then
            ginkgo -v -noColor -nodes=1 "$( which extended.test )" -- \
              -ginkgo.focus="${TEST_FOCUS_SERIAL}" -ginkgo.skip="${TEST_SKIP_SERIAL:-"\\[local\\]"}" \
              -e2e-output-dir "${ARTIFACT_DIR}" -report-dir "${ARTIFACT_DIR}/junit/serial" \
              -test.timeout=2h ${PROVIDER_ARGS-} || rc=$?
          fi
          return "${rc}"
        }

        ${TEST_COMMAND}

    # Runs an install.
    # Stages the cluster profile into the openshift-ansible test/ci tree, runs
    # the launch playbook, grants cluster-admin to the ci-test user, logs in as
    # that user and publishes the resulting kubeconfig on the shared volume as
    # admin.kubeconfig. On any failure the exit marker is created so the other
    # containers stop waiting.
    - name: setup
      image: ${IMAGE_ANSIBLE}
      terminationMessagePolicy: FallbackToLogsOnError
      volumeMounts:
      - name: shared-tmp
        mountPath: /tmp/shared
      - name: shared-inventory
        mountPath: /usr/share/ansible/openshift-ansible/test/ci/inventory
      - name: artifacts
        mountPath: /tmp/artifacts
      - name: cluster-profile
        mountPath: /usr/local/e2e-aws-cluster-profile
      env:
      - name: ARTIFACT_DIR
        value: /tmp/artifacts
      - name: INSTANCE_PREFIX
        value: ${NAMESPACE}-${JOB_NAME_HASH}
      - name: TYPE
        value: ${CLUSTER_TYPE}
      - name: HOME
        value: /tmp
      - name: RPM_REPO_OPENSHIFT_ORIGIN
        value: ${RPM_REPO_OPENSHIFT_ORIGIN}
      command:
      - /usr/local/bin/entrypoint-provider
      args:
      - /bin/bash
      - -c
      - |
        #!/bin/bash
        set -euo pipefail

        # only a failed install leaves the exit marker; a successful one is
        # signalled by admin.kubeconfig instead
        trap 'rc=$?; if [[ $rc -ne 0 ]]; then touch /tmp/shared/exit; fi; exit $rc' EXIT
        trap 'jobs -p | xargs -r kill || true; exit 0' TERM

        cd /usr/share/ansible/openshift-ansible/
        mkdir -p test/ci/inventory/group_vars/OSEv3
        echo "localhost ansible_connection=local" > test/ci/inventory/hosts
        # NOTE(review): this lands as test/ci/vars.yaml, while the teardown
        # container copies the same file to test/ci/vars.yml - confirm the
        # playbooks accept both names
        cp -f /usr/local/e2e-aws-cluster-profile/vars.yaml test/ci/
        cp -f /usr/local/e2e-aws-cluster-profile/vars-origin.yaml test/ci/inventory/group_vars/OSEv3

        # -p keeps set -e from aborting the install if the directory exists
        mkdir -p ~/.ssh
        cp -f /usr/local/e2e-aws-cluster-profile/ssh-privatekey ~/.ssh/id_rsa
        chmod 0600 ~/.ssh/id_rsa
        export BOTO_CONFIG=/usr/local/e2e-aws-cluster-profile/.awscred

        ansible-playbook -vvv \
          -e "openshift_test_repo=${RPM_REPO_OPENSHIFT_ORIGIN}" \
          -i test/ci/inventory \
          test/ci/launch.yml

        # Create a new cluster-admin user
        ansible -i test/ci/inventory "masters[0]" -m command \
          -a "oc adm policy add-cluster-role-to-user cluster-admin ci-test"
        # Fetch auth URL
        ansible -i test/ci/inventory "masters[0]" -m shell \
          -a 'echo "https://{{ openshift_master_default_subdomain }}:{{ openshift_master_api_port | default(8443) }}" > /tmp/auth_url'
        ansible -i test/ci/inventory "masters[0]" -m fetch \
          -a 'src=/tmp/auth_url dest=/tmp/ flat=true'
        # Login, retrying up to six times with a 5 second pause in between
        logged_in=false
        for _ in {0..5}; do
          if oc login "$(cat /tmp/auth_url)" --insecure-skip-tls-verify --username=ci-test --password=ci; then
            logged_in=true
            break
          fi
          sleep 5
        done
        # fail with a clear message rather than on the kubeconfig copy below
        if [[ "$logged_in" != "true" ]]; then
          echo "Unable to log in to the cluster as ci-test" >&2
          exit 1
        fi
        # Copy kubeconfig
        cp "$HOME/.kube/config" /tmp/shared/admin.kubeconfig

    # Performs cleanup of all created resources.
    # The script idles until the exit marker appears on the shared volume (or
    # 180 one-minute polls have passed, or TERM is received) and then, from its
    # EXIT trap, gathers cluster artifacts on a best-effort basis and runs the
    # deprovision playbook.
    - name: teardown
      image: ${IMAGE_ANSIBLE}
      terminationMessagePolicy: FallbackToLogsOnError
      volumeMounts:
      - name: shared-tmp
        mountPath: /tmp/shared
      - name: shared-inventory
        mountPath: /usr/share/ansible/openshift-ansible/test/ci/inventory
      - name: artifacts
        mountPath: /tmp/artifacts
      - name: cluster-profile
        mountPath: /usr/local/e2e-aws-cluster-profile
      env:
      - name: ARTIFACT_DIR
        value: /tmp/artifacts
      - name: INSTANCE_PREFIX
        value: ${NAMESPACE}-${JOB_NAME_HASH}
      - name: TYPE
        value: ${CLUSTER_TYPE}
      command:
      - /usr/local/bin/entrypoint-provider
      args:
      - /bin/bash
      - -c
      - |
        #!/bin/bash
        set -eo pipefail

        # Gathers node, pod and API server artifacts into ARTIFACT_DIR and then
        # deprovisions the cluster. Errors are ignored (set +e) so that a failed
        # gather step never prevents the deprovision from running.
        function teardown() {
          set +e
          touch /tmp/shared/exit
          echo "Gathering artifacts ..."
          export KUBECONFIG=/tmp/shared/admin.kubeconfig
          mkdir -p "${ARTIFACT_DIR}/pods" "${ARTIFACT_DIR}/nodes" "${ARTIFACT_DIR}/metrics"

          # /tmp/nodes: one node name per line
          # /tmp/containers: "-n <ns> <pod> -c <container>" per line
          # /tmp/pods-api: "-n <ns> <pod>" per line for pods labelled openshift.io/component=api
          oc --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodes
          oc --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > /tmp/containers
          oc --request-timeout=5s get nodes -o json > "${ARTIFACT_DIR}/nodes.json"
          oc --request-timeout=5s get events --all-namespaces -o json > "${ARTIFACT_DIR}/events.json"
          oc --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/pods-api

          # gather nodes first in parallel since they may contain the most relevant debugging info
          while IFS= read -r i; do
            mkdir -p "${ARTIFACT_DIR}/nodes/$i"
            (
            # each background subshell gets its own journal listing; a single
            # shared file would be overwritten by the other nodes' subshells
            journals="/tmp/journals-$i"
            oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/logs/messages" | gzip -c > "${ARTIFACT_DIR}/nodes/$i/messages.gz"
            oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/logs/journal" | sed -e 's|.*href="\(.*\)".*|\1|;t;d' > "$journals"
            # every listed entry is written to the same journal.gz, so the last one wins
            while IFS= read -r j; do
              oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/logs/journal/${j}system.journal" | gzip -c > "${ARTIFACT_DIR}/nodes/$i/journal.gz"
            done < "$journals"
            oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/metrics" | gzip -c > "${ARTIFACT_DIR}/metrics/node-$i.gz"
            oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/debug/pprof/heap" > "${ARTIFACT_DIR}/nodes/$i/heap"
            oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/logs/secure" | gzip -c > "${ARTIFACT_DIR}/nodes/$i/secure.gz"
            oc get --request-timeout=20s --raw "/api/v1/nodes/$i/proxy/logs/audit" | gzip -c > "${ARTIFACT_DIR}/nodes/$i/audit.gz"
            ) &
          done < /tmp/nodes

          # $i is left unquoted on purpose below: each line carries several
          # arguments ("-n <ns> <pod> ...") that must be split into words
          while IFS= read -r i; do
            file="$( echo "$i" | cut -d ' ' -f 3 | tr -s ' ' '_' )"
            oc exec $i -- /bin/bash -c 'oc get --raw /debug/pprof/heap --server "https://$( hostname ):8443" --config /etc/origin/master/admin.kubeconfig' > "${ARTIFACT_DIR}/metrics/${file}-heap"
            oc exec $i -- /bin/bash -c 'oc get --raw /metrics --server "https://$( hostname ):8443" --config /etc/origin/master/admin.kubeconfig' | gzip -c > "${ARTIFACT_DIR}/metrics/${file}-api.gz"
            oc exec $i -- /bin/bash -c 'oc get --raw /debug/pprof/heap --server "https://$( hostname ):8444" --config /etc/origin/master/admin.kubeconfig' > "${ARTIFACT_DIR}/metrics/${file}-controllers-heap"
            oc exec $i -- /bin/bash -c 'oc get --raw /metrics --server "https://$( hostname ):8444" --config /etc/origin/master/admin.kubeconfig' | gzip -c > "${ARTIFACT_DIR}/metrics/${file}-controllers.gz"
          done < /tmp/pods-api

          while IFS= read -r i; do
            file="$( echo "$i" | cut -d ' ' -f 2,3,5 | tr -s ' ' '_' )"
            oc logs --request-timeout=20s $i | gzip -c > "${ARTIFACT_DIR}/pods/${file}.log.gz"
            oc logs --request-timeout=20s -p $i | gzip -c > "${ARTIFACT_DIR}/pods/${file}_previous.log.gz"
          done < /tmp/containers

          echo "Waiting for node logs to finish ..."
          wait

          echo "Deprovisioning cluster ..."
          mkdir -p ~/.ssh
          cp -f /usr/local/e2e-aws-cluster-profile/ssh-privatekey ~/.ssh/id_rsa
          chmod 0600 ~/.ssh/id_rsa
          cp -f /usr/local/e2e-aws-cluster-profile/.awscred /etc/boto.cfg

          cd /usr/share/ansible/openshift-ansible/
          echo "localhost ansible_connection=local" > test/ci/inventory/hosts
          cp -f /usr/local/e2e-aws-cluster-profile/vars.yaml test/ci/vars.yml
          ansible-playbook -vvv -i test/ci/inventory test/ci/deprovision.yml
        }

        # teardown runs on every exit path, including the TERM handler's exit 0
        trap 'teardown' EXIT
        trap 'jobs -p | xargs -r kill || true; exit 0' TERM

        # poll once a minute, for at most 180 minutes, for the exit marker
        for i in $(seq 1 180); do
          if [[ -f /tmp/shared/exit ]]; then
            exit 0
          fi
          sleep 60 & wait
        done