Skip to content

Commit

Permalink
DRY out script cleanup code
Browse files Browse the repository at this point in the history
Almost every single cleanup function trapped on exit for our scripts was
taking the same actions in the same order. We can break that out into a
standalone function under `hack/lib` that contains a superset of all the
cleanup steps used, provided that we make sure each individual clean up
step can gracefully handle the case where it is not needed or does not
have any work to do.

Signed-off-by: Steve Kuznetsov <skuznets@redhat.com>
  • Loading branch information
stevekuznetsov committed May 10, 2017
1 parent 89087e7 commit b8c8683
Show file tree
Hide file tree
Showing 17 changed files with 220 additions and 312 deletions.
160 changes: 149 additions & 11 deletions hack/lib/cleanup.sh
Expand Up @@ -3,18 +3,79 @@
# This library holds functions that are used to clean up local
# system state after other scripts have run.

# os::cleanup::all will clean up all of the processes and data that
# a script leaves around after running. All of the sub-tasks called
# from this function should gracefully handle when they do not need
# to do anything.
#
# Globals:
# - ARTIFACT_DIR
# - SKIP_TEARDOWN
# - SKIP_IMAGE_CLEANUP
# Arguments:
# 1 - return code of the script
# Returns:
# None
function os::cleanup::all() {
	local script_exit_code="$1"

	# Every step below is best-effort: one failing step must not
	# prevent the remaining steps from running, so errexit is
	# disabled for the rest of the handler.
	set +o errexit

	os::test::junit::generate_report

	os::log::info "[CLEANUP] Beginning cleanup routines..."
	# Harvest diagnostics first, while the cluster state still exists.
	os::cleanup::dump_events
	os::cleanup::dump_etcd
	os::cleanup::dump_container_logs
	os::cleanup::dump_pprof_output
	os::cleanup::find_cache_alterations
	os::cleanup::truncate_large_logs

	# Tear down processes and data only when the caller did not ask
	# to keep them around (e.g. for post-mortem debugging).
	if [[ -z "${SKIP_TEARDOWN:-}" ]]; then
		os::cleanup::containers
		os::cleanup::processes
		os::cleanup::prune_etcd
	fi
	os::util::describe_return_code "${script_exit_code}"
}
readonly -f os::cleanup::all

# os::cleanup::dump_etcd dumps the full contents of etcd to a file.
# It is a no-op when no API server endpoint was configured, so it is
# safe to call from scripts that never started a server.
#
# Globals:
#  - ARTIFACT_DIR
#  - API_SCHEME
#  - API_HOST
#  - ETCD_PORT
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::dump_etcd() {
	# The pasted diff had merged the old unconditional dump with the new
	# guarded one, dumping twice; only the guarded dump is correct.
	# ':-' defaults keep the test safe under `set -o nounset`.
	if [[ -n "${API_SCHEME:-}" && -n "${API_HOST:-}" && -n "${ETCD_PORT:-}" ]]; then
		os::log::info "[CLEANUP] Dumping etcd contents to $( os::util::repository_relative_path "${ARTIFACT_DIR}/etcd_dump.json" )"
		os::util::curl_etcd "/v2/keys/?recursive=true" > "${ARTIFACT_DIR}/etcd_dump.json"
	fi
}
readonly -f os::cleanup::dump_etcd

# os::cleanup::prune_etcd removes the etcd data store from disk.
# A no-op when no etcd data directory was ever configured.
#
# Globals:
#  - ETCD_DATA_DIR
#  - USE_SUDO
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::prune_etcd() {
	# Guard clause: nothing to prune if no data directory is set.
	if [[ -z "${ETCD_DATA_DIR:-}" ]]; then
		return
	fi
	os::log::info "[CLEANUP] Pruning etcd data directory"
	${USE_SUDO:+sudo} rm -rf "${ETCD_DATA_DIR}"
}
readonly -f os::cleanup::prune_etcd

# os::cleanup::containers operates on our containers to stop the containers
# and optionally remove the containers and any volumes they had attached.
Expand All @@ -27,11 +88,11 @@ function os::cleanup::dump_etcd() {
# None
function os::cleanup::containers() {
if ! os::util::find::system_binary docker >/dev/null 2>&1; then
os::log::warning "No \`docker\` binary found, skipping container cleanup."
os::log::warning "[CLEANUP] No \`docker\` binary found, skipping container cleanup."
return
fi

os::log::info "Stopping docker containers"
os::log::info "[CLEANUP] Stopping docker containers"
for id in $( os::cleanup::internal::list_our_containers ); do
os::log::debug "Stopping ${id}"
docker stop "${id}" >/dev/null
Expand All @@ -41,7 +102,7 @@ function os::cleanup::containers() {
return
fi

os::log::info "Removing docker containers"
os::log::info "[CLEANUP] Removing docker containers"
for id in $( os::cleanup::internal::list_our_containers ); do
os::log::debug "Removing ${id}"
docker stop "${id}" >/dev/null
Expand All @@ -60,14 +121,14 @@ readonly -f os::cleanup::containers
# None
function os::cleanup::dump_container_logs() {
if ! os::util::find::system_binary docker >/dev/null 2>&1; then
os::log::warning "No \`docker\` binary found, skipping container cleanup."
os::log::warning "[CLEANUP] No \`docker\` binary found, skipping container log harvest."
return
fi

local container_log_dir="${LOG_DIR}/containers"
mkdir -p "${container_log_dir}"

os::log::info "Dumping container logs to ${container_log_dir}"
os::log::info "[CLEANUP] Dumping container logs to $( os::util::repository_relative_path "${container_log_dir}" )"
for id in $( os::cleanup::internal::list_our_containers ); do
local name; name="$( docker inspect --format '{{ .Name }}' "${id}" )"
os::log::debug "Dumping logs for ${id} to ${name}.log"
Expand Down Expand Up @@ -138,6 +199,7 @@ readonly -f os::cleanup::internal::list_containers
# Returns:
# None
function os::cleanup::tmpdir() {
os::log::info "[CLEANUP] Cleaning up temporary directories"
# ensure that the directories are clean
if os::util::find::system_binary "findmnt" &>/dev/null; then
for target in $( ${USE_SUDO:+sudo} findmnt --output TARGET --list ); do
Expand All @@ -163,11 +225,87 @@ readonly -f os::cleanup::tmpdir
# Returns:
# None
# The pasted diff had merged old and new lines here, leaving two `if`
# lines for a single `fi` (a syntax error) and duplicated log/oc calls;
# this is the reconstructed post-image of the function.
function os::cleanup::dump_events() {
	os::log::info "[CLEANUP] Dumping cluster events to $( os::util::repository_relative_path "${ARTIFACT_DIR}/events.txt" )"
	local kubeconfig
	if [[ -n "${ADMIN_KUBECONFIG:-}" ]]; then
		kubeconfig="--config=${ADMIN_KUBECONFIG}"
	fi
	# ${kubeconfig:-} is intentionally unquoted so that an unset flag
	# expands to zero arguments rather than an empty-string argument.
	oc get events --all-namespaces ${kubeconfig:-} > "${ARTIFACT_DIR}/events.txt" 2>&1
}
readonly -f os::cleanup::dump_events

# os::cleanup::find_cache_alterations pulls information out of the
# server log so that we can get failure management in Jenkins to
# highlight it and really have it smack people in their logs. This is
# a severe correctness problem.
#
# Globals:
#  - LOG_DIR
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::find_cache_alterations() {
	# A hit is purely informational and no hit at all is fine, so the
	# exit status of grep is deliberately ignored.
	grep --recursive --text --context=5 "CACHE.*ALTERED" "${LOG_DIR}" || true
}
readonly -f os::cleanup::find_cache_alterations

# os::cleanup::dump_pprof_output dumps profiling output for the
# `openshift` binary, if any profiling data was written to cpu.pprof
# in the current working directory.
#
# Globals:
#  - LOG_DIR
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::dump_pprof_output() {
	# Guard clauses: require a usable `go tool pprof` ...
	if ! go tool -n pprof >/dev/null 2>&1; then
		return
	fi
	# ... and a non-empty profile to render.
	if [[ ! -s cpu.pprof ]]; then
		return
	fi
	os::log::info "[CLEANUP] \`pprof\` output logged to $( os::util::repository_relative_path "${LOG_DIR}/pprof.out" )"
	go tool pprof -text "./_output/local/bin/$(os::util::host_platform)/openshift" cpu.pprof >"${LOG_DIR}/pprof.out" 2>&1
}
readonly -f os::cleanup::dump_pprof_output

# os::cleanup::truncate_large_logs truncates very large files under
# $LOG_DIR and $ARTIFACT_DIR so we do not upload them to cloud storage
# after CI runs. Only the last ${max_file_size} of each oversized log
# is kept, behind a banner explaining the truncation.
#
# Globals:
#  - LOG_DIR
#  - ARTIFACT_DIR
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::truncate_large_logs() {
	local max_file_size="100M"
	os::log::info "[CLEANUP] Truncating log files over ${max_file_size}"
	local file
	# NUL-delimited traversal instead of `for file in $(find ...)` so
	# that paths containing whitespace are handled correctly.
	while IFS= read -r -d '' file; do
		mv "${file}" "${file}.tmp"
		echo "LOGFILE TOO LONG ($(du -h "${file}.tmp")), PREVIOUS BYTES TRUNCATED. LAST ${max_file_size} OF LOGFILE:" > "${file}"
		tail -c "${max_file_size}" "${file}.tmp" >> "${file}"
		rm "${file}.tmp"
	done < <(find "${ARTIFACT_DIR}" "${LOG_DIR}" -type f -name '*.log' -size "+${max_file_size}" -print0)
}
readonly -f os::cleanup::truncate_large_logs

# os::cleanup::processes kills all background processes created by
# the test script, children first, then the jobs themselves.
#
# Globals:
#  - USE_SUDO
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::processes() {
	os::log::info "[CLEANUP] Killing child processes"
	local running_jobs
	running_jobs="$( jobs -pr )"
	local job_pid
	for job_pid in ${running_jobs}; do
		# Reap grandchildren before their parent job.
		local child_pid
		for child_pid in $( pgrep -P "${job_pid}" ); do
			${USE_SUDO:+sudo} kill "${child_pid}" &> /dev/null
		done
		${USE_SUDO:+sudo} kill "${job_pid}" &> /dev/null
	done
}
readonly -f os::cleanup::processes
2 changes: 2 additions & 0 deletions hack/lib/init.sh
Expand Up @@ -9,6 +9,8 @@ set -o errexit
set -o nounset
set -o pipefail

OS_SCRIPT_START_TIME="$( date +%s )"; export OS_SCRIPT_START_TIME

# os::util::absolute_path returns the absolute path to the directory provided
function os::util::absolute_path() {
local relative_path="$1"
Expand Down
15 changes: 7 additions & 8 deletions hack/lib/util/misc.sh
Expand Up @@ -13,12 +13,7 @@
# None
# os::util::describe_return_code logs a one-line summary of how the
# script exited and, when OS_SCRIPT_START_TIME is set, how long it ran.
# Logged at info level on success, error level otherwise.
#
# Globals:
#  - OS_SCRIPT_START_TIME
# Arguments:
#  1 - return code of the script
# Returns:
#  None
# NOTE(review): the pasted diff merged old `echo -n` branches with the
# new `message` logic, and one line was hidden behind a collapsed diff
# region; the `end_time` assignment below is reconstructed -- confirm
# against the upstream commit.
function os::util::describe_return_code() {
	local return_code=$1
	local message="$0 exited with code ${return_code} "

	if [[ -n "${OS_SCRIPT_START_TIME:-}" ]]; then
		local end_time
		end_time="$( date +%s )"
		local elapsed_time
		elapsed_time="$(( end_time - OS_SCRIPT_START_TIME ))"
		local formatted_time
		formatted_time="$( os::util::format_seconds "${elapsed_time}" )"
		message+="after ${formatted_time}"
	fi

	if [[ "${return_code}" = "0" ]]; then
		os::log::info "${message}"
	else
		os::log::error "${message}"
	fi
}
readonly -f os::util::describe_return_code
Expand Down
38 changes: 5 additions & 33 deletions hack/test-cmd.sh
Expand Up @@ -2,41 +2,14 @@

# This command checks that the built commands can function together for
# simple scenarios. It does not require Docker so it can run in travis.
STARTTIME=$(date +%s)
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
os::util::environment::setup_time_vars

function cleanup()
{
out=$?
set +e

os::cleanup::dump_etcd

pkill -P $$
kill_all_processes

# pull information out of the server log so that we can get failure management in jenkins to highlight it and
# really have it smack people in their logs. This is a severe correctness problem
grep -a5 "CACHE.*ALTERED" ${LOG_DIR}/openshift.log

# we keep a JSON dump of etcd data so we do not need to keep the binary store
local sudo="${USE_SUDO:+sudo}"
${sudo} rm -rf "${ETCD_DATA_DIR}"

if go tool -n pprof >/dev/null 2>&1; then
os::log::debug "\`pprof\` output logged to ${LOG_DIR}/pprof.out"
go tool pprof -text "./_output/local/bin/$(os::util::host_platform)/openshift" cpu.pprof >"${LOG_DIR}/pprof.out" 2>&1
fi

os::test::junit::generate_oscmd_report

ENDTIME=$(date +%s); echo "$0 took $(($ENDTIME - $STARTTIME)) seconds"
os::log::info "Exiting with ${out}"
exit $out
# cleanup is the EXIT trap handler: run all shared cleanup routines,
# then exit with the status the script was about to exit with.
function cleanup() {
	# Capture the exit status first -- and keep it out of the global
	# namespace -- before any command here can clobber $?.
	local return_code=$?
	os::cleanup::all "${return_code}"
	exit "${return_code}"
}

trap "exit" INT TERM
trap "cleanup" EXIT

function find_tests() {
Expand All @@ -52,8 +25,7 @@ function find_tests() {
done

if [[ "${#selected_tests[@]}" -eq 0 ]]; then
os::log::error "No tests were selected due to invalid regex."
return 1
os::log::fatal "No tests were selected due to invalid regex."
else
echo "${selected_tests[@]}"
fi
Expand Down
41 changes: 5 additions & 36 deletions hack/test-end-to-end-docker.sh
Expand Up @@ -2,7 +2,6 @@

# This script tests the high level end-to-end functionality demonstrated
# as part of the examples/sample-app
STARTTIME=$(date +%s)
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"

os::log::info "Starting containerized end-to-end test"
Expand All @@ -19,42 +18,12 @@ if [[ -n "${JUNIT_REPORT:-}" ]]; then
export JUNIT_REPORT_OUTPUT="${LOG_DIR}/raw_test_output.log"
fi

function cleanup()
{
out=$?
echo
if [ $out -ne 0 ]; then
echo "[FAIL] !!!!! Test Failed !!!!"
else
os::log::info "Test Succeeded"
fi
echo

set +e
os::cleanup::dump_container_logs

# pull information out of the server log so that we can get failure management in jenkins to highlight it and
# really have it smack people in their logs. This is a severe correctness problem
grep -ra5 "CACHE.*ALTERED" ${LOG_DIR}/containers

os::cleanup::dump_etcd

os::cleanup::dump_events

if [[ -z "${SKIP_TEARDOWN-}" ]]; then
os::cleanup::containers
fi

truncate_large_logs
os::test::junit::generate_oscmd_report
set -e

os::log::info "Exiting"
ENDTIME=$(date +%s); echo "$0 took $(($ENDTIME - $STARTTIME)) seconds"
exit $out
# cleanup is the EXIT trap handler: run all shared cleanup routines,
# then exit with the status the script was about to exit with.
function cleanup() {
	# Capture the exit status first -- and keep it out of the global
	# namespace -- before any command here can clobber $?.
	local return_code=$?
	os::cleanup::all "${return_code}"
	exit "${return_code}"
}

trap "cleanup" EXIT INT TERM
trap "cleanup" EXIT

os::log::system::start

Expand Down

0 comments on commit b8c8683

Please sign in to comment.