Skip to content

Commit

Permalink
DRY out script cleanup code
Browse files Browse the repository at this point in the history
Almost every single cleanup function trapped on exit for our scripts was
taking the same actions in the same order. We can break that out into a
standalone function under `hack/lib` that contains a superset of all the
cleanup steps used, provided that we make sure each individual clean up
step can gracefully handle the case where it is not needed or does not
have any work to do.

Signed-off-by: Steve Kuznetsov <skuznets@redhat.com>
  • Loading branch information
stevekuznetsov committed May 10, 2017
1 parent 89087e7 commit b8c8683
Show file tree
Hide file tree
Showing 17 changed files with 220 additions and 312 deletions.
160 changes: 149 additions & 11 deletions hack/lib/cleanup.sh
Expand Up @@ -3,18 +3,79 @@
# This library holds functions that are used to clean up local
# system state after other scripts have run.

# os::cleanup::all will clean up all of the processes and data that
# a script leaves around after running. All of the sub-tasks called
# from this function should gracefully handle when they do not need
# to do anything.
#
# Globals:
# - ARTIFACT_DIR
# - SKIP_TEARDOWN
# - SKIP_IMAGE_CLEANUP
# Arguments:
# 1 - return code of the script
# Returns:
# None
function os::cleanup::all() {
	local script_exit_code="$1"

	# Every step below is best-effort: one failing step must not
	# prevent the remaining steps from running, so errexit is
	# disabled for the rest of the handler.
	set +o errexit

	os::test::junit::generate_report

	os::log::info "[CLEANUP] Beginning cleanup routines..."
	# Harvest diagnostics first, while the cluster state still exists.
	os::cleanup::dump_events
	os::cleanup::dump_etcd
	os::cleanup::dump_container_logs
	os::cleanup::dump_pprof_output
	os::cleanup::find_cache_alterations
	os::cleanup::truncate_large_logs

	# Tear down processes and data only when the caller did not ask
	# to keep them around (e.g. for post-mortem debugging).
	if [[ -z "${SKIP_TEARDOWN:-}" ]]; then
		os::cleanup::containers
		os::cleanup::processes
		os::cleanup::prune_etcd
	fi
	os::util::describe_return_code "${script_exit_code}"
}
readonly -f os::cleanup::all

# os::cleanup::dump_etcd dumps the full contents of etcd to a file.
# It is a no-op when no API server endpoint was configured, so it is
# safe to call from scripts that never started a server.
#
# Globals:
#  - ARTIFACT_DIR
#  - API_SCHEME
#  - API_HOST
#  - ETCD_PORT
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::dump_etcd() {
	# The pasted diff had merged the old unconditional dump with the new
	# guarded one, dumping twice; only the guarded dump is correct.
	# ':-' defaults keep the test safe under `set -o nounset`.
	if [[ -n "${API_SCHEME:-}" && -n "${API_HOST:-}" && -n "${ETCD_PORT:-}" ]]; then
		os::log::info "[CLEANUP] Dumping etcd contents to $( os::util::repository_relative_path "${ARTIFACT_DIR}/etcd_dump.json" )"
		os::util::curl_etcd "/v2/keys/?recursive=true" > "${ARTIFACT_DIR}/etcd_dump.json"
	fi
}
readonly -f os::cleanup::dump_etcd

# os::cleanup::prune_etcd removes the etcd data store from disk.
# A no-op when no etcd data directory was ever configured.
#
# Globals:
#  - ETCD_DATA_DIR
#  - USE_SUDO
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::prune_etcd() {
	# Guard clause: nothing to prune if no data directory is set.
	if [[ -z "${ETCD_DATA_DIR:-}" ]]; then
		return
	fi
	os::log::info "[CLEANUP] Pruning etcd data directory"
	${USE_SUDO:+sudo} rm -rf "${ETCD_DATA_DIR}"
}
readonly -f os::cleanup::prune_etcd

# os::cleanup::containers operates on our containers to stop the containers
# and optionally remove the containers and any volumes they had attached.
Expand All @@ -27,11 +88,11 @@ function os::cleanup::dump_etcd() {
# None
function os::cleanup::containers() {
if ! os::util::find::system_binary docker >/dev/null 2>&1; then
os::log::warning "No \`docker\` binary found, skipping container cleanup."
os::log::warning "[CLEANUP] No \`docker\` binary found, skipping container cleanup."
return
fi

os::log::info "Stopping docker containers"
os::log::info "[CLEANUP] Stopping docker containers"
for id in $( os::cleanup::internal::list_our_containers ); do
os::log::debug "Stopping ${id}"
docker stop "${id}" >/dev/null
Expand All @@ -41,7 +102,7 @@ function os::cleanup::containers() {
return
fi

os::log::info "Removing docker containers"
os::log::info "[CLEANUP] Removing docker containers"
for id in $( os::cleanup::internal::list_our_containers ); do
os::log::debug "Removing ${id}"
docker stop "${id}" >/dev/null
Expand All @@ -60,14 +121,14 @@ readonly -f os::cleanup::containers
# None
function os::cleanup::dump_container_logs() {
if ! os::util::find::system_binary docker >/dev/null 2>&1; then
os::log::warning "No \`docker\` binary found, skipping container cleanup."
os::log::warning "[CLEANUP] No \`docker\` binary found, skipping container log harvest."
return
fi

local container_log_dir="${LOG_DIR}/containers"
mkdir -p "${container_log_dir}"

os::log::info "Dumping container logs to ${container_log_dir}"
os::log::info "[CLEANUP] Dumping container logs to $( os::util::repository_relative_path "${container_log_dir}" )"
for id in $( os::cleanup::internal::list_our_containers ); do
local name; name="$( docker inspect --format '{{ .Name }}' "${id}" )"
os::log::debug "Dumping logs for ${id} to ${name}.log"
Expand Down Expand Up @@ -138,6 +199,7 @@ readonly -f os::cleanup::internal::list_containers
# Returns:
# None
function os::cleanup::tmpdir() {
os::log::info "[CLEANUP] Cleaning up temporary directories"
# ensure that the directories are clean
if os::util::find::system_binary "findmnt" &>/dev/null; then
for target in $( ${USE_SUDO:+sudo} findmnt --output TARGET --list ); do
Expand All @@ -163,11 +225,87 @@ readonly -f os::cleanup::tmpdir
# Returns:
# None
# The pasted diff had merged old and new lines here, leaving two `if`
# lines for a single `fi` (a syntax error) and duplicated log/oc calls;
# this is the reconstructed post-image of the function.
function os::cleanup::dump_events() {
	os::log::info "[CLEANUP] Dumping cluster events to $( os::util::repository_relative_path "${ARTIFACT_DIR}/events.txt" )"
	local kubeconfig
	if [[ -n "${ADMIN_KUBECONFIG:-}" ]]; then
		kubeconfig="--config=${ADMIN_KUBECONFIG}"
	fi
	# ${kubeconfig:-} is intentionally unquoted so that an unset flag
	# expands to zero arguments rather than an empty-string argument.
	oc get events --all-namespaces ${kubeconfig:-} > "${ARTIFACT_DIR}/events.txt" 2>&1
}
readonly -f os::cleanup::dump_events

# os::cleanup::find_cache_alterations pulls information out of the
# server log so that we can get failure management in Jenkins to
# highlight it and really have it smack people in their logs. This is
# a severe correctness problem.
#
# Globals:
#  - LOG_DIR
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::find_cache_alterations() {
	# A hit is purely informational and no hit at all is fine, so the
	# exit status of grep is deliberately ignored.
	grep --recursive --text --context=5 "CACHE.*ALTERED" "${LOG_DIR}" || true
}
readonly -f os::cleanup::find_cache_alterations

# os::cleanup::dump_pprof_output dumps profiling output for the
# `openshift` binary, if any profiling data was written to cpu.pprof
# in the current working directory.
#
# Globals:
#  - LOG_DIR
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::dump_pprof_output() {
	# Guard clauses: require a usable `go tool pprof` ...
	if ! go tool -n pprof >/dev/null 2>&1; then
		return
	fi
	# ... and a non-empty profile to render.
	if [[ ! -s cpu.pprof ]]; then
		return
	fi
	os::log::info "[CLEANUP] \`pprof\` output logged to $( os::util::repository_relative_path "${LOG_DIR}/pprof.out" )"
	go tool pprof -text "./_output/local/bin/$(os::util::host_platform)/openshift" cpu.pprof >"${LOG_DIR}/pprof.out" 2>&1
}
readonly -f os::cleanup::dump_pprof_output

# os::cleanup::truncate_large_logs truncates very large files under
# $LOG_DIR and $ARTIFACT_DIR so we do not upload them to cloud storage
# after CI runs. Only the last ${max_file_size} of each oversized log
# is kept, behind a banner explaining the truncation.
#
# Globals:
#  - LOG_DIR
#  - ARTIFACT_DIR
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::truncate_large_logs() {
	local max_file_size="100M"
	os::log::info "[CLEANUP] Truncating log files over ${max_file_size}"
	local file
	# NUL-delimited traversal instead of `for file in $(find ...)` so
	# that paths containing whitespace are handled correctly.
	while IFS= read -r -d '' file; do
		mv "${file}" "${file}.tmp"
		echo "LOGFILE TOO LONG ($(du -h "${file}.tmp")), PREVIOUS BYTES TRUNCATED. LAST ${max_file_size} OF LOGFILE:" > "${file}"
		tail -c "${max_file_size}" "${file}.tmp" >> "${file}"
		rm "${file}.tmp"
	done < <(find "${ARTIFACT_DIR}" "${LOG_DIR}" -type f -name '*.log' -size "+${max_file_size}" -print0)
}
readonly -f os::cleanup::truncate_large_logs

# os::cleanup::processes kills all background processes created by
# the test script, children first, then the jobs themselves.
#
# Globals:
#  - USE_SUDO
# Arguments:
#  None
# Returns:
#  None
function os::cleanup::processes() {
	os::log::info "[CLEANUP] Killing child processes"
	local running_jobs
	running_jobs="$( jobs -pr )"
	local job_pid
	for job_pid in ${running_jobs}; do
		# Reap grandchildren before their parent job.
		local child_pid
		for child_pid in $( pgrep -P "${job_pid}" ); do
			${USE_SUDO:+sudo} kill "${child_pid}" &> /dev/null
		done
		${USE_SUDO:+sudo} kill "${job_pid}" &> /dev/null
	done
}
readonly -f os::cleanup::processes
2 changes: 2 additions & 0 deletions hack/lib/init.sh
Expand Up @@ -9,6 +9,8 @@ set -o errexit
set -o nounset
set -o pipefail

OS_SCRIPT_START_TIME="$( date +%s )"; export OS_SCRIPT_START_TIME

# os::util::absolute_path returns the absolute path to the directory provided
function os::util::absolute_path() {
local relative_path="$1"
Expand Down
15 changes: 7 additions & 8 deletions hack/lib/util/misc.sh
Expand Up @@ -13,12 +13,7 @@
# None
# os::util::describe_return_code logs a one-line summary of how the
# script exited and, when OS_SCRIPT_START_TIME is set, how long it ran.
# Logged at info level on success, error level otherwise.
#
# Globals:
#  - OS_SCRIPT_START_TIME
# Arguments:
#  1 - return code of the script
# Returns:
#  None
# NOTE(review): the pasted diff merged old `echo -n` branches with the
# new `message` logic, and one line was hidden behind a collapsed diff
# region; the `end_time` assignment below is reconstructed -- confirm
# against the upstream commit.
function os::util::describe_return_code() {
	local return_code=$1
	local message="$0 exited with code ${return_code} "

	if [[ -n "${OS_SCRIPT_START_TIME:-}" ]]; then
		local end_time
		end_time="$( date +%s )"
		local elapsed_time
		elapsed_time="$(( end_time - OS_SCRIPT_START_TIME ))"
		local formatted_time
		formatted_time="$( os::util::format_seconds "${elapsed_time}" )"
		message+="after ${formatted_time}"
	fi

	if [[ "${return_code}" = "0" ]]; then
		os::log::info "${message}"
	else
		os::log::error "${message}"
	fi
}
readonly -f os::util::describe_return_code
Expand Down
38 changes: 5 additions & 33 deletions hack/test-cmd.sh
Expand Up @@ -2,41 +2,14 @@

# This command checks that the built commands can function together for
# simple scenarios. It does not require Docker so it can run in travis.
STARTTIME=$(date +%s)
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
os::util::environment::setup_time_vars

function cleanup()
{
out=$?
set +e

os::cleanup::dump_etcd

pkill -P $$
kill_all_processes

# pull information out of the server log so that we can get failure management in jenkins to highlight it and
# really have it smack people in their logs. This is a severe correctness problem
grep -a5 "CACHE.*ALTERED" ${LOG_DIR}/openshift.log

# we keep a JSON dump of etcd data so we do not need to keep the binary store
local sudo="${USE_SUDO:+sudo}"
${sudo} rm -rf "${ETCD_DATA_DIR}"

if go tool -n pprof >/dev/null 2>&1; then
os::log::debug "\`pprof\` output logged to ${LOG_DIR}/pprof.out"
go tool pprof -text "./_output/local/bin/$(os::util::host_platform)/openshift" cpu.pprof >"${LOG_DIR}/pprof.out" 2>&1
fi

os::test::junit::generate_oscmd_report

ENDTIME=$(date +%s); echo "$0 took $(($ENDTIME - $STARTTIME)) seconds"
os::log::info "Exiting with ${out}"
exit $out
# cleanup is the EXIT trap handler: run all shared cleanup routines,
# then exit with the status the script was about to exit with.
function cleanup() {
	# Capture the exit status first -- and keep it out of the global
	# namespace -- before any command here can clobber $?.
	local return_code=$?
	os::cleanup::all "${return_code}"
	exit "${return_code}"
}

trap "exit" INT TERM
trap "cleanup" EXIT

function find_tests() {
Expand All @@ -52,8 +25,7 @@ function find_tests() {
done

if [[ "${#selected_tests[@]}" -eq 0 ]]; then
os::log::error "No tests were selected due to invalid regex."
return 1
os::log::fatal "No tests were selected due to invalid regex."
else
echo "${selected_tests[@]}"
fi
Expand Down
41 changes: 5 additions & 36 deletions hack/test-end-to-end-docker.sh
Expand Up @@ -2,7 +2,6 @@

# This script tests the high level end-to-end functionality demonstrated
# as part of the examples/sample-app
STARTTIME=$(date +%s)
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"

os::log::info "Starting containerized end-to-end test"
Expand All @@ -19,42 +18,12 @@ if [[ -n "${JUNIT_REPORT:-}" ]]; then
export JUNIT_REPORT_OUTPUT="${LOG_DIR}/raw_test_output.log"
fi

function cleanup()
{
out=$?
echo
if [ $out -ne 0 ]; then
echo "[FAIL] !!!!! Test Failed !!!!"
else
os::log::info "Test Succeeded"
fi
echo

set +e
os::cleanup::dump_container_logs

# pull information out of the server log so that we can get failure management in jenkins to highlight it and
# really have it smack people in their logs. This is a severe correctness problem
grep -ra5 "CACHE.*ALTERED" ${LOG_DIR}/containers

os::cleanup::dump_etcd

os::cleanup::dump_events

if [[ -z "${SKIP_TEARDOWN-}" ]]; then
os::cleanup::containers
fi

truncate_large_logs
os::test::junit::generate_oscmd_report
set -e

os::log::info "Exiting"
ENDTIME=$(date +%s); echo "$0 took $(($ENDTIME - $STARTTIME)) seconds"
exit $out
# cleanup is the EXIT trap handler: run all shared cleanup routines,
# then exit with the status the script was about to exit with.
function cleanup() {
	# Capture the exit status first -- and keep it out of the global
	# namespace -- before any command here can clobber $?.
	local return_code=$?
	os::cleanup::all "${return_code}"
	exit "${return_code}"
}

trap "cleanup" EXIT INT TERM
trap "cleanup" EXIT

os::log::system::start

Expand Down

0 comments on commit b8c8683

Please sign in to comment.