From b6963181cc2bfc3c9372807ccea363ead83bbf5a Mon Sep 17 00:00:00 2001 From: Michael Jennings Date: Fri, 15 Sep 2023 11:55:36 -0600 Subject: [PATCH] lbnl_cmd.nhc: Remove line numbers from dmesg check When using `check_cmd_dmesg()` directly (as written in `scripts/lbnl_cmd.nhc`) with a negated match string, the default behavior of `check_cmd_output()` (which `check_cmd_dmesg()` wraps) used for error reporting causes the "Reason" field to contain not only the match string that was found (and shouldn't have been) but also the **_line number_** where the match was found. In the case of `dmesg` output, the line number is almost completely useless; moreover, it prevents Slurm and other schedulers/RMs from being able to group all the affected nodes together -- because the line numbers almost always differ! Granted that users/admins can override the default failure message generation behavior (via `-M` entries, all of which are passed directly to `check_cmd_output()`), but in the specific case of `check_cmd_dmesg()`, I think the default behavior should suppress the line numbers and use a simpler, more concise message instead. This changeset does exactly that by adding a bit of pre-processing to the command-line arguments passed to `check_cmd_dmesg()` before passing them on to `check_cmd_output()`. Each match string (`-m` argument) that doesn't already have a corresponding message (`-M` argument) to override the default will have a new default provided to it that omits the extraneous information. In other words, any `-m`_`mstr`_ that already has a matching `-M`_`message`_ will be passed on to `check_cmd_output()` exactly as it is; any `-m`_`mstr`_ that _**lacks**_ a corresponding `-M`_`message`_ — or that has an **empty** _`message`_ as a placeholder — will be assigned a new `-M`_`message`_ that gets passed to `check_cmd_output()` without any line number or other dynamic information. Fixes #143. --- scripts/common.nhc | 4 +-- scripts/lbnl_cmd.nhc | 42 +++++++++++++++++++++------- test/nhc-test | 6 +++- test/test_common.nhc | 4 +-- test/test_lbnl_cmd.nhc | 63 ++++++++++++++++++++++++++++++++++-------- 5 files changed, 92 insertions(+), 27 deletions(-) diff --git a/scripts/common.nhc b/scripts/common.nhc index 1d91405..d3b7aee 100644 --- a/scripts/common.nhc +++ b/scripts/common.nhc @@ -705,9 +705,7 @@ function nhc_common_get_unix_time() { if [[ $SECONDS -lt 946684800 ]]; then NOW=$(date '+%s') - ELAPSED_SECONDS=$SECONDS - SECONDS=$((ELAPSED_SECONDS+NOW)) - NHC_START_TS=$((NOW-ELAPSED_SECONDS)) + (( ELAPSED_SECONDS=$SECONDS, SECONDS+=NOW, NHC_START_TS=NOW-ELAPSED_SECONDS )) fi } diff --git a/scripts/lbnl_cmd.nhc b/scripts/lbnl_cmd.nhc index 4d657b0..2a48d8f 100644 --- a/scripts/lbnl_cmd.nhc +++ b/scripts/lbnl_cmd.nhc @@ -17,9 +17,9 @@ function check_cmd_output() { case "$OPTION" in C) CHECKNAME="$OPTARG" ;; # Used for writing wrapper checks O) OUTPUT_VAR="$OPTARG" ;; # Used for writing wrapper checks - M) MSGS[${#MSGS[*]}]="$OPTARG" ;; # Used for writing wrapper checks + M) MSGS+=( "$OPTARG" ) ;; # Used for writing wrapper checks e) CMD="$OPTARG" ;; - m) MATCHES[${#MATCHES[*]}]="$OPTARG" ;; + m) MATCHES+=( "$OPTARG" ) ;; r) CMD_RETVAL="$OPTARG" ;; t) CMD_TIMEOUT="$OPTARG" ;; :) die 1 "$CHECKNAME: Option -$OPTARG requires an argument." ; return 1 ;; @@ -41,11 +41,14 @@ function check_cmd_output() { elif [[ -n "$CMD_TIMEOUT" && $CMD_TIMEOUT -gt $TIMEOUT ]]; then die 1 "$CHECKNAME: Command timeout $CMD_TIMEOUT exceeds NHC timeout $TIMEOUT" return 1 + elif (( ${#MATCHES[*]} < ${#MSGS[*]} )); then + die 1 "$CHECKNAME: Message count ($(( ${#MSGS[*]} / 2 ))) must not exceed match string count ($(( ${#MATCHES[*]} / 2 )))." + return 1 fi # Initialize a parallel array for tracking which matches have been found. for ((i = 0; i < ${#MATCHES[*]}; i++)); do - if [[ "${MATCHES[$i]:0:1}" = '!' ]]; then + if [[ "${MATCHES[$i]:0:1}" == '!' ]]; then MATCHED[$i]=1 ((NEG_MATCH_CNT++)) else @@ -74,7 +77,7 @@ function check_cmd_output() { # Iterate through $LINES[] array to gather process data. for ((LINENUM = 0; LINENUM <= ${#LINES[*]}; LINENUM++)); do for ((i = 0; i < ${#MATCHES[*]}; i++)); do - if [[ "${MATCHES[$i]:0:1}" = '!' ]]; then + if [[ "${MATCHES[$i]:0:1}" == '!' ]]; then # Match expression is negated. Fail if the line matches this expression. if mcheck "${LINES[$LINENUM]}" "${MATCHES[$i]:1}" ; then MSG=${MSGS[$i]:-\"%m\" matched at line %l of \"%c\"} @@ -85,7 +88,7 @@ function check_cmd_output() { die 1 "$CHECKNAME: $MSG." return 1 fi - elif [ ${MATCHED[$i]} -eq 0 ] && mcheck "${LINES[$LINENUM]}" "${MATCHES[$i]}" ; then + elif [[ ${MATCHED[$i]} -eq 0 ]] && mcheck "${LINES[$LINENUM]}" "${MATCHES[$i]}" ; then MATCHED[$i]=1 ((MATCH_CNT++)) if [[ $MATCH_CNT == ${#MATCHED[*]} ]]; then @@ -186,15 +189,17 @@ function check_cmd_status() { # Check dmesg output for matching lines. Fairly simple wrapper for check_cmd_output() (above). function check_cmd_dmesg() { - local CMD_TIMEOUT CMD_RETVAL CMD="dmesg" + local CMD="" CMD_TIMEOUT CMD_RETVAL OUTVAR local -a MATCHES MSGS + local -i i OPTIND=1 - while getopts ":M:e:m:r:t:" OPTION ; do + while getopts ":M:O:e:m:r:t:" OPTION ; do case "$OPTION" in - M) MSGS[${#MSGS[*]}]=-M ; MSGS[${#MSGS[*]}]="$OPTARG" ;; + M) MSGS+=( '-M' "$OPTARG" ) ;; + O) OUTVAR="$OPTARG" ;; e) CMD="$OPTARG" ;; - m) MATCHES[${#MATCHES[*]}]=-m ; MATCHES[${#MATCHES[*]}]="$OPTARG" ;; + m) MATCHES+=( '-m' "$OPTARG" ) ;; r) CMD_RETVAL="$OPTARG" ;; t) CMD_TIMEOUT="$OPTARG" ;; :) die 1 "$FUNCNAME: Option -$OPTARG requires an argument." ; return 1 ;; @@ -202,6 +207,23 @@ function check_cmd_dmesg() { esac done shift $((OPTIND-1)) + if [[ -z "$CMD" && $# -eq 0 ]]; then + CMD="dmesg" + fi + if (( (${#MATCHES[*]} % 2 != 0) || (${#MSGS[*]} % 2 != 0) )); then + die 1 "$FUNCNAME: Internal failure -- Both match string (${#MATCHES[*]}) and message (${#MSGS[*]}) list sizes must be even." + return 1 + fi - check_cmd_output -C $FUNCNAME ${CMD_RETVAL:+-r $CMD_RETVAL} ${CMD_TIMEOUT:+-t $CMD_TIMEOUT} -e "$CMD" "${MSGS[@]}" "${MATCHES[@]}" + for (( i = 1; i < ${#MATCHES[*]}; i += 2 )); do + if [[ -z "${MSGS[i]}" ]]; then + MSGS[i-1]='-M' + if [[ "${MATCHES[$i]:0:1}" == '!' ]]; then + MSGS[i]="dmesg output matched \"${MATCHES[$i]:1}\"" + else + MSGS[i]="dmesg output failed to match \"${MATCHES[$i]}\"" + fi + fi + done + check_cmd_output -C "$FUNCNAME" ${OUTVAR:+-O $OUTVAR} ${CMD_RETVAL:+-r $CMD_RETVAL} ${CMD_TIMEOUT:+-t $CMD_TIMEOUT} ${CMD:+-e "$CMD"} "${MSGS[@]}" "${MATCHES[@]}" "$@" } diff --git a/test/nhc-test b/test/nhc-test index 617716d..0a102a2 100755 --- a/test/nhc-test +++ b/test/nhc-test @@ -196,9 +196,13 @@ function test_nhc_driver() { } function test_checks() { - local CHECKTEST + local CHECKTEST DIE_MSG="" DIE_RC="" local -a TEST_SCRIPTS + # Instead of die() being empty, have it save the message and return code + # into global variables that can be referenced by our unit tests. + function die() { DIE_RC="$1"; DIE_MSG="${*:2}"; return $DIE_RC; } + set +f TEST_SCRIPTS=( $SRCDIR/test_*.nhc ) set -f diff --git a/test/test_common.nhc b/test/test_common.nhc index 2c78849..93910c6 100644 --- a/test/test_common.nhc +++ b/test/test_common.nhc @@ -367,8 +367,8 @@ plan $((22+11+11+13+11+28+9+7+15+6+7+4+6+18+7+19+2+8)) "common.nhc" && { OLDSECS=$SECONDS cmp_ok "$SECONDS" "-lt" "60" 'Initial $SECONDS value should be small (sub-1min)' nhc_common_get_unix_time - ok '(( (SECONDS == UNIXTIME+OLDSECS) || (SECONDS == UNIXTIME+OLDSECS+1) ))' \ - "\$SECONDS ($SECONDS) should now reflect \`time_t\` value (roughly $UNIXTIME +/- $OLDSECS)" + ok '(( (SECONDS <= UNIXTIME+OLDSECS+5) && (SECONDS >= UNIXTIME+OLDSECS-5) ))' \ + "\$SECONDS ($SECONDS) should now reflect \`time_t\` value (roughly $UNIXTIME+$OLDSECS +/- 5s)" if [[ $- == *x* ]]; then # There seems to be a bash bug. It doesn't handle process substitution diff --git a/test/test_lbnl_cmd.nhc b/test/test_lbnl_cmd.nhc index 3413eee..1bcefb8 100644 --- a/test/test_lbnl_cmd.nhc +++ b/test/test_lbnl_cmd.nhc @@ -1,34 +1,75 @@ # Tests for lbnl_cmd.nhc -plan $((2+9+2)) "lbnl_cmd.nhc" && { +plan $((3+14+14+2)) "lbnl_cmd.nhc" && { is "`type -t check_cmd_output 2>&1`" 'function' 'check_cmd_output() loaded properly' is "`type -t check_cmd_status 2>&1`" 'function' 'check_cmd_status() loaded properly' + is "`type -t check_cmd_dmesg 2>&1`" 'function' 'check_cmd_dmesg() loaded properly' # Needed to bypass sanity check on timeout values. export TIMEOUT=99999 # Bash 3.0 (RHEL4) apparently has a bug in handling "${ARRAY[@]}" expansion, so we have to skip these. if [[ "${BASH_VERSINFO[0]}" == "3" && "${BASH_VERSINFO[1]}" -lt 1 ]]; then - skip 9 + skip $((13+14)) else check_cmd_output -t 1 -m "/Test/" echo Test - is $? 0 "Simple command output match" + is $? 0 "Simple command output match [$DIE_MSG]" check_cmd_output -t 1 -m "/line1/" -m "/line2/" -m "/line3/" echo -e "line1\nline2\nline3" - is $? 0 "Multiple command output match, 3 positive (success)" + is $? 0 "Multiple command output match, 3 positive (success) [$DIE_MSG]" check_cmd_output -t 1 -m "/line1/" -m "/line2/" -m "/line3/" echo -e "line4\nline5\nline6" - is $? 1 "Multiple command output match, 3 positive (failure)" + is $? 1 "Multiple command output match, 3 positive (failure) [$DIE_MSG]" check_cmd_output -t 1 -m "!/line4/" -m "/line2/" -m "/line3/" echo -e "line1\nline2\nline3" - is $? 0 "Multiple command output match, 2 positive, 1 negative (success)" + is $? 0 "Multiple command output match, 2 positive, 1 negative (success) [$DIE_MSG]" check_cmd_output -t 1 -m "!/line1/" -m "/line2/" -m "/line3/" echo -e "line1\nline2\nline3" - is $? 1 "Multiple command output match, 2 positive, 1 negative (negative failure)" + is $? 1 "Multiple command output match, 2 positive, 1 negative (negative failure) [$DIE_MSG]" check_cmd_output -t 1 -m "!/line4/" -m "/line5/" -m "/line3/" echo -e "line1\nline2\nline3" - is $? 1 "Multiple command output match, 2 positive, 1 negative (positive failure)" + is $? 1 "Multiple command output match, 2 positive, 1 negative (positive failure) [$DIE_MSG]" check_cmd_output -t 1 -m "!/line1/" -m "/line5/" -m "/line6/" echo -e "line1\nline2\nline3" - is $? 1 "Multiple command output match, 2 positive, 1 negative (failure)" + is $? 1 "Multiple command output match, 2 positive, 1 negative (failure) [$DIE_MSG]" check_cmd_output -t 1 -m "!/line4/" -m "!/line5/" -m "!/line6/" echo -e "line1\nline2\nline3" - is $? 0 "Multiple command output match, 3 negative (success)" + is $? 0 "Multiple command output match, 3 negative (success) [$DIE_MSG]" check_cmd_output -t 1 -m "!/line1/" -m "!/line2/" -m "!/line3/" echo -e "line1\nline2\nline3" - is $? 1 "Multiple command output match, 3 negative (failure)" + is $? 1 "Multiple command output match, 3 negative (failure) [$DIE_MSG]" + + # Make sure certain error conditions are kicked out informatively + check_cmd_output -m '*' -m '!*' + is $? 1 "Missing command should fail (early)" + like "$DIE_MSG" '*Syntax error*No command*' "No commands? No dice!" + + check_cmd_output -t 9999999 -m '/bogus/' -m 'nope' -e 'moo --moo=moo moo_moo_mcmoo' + is $? 1 'Timeout exceeding that of `nhc` itself should barf' + like "$DIE_MSG" '*Command timeout * exceeds NHC timeout *' + + check_cmd_output -M 'Messages without match strings' -M 'should not work!' -e bob + like "$DIE_MSG" '*Message count * must not * match string *' "Messages without match strings are not valid input" + + ### check_cmd_dmesg() wraps check_cmd_output(), so we don't need to retest the above. + export TESTDATA=$'\n\ncheck_cmd_dmesg() Test Data\n\n dmesg-good1\ndmesg-b@d2 blah blah blah yadda yadda yadda\n\n\t\tmoomoodmesg-bad1moomoo\n' + check_cmd_dmesg -t 1 -m '*dmesg-good1*' -M 'dmesg is not GOOD1' -m '!*dmesg-bad1*' -M '' -m '!/dmesg-bad2/' -M 'dmesg is BAD2' echo "$TESTDATA" + is $? 1 "dmesg data containing 1 good and 1 bad string" + like "$DIE_MSG" "check_cmd_dmesg: dmesg output matched *" "die() message should match default since none was provided" + + check_cmd_dmesg -t 1 -m '*dmesg-good1*' -M 'dmesg is not GOOD1' -m '!*dmesg-bad1*' -M '' -m '!/dmesg-bad2/' -M 'dmesg is BAD2' /bin/sh -c 'cat <<< "$TESTDATA"' + is $? 1 "dmesg data containing 1 good and 1 bad string" + like "$DIE_MSG" "check_cmd_dmesg: dmesg output matched *" "die() message should match default since none was provided" + + # Make sure that "no messages overridden" cases are handled correctly + check_cmd_dmesg -t 1 -m '*dmesg-good1*' -m '*dmesg-bad1*' -m '/dmesg-bad2/' echo "$TESTDATA" + is $? 1 "dmesg data -- positive presence check fails" + like "$DIE_MSG" "check_cmd_dmesg: dmesg output failed to match *" "die() message should always match default when none are overridden (#3)" + check_cmd_dmesg -t 1 -m '!*dmesg-good1*' -m '*dmesg-bad1*' -m '!/dmesg-bad2/' echo "$TESTDATA" + is $? 1 "dmesg data -- negative presence check fails" + like "$DIE_MSG" "check_cmd_dmesg: dmesg output matched *" "die() message should always match default when none are overridden (#1)" + + # Make sure that "all messages overridden" cases work, even when passed differently + check_cmd_dmesg -t 1 -m '*dmesg-good1*' -m '*dmesg-bad1*' -m '/dmesg-bad2/' -M 'dmesg is not GOOD1' -M 'dmesg is BAD1' -M 'dmesg is BAD2' echo "$TESTDATA" + is $? 1 "dmesg data -- positive presence check fails" + unlike "$DIE_MSG" "check_cmd_dmesg: dmesg output failed to match *" "die() message should never match default when all are overridden (#3)" + like "$DIE_MSG" "check_cmd_dmesg: dmesg is BAD2." "die() message should match supplied override value" + check_cmd_dmesg -t 1 -m '!*dmesg-good1*' -m '!*dmesg-bad1*' -m '!/dmesg-bad2/' -M 'dmesg is not GOOD1' -M 'dmesg is BAD1' -M 'dmesg is BAD2' echo "$TESTDATA" + is $? 1 "dmesg data -- negative presence check fails" + unlike "$DIE_MSG" "check_cmd_dmesg: dmesg output matched *" "die() message should never match default when all are overridden (#1)" + like "$DIE_MSG" "check_cmd_dmesg: dmesg is not GOOD1." "die() message should match supplied override value" fi check_cmd_status -t 1 true