Skip to content

Commit

Permalink
lbnl_cmd.nhc: Remove line numbers from dmesg check
Browse files Browse the repository at this point in the history
When using `check_cmd_dmesg()` directly (as written in `scripts/lbnl_cmd.nhc`)
with a negated match string, the default behavior of
`check_cmd_output()` (which `check_cmd_dmesg()` wraps) used for error
reporting causes the "Reason" field to contain not only the match
string that was found (and shouldn't have been) but also the **_line
number_** where the match was found.  In the case of `dmesg` output,
the line number is almost completely useless; moreover, it prevents
Slurm and other schedulers/RMs from being able to group all the
affected nodes together -- because the line numbers almost always
differ!

Granted that users/admins can override the default failure message
generation behavior (via `-M` entries, all of which are passed
directly to `check_cmd_output()`), but in the specific case of
`check_cmd_dmesg()`, I think the default behavior should suppress the
line numbers and use a simpler, more concise message instead.

This changeset adds a bit of pre-processing to `check_cmd_dmesg()`.
Each "mstr/message" pair" is examined, and for each match string (`-m`
argument) that doesn't have a corresponding message (`-M` argument)
that overrides the default will have a new default provided to it that
omits the extraneous information.

Addresses #143.
  • Loading branch information
mej committed Sep 15, 2023
1 parent a583663 commit 73d3c39
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 17 deletions.
4 changes: 1 addition & 3 deletions scripts/common.nhc
Original file line number Diff line number Diff line change
Expand Up @@ -705,9 +705,7 @@ function nhc_common_get_unix_time() {

if [[ $SECONDS -lt 946684800 ]]; then
NOW=$(date '+%s')
ELAPSED_SECONDS=$SECONDS
SECONDS=$((ELAPSED_SECONDS+NOW))
NHC_START_TS=$((NOW-ELAPSED_SECONDS))
(( ELAPSED_SECONDS=$SECONDS, SECONDS+=NOW, NHC_START_TS=NOW-ELAPSED_SECONDS ))
fi
}

Expand Down
39 changes: 29 additions & 10 deletions scripts/lbnl_cmd.nhc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ function check_cmd_output() {
case "$OPTION" in
C) CHECKNAME="$OPTARG" ;; # Used for writing wrapper checks
O) OUTPUT_VAR="$OPTARG" ;; # Used for writing wrapper checks
M) MSGS[${#MSGS[*]}]="$OPTARG" ;; # Used for writing wrapper checks
M) MSGS+=( "$OPTARG" ) ;; # Used for writing wrapper checks
e) CMD="$OPTARG" ;;
m) MATCHES[${#MATCHES[*]}]="$OPTARG" ;;
m) MATCHES+=( "$OPTARG" ) ;;
r) CMD_RETVAL="$OPTARG" ;;
t) CMD_TIMEOUT="$OPTARG" ;;
:) die 1 "$CHECKNAME: Option -$OPTARG requires an argument." ; return 1 ;;
Expand All @@ -45,7 +45,7 @@ function check_cmd_output() {

# Initialize a parallel array for tracking which matches have been found.
for ((i = 0; i < ${#MATCHES[*]}; i++)); do
if [[ "${MATCHES[$i]:0:1}" = '!' ]]; then
if [[ "${MATCHES[$i]:0:1}" == '!' ]]; then
MATCHED[$i]=1
((NEG_MATCH_CNT++))
else
Expand Down Expand Up @@ -74,7 +74,7 @@ function check_cmd_output() {
# Iterate through $LINES[] array to gather process data.
for ((LINENUM = 0; LINENUM <= ${#LINES[*]}; LINENUM++)); do
for ((i = 0; i < ${#MATCHES[*]}; i++)); do
if [[ "${MATCHES[$i]:0:1}" = '!' ]]; then
if [[ "${MATCHES[$i]:0:1}" == '!' ]]; then
# Match expression is negated. Fail if the line matches this expression.
if mcheck "${LINES[$LINENUM]}" "${MATCHES[$i]:1}" ; then
MSG=${MSGS[$i]:-\"%m\" matched at line %l of \"%c\"}
Expand All @@ -85,7 +85,7 @@ function check_cmd_output() {
die 1 "$CHECKNAME: $MSG."
return 1
fi
elif [ ${MATCHED[$i]} -eq 0 ] && mcheck "${LINES[$LINENUM]}" "${MATCHES[$i]}" ; then
elif [[ ${MATCHED[$i]} -eq 0 ]] && mcheck "${LINES[$LINENUM]}" "${MATCHES[$i]}" ; then
MATCHED[$i]=1
((MATCH_CNT++))
if [[ $MATCH_CNT == ${#MATCHED[*]} ]]; then
Expand Down Expand Up @@ -186,22 +186,41 @@ function check_cmd_status() {

# Check dmesg output for matching lines. Fairly simple wrapper for check_cmd_output() (above).
function check_cmd_dmesg() {
local CMD_TIMEOUT CMD_RETVAL CMD="dmesg"
local CMD="" CMD_TIMEOUT CMD_RETVAL OUTVAR
local -a MATCHES MSGS
local -i i

OPTIND=1
while getopts ":M:e:m:r:t:" OPTION ; do
while getopts ":M:O:e:m:r:t:" OPTION ; do
case "$OPTION" in
M) MSGS[${#MSGS[*]}]=-M ; MSGS[${#MSGS[*]}]="$OPTARG" ;;
M) MSGS+=( '-M' "$OPTARG" ) ;;
O) OUTVAR="$OPTARG" ;;
e) CMD="$OPTARG" ;;
m) MATCHES[${#MATCHES[*]}]=-m ; MATCHES[${#MATCHES[*]}]="$OPTARG" ;;
m) MATCHES+=( '-m' "$OPTARG" ) ;;
r) CMD_RETVAL="$OPTARG" ;;
t) CMD_TIMEOUT="$OPTARG" ;;
:) die 1 "$FUNCNAME: Option -$OPTARG requires an argument." ; return 1 ;;
\?) die 1 "$FUNCNAME: Invalid option: -$OPTARG" ; return 1 ;;
esac
done
shift $((OPTIND-1))
if [[ -z "$CMD" && $# -eq 0 ]]; then
CMD="dmesg"
fi
if (( (${#MATCHES[*]} % 2 != 0) || (${#MSGS[*]} % 2 != 0) || (${#MATCHES[*]} < ${#MSGS[*]}) )); then
die 1 "$FUNCNAME: Match-string count (${#MATCHES[*]}) and/or message count (${#MSGS[*]}) inconsistent/invalid."
return 1
fi

check_cmd_output -C $FUNCNAME ${CMD_RETVAL:+-r $CMD_RETVAL} ${CMD_TIMEOUT:+-t $CMD_TIMEOUT} -e "$CMD" "${MSGS[@]}" "${MATCHES[@]}"
for (( i = 1; i < ${#MATCHES[*]}; i += 2 )); do
if [[ -z "${MSGS[i]}" ]]; then
MSGS[i-1]='-M'
if [[ "${MATCHES[$i]:0:1}" == '!' ]]; then
MSGS[i]="dmesg output matches \"${MATCHES[$i]:1}\""
else
MSGS[i]="dmesg output failed to match \"${MATCHES[$i]}\""
fi
fi
done
check_cmd_output -C "$FUNCNAME" ${OUTVAR:+-O $OUTVAR} ${CMD_RETVAL:+-r $CMD_RETVAL} ${CMD_TIMEOUT:+-t $CMD_TIMEOUT} ${CMD:+-e "$CMD"} "${MSGS[@]}" "${MATCHES[@]}" "$@"
}
6 changes: 5 additions & 1 deletion test/nhc-test
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,13 @@ function test_nhc_driver() {
}

function test_checks() {
local CHECKTEST
local CHECKTEST DIE_MSG="" DIE_RC=""
local -a TEST_SCRIPTS

# Instead of die() being empty, have it save the message and return code
# into global variables that can be referenced by our unit tests.
function die() { DIE_RC="$1"; DIE_MSG="${*:2}"; return $DIE_RC; }

set +f
TEST_SCRIPTS=( $SRCDIR/test_*.nhc )
set -f
Expand Down
4 changes: 2 additions & 2 deletions test/test_common.nhc
Original file line number Diff line number Diff line change
Expand Up @@ -367,8 +367,8 @@ plan $((22+11+11+13+11+28+9+7+15+6+7+4+6+18+7+19+2+8)) "common.nhc" && {
OLDSECS=$SECONDS
cmp_ok "$SECONDS" "-lt" "60" 'Initial $SECONDS value should be small (sub-1min)'
nhc_common_get_unix_time
ok '(( (SECONDS == UNIXTIME+OLDSECS) || (SECONDS == UNIXTIME+OLDSECS+1) ))' \
"\$SECONDS ($SECONDS) should now reflect \`time_t\` value (roughly $UNIXTIME +/- $OLDSECS)"
ok '(( (SECONDS <= UNIXTIME+OLDSECS+5) && (SECONDS >= UNIXTIME+OLDSECS-5) ))' \
"\$SECONDS ($SECONDS) should now reflect \`time_t\` value (roughly $UNIXTIME+$OLDSECS +/- 5s)"

if [[ $- == *x* ]]; then
# There seems to be a bash bug. It doesn't handle process substitution
Expand Down
21 changes: 20 additions & 1 deletion test/test_lbnl_cmd.nhc
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Tests for lbnl_cmd.nhc

plan $((2+9+2)) "lbnl_cmd.nhc" && {
plan $((3+9+8+2)) "lbnl_cmd.nhc" && {
is "`type -t check_cmd_output 2>&1`" 'function' 'check_cmd_output() loaded properly'
is "`type -t check_cmd_status 2>&1`" 'function' 'check_cmd_status() loaded properly'
is "`type -t check_cmd_dmesg 2>&1`" 'function' 'check_cmd_dmesg() loaded properly'

# Needed to bypass sanity check on timeout values.
export TIMEOUT=99999
Expand All @@ -29,6 +30,24 @@ plan $((2+9+2)) "lbnl_cmd.nhc" && {
is $? 0 "Multiple command output match, 3 negative (success)"
check_cmd_output -t 1 -m "!/line1/" -m "!/line2/" -m "!/line3/" echo -e "line1\nline2\nline3"
is $? 1 "Multiple command output match, 3 negative (failure)"

export TESTDATA=$'\n\ncheck_cmd_dmesg() Test Data\n\n dmesg-good1\ndmesg-b@d2 blah blah yadda yadda\n\n\t\tmoodmesg-bad1moomoo\n'
check_cmd_dmesg -t 1 -m '*dmesg-good1*' -M 'dmesg is not GOOD1' -m '!*dmesg-bad1*' -M '' -m '!/dmesg-bad2/' -M 'dmesg is BAD2' echo "$TESTDATA"
is $? 1 "dmesg data containing 1 good and 1 bad string"
like "$DIE_MSG" "check_cmd_dmesg: dmesg output matches *" "die() message should match default since none was provided"

check_cmd_dmesg -t 1 -m '*dmesg-good1*' -M 'dmesg is not GOOD1' -m '!*dmesg-bad1*' -M '' -m '!/dmesg-bad2/' -M 'dmesg is BAD2' /bin/sh -c 'cat <<< "$TESTDATA"'
is $? 1 "dmesg data containing 1 good and 1 bad string"
like "$DIE_MSG" "check_cmd_dmesg: dmesg output matches *" "die() message should match default since none was provided"

# Make sure certain error conditions are kicked out informatively
check_cmd_dmesg -m '*' -m '!*'
is $? 1 "Missing command should fail (early)"
like "$DIE_MSG" '*Syntax error*No command*' "No commands? No dice!"

check_cmd_dmesg -t 9999999 -m '/bogus/' -m 'nope' -e moo --moo=moo moo_moo_mcmoo
is $? 1 'Timeout exceeding that of `nhc` itself should barf'
like "$DIE_MSG" '*Command timeout * exceeds NHC timeout *"
fi

check_cmd_status -t 1 true
Expand Down

0 comments on commit 73d3c39

Please sign in to comment.