qa/check

#! /bin/sh
#
# Control script for running PCP QA tests
#
# Copyright (c) 1997-2002 Silicon Graphics, Inc.  All Rights Reserved.
#

# identity of this process and of this script
#
mypid=$$
myname=`basename $0`
__scriptname=$myname  #  a synonym

# exit status for the trap handler, and the state _wrapup consults:
# needwrap is cleared once the summary has been reported, and __aborted
# stays true until the main loop has run to completion
#
status=0
needwrap=true
__aborted=true

# tallies and lists of test numbers used for the end-of-run summary
#
try=0
n_bad=0
n_triaged=0
bad=''
triaged=''
notrun=''

# the test currently being run ... empty while still setting up
#
seq=''

# lock (and PID) file to prevent concurrent QA execution
#
CHECKLOCK=/tmp/PCP-QA-LOCK

# Report the current local time of day as a count of seconds since
# midnight, on stdout.
#
_wallclock()
{
    date "+%H:%M:%S" \
    | $PCP_AWK_PROG -F: '{ print ($1 * 60 + $2) * 60 + $3 }'
}

# Emit " [MM/DD/YY-HH:MM:SS]" for the current time, without a trailing
# newline (subject to the $PCP_ECHO_N/$PCP_ECHO_C settings).
# Leaves the formatted time in $now.
#
_timestamp()
{
    now=`date "+%m/%d/%y-%H:%M:%S"`
    $PCP_ECHO_PROG $PCP_ECHO_N " [${now}]${PCP_ECHO_C}"
}

# Remove $CHECKLOCK, but only when the PID recorded in it is ours ($mypid).
# Always returns 0; leaves the recorded PID in $LOCKOWNER.
#
_release_lock()
{
    # no lock file, nothing to release
    [ -f "$CHECKLOCK" ] || return 0

    # the file may vanish between the test above and the read
    if LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null`
    then
	if [ "$LOCKOWNER" = "$mypid" ]
	then
	    rm -f "$CHECKLOCK"
	fi
    fi

    return 0
}

# Exit-time handler (installed via trap): release the lock, remove the
# temporary files and, once only ($needwrap), merge this run's test times
# into check.time and report the summary on stdout and in check.log.
# Sets status=1 when the run was aborted.
#
_wrapup()
{
    # $tmp is empty if we did not get very far into the initialization,
    # in which case there is nothing to clean up or report
    #
    [ -z "$tmp" ] && return 0

    # release the lock and remove backup files
    #
    _release_lock
    if [ -d $tmp ]
    then
	( rm -rf $tmp/checksums ; rmdir $tmp )
    fi

    if $__showme
    then
	# tests were only listed, so there is no summary to give
	:
    elif $needwrap
    then
	# fold the times from this run into check.time ... for a test in
	# both files the later line (from this run) is the one kept
	#
	if [ -f check.time ] && [ -f $tmp.time ]
	then
	    cat check.time $tmp.time \
	    | $PCP_AWK_PROG '
	{ elapsed[$1] = $2 }
END	{ if (NR > 0)
	    for (id in elapsed) print id " " elapsed[id]
	}' \
	    | sort -n >$tmp.out
	    mv $tmp.out check.time
	fi

	echo $__list | fmt | sed -e 's/^/    /' >>check.log

	if $__aborted
	then
	    case "$seq"
	    in
		'')
		    echo "Aborted! [during setup]" | tee -a check.log
		    ;;
		*)
		    echo "Aborted! [running $seq]" | tee -a check.log
		    ;;
	    esac
	    status=1
	fi

	if [ -n "$notrun" ]
	then
	    [ $__color = true ] && tput bold && tput setaf 4 # blue
	    echo "Not run:$notrun"
	    [ $__color = true ] && tput sgr0 # reset
	    echo "Not run:$notrun" | fmt >>check.log
	fi

	if [ "$n_triaged" != 0 ]
	then
	    [ $__color = true ] && tput bold && tput setaf 2 # green
	    echo "Triaged:$triaged"
	    [ $__color = true ] && tput sgr0 # reset
	    echo "Triaged:$triaged" | fmt >>check.log
	fi

	if [ "$n_bad" != 0 ]
	then
	    [ $__color = true ] && tput bold && tput setaf 1 # red
	    echo "Failures:$bad"
	    echo "Failed $n_bad of $try tests"
	    [ $__color = true ] && tput sgr0 # reset
	    echo "Failures:$bad" | fmt >>check.log
	    echo "Failed $n_bad of $try tests" >>check.log
	else
	    # no failures ... triaged tests are not counted as passes
	    try=`expr $try - $n_triaged`
	    if [ $try != 0 ]
	    then
		[ $__color = true ] && tput bold && tput setaf 2 # green
		echo "Passed $try tests"
		[ $__color = true ] && tput sgr0 # reset
		echo "Passed $try tests" >>check.log
	    else
		[ $__color = true ] && tput bold && tput setaf 1 # red
		echo "No tests passed"
		[ $__color = true ] && tput sgr0 # reset
		echo "No tests passed" >>check.log
	    fi
	fi

	# the trap can bring us here more than once, report only once
	needwrap=false
    fi

    rm -f $tmp.*
}

# Usage: _addfiles listfile name ...
#
# Append each name to listfile (one per line, created if need be), unless
# that exact line is already there.  Empty names are ignored.
# Returns 1 if no listfile is given, else 0.
#
_addfiles()
{
    af=$1
    [ "$af" = "" ] && return 1
    [ ! -f "$af" ] && touch "$af"
    shift

    for fn in "$@"
    do
	[ -z "$fn" ] && continue
	# -x so only a whole-line match counts as "already there" ...
	# without it a name that is just a substring of some other
	# entry would never be added
	# -e so a name starting with - is not taken as an option
	#
	grep -F -x -s -e "$fn" "$af" >/dev/null
	# 1 => no match, anything bigger is a grep error, so leave the
	# list alone
	[ $? = 1 ] && echo "$fn" >>"$af"
    done

    return 0
}

# Remove a stale lock: if $CHECKLOCK exists but no running process has
# the PID recorded in it and "check" in its ps(1) line, the lock file is
# removed (using $sudo).  Always returns 0.
#
_check_lock()
{
    #  Check that a check process of that process ID found in
    #  $CHECKLOCK exists, and if not, release the lock.

    [ ! -f "$CHECKLOCK" ] && return 0
    PID=`cat "$CHECKLOCK" 2>/dev/null` || return 0

    #  Compare the PID column exactly.  A substring match on the whole
    #  line would also pick up other processes (PID 123 vs 1234, or 123
    #  in the arguments), and the extra lines would make a live owner
    #  look dead.
    CCNT=`ps -e -o "pid args" | \
      $PCP_AWK_PROG -v pid="$PID" '$1 == pid && /check/ { print $1; exit }'`
    if [ -z "$PID" ] || [ "$PID" != "$CCNT" ]
    then
    	#  We can remove the lock; no check process found with that ID
    	#  (or the lock file does not name any process at all)
    	$sudo rm -f "$CHECKLOCK"
    fi

    return 0
}

# Acquire the QA lock ($CHECKLOCK) for this process ($mypid), waiting for
# up to 10 minutes if some other process holds it.
# Returns 0 only when $CHECKLOCK records $mypid as the owner, else 1.
#
_get_lock()
{
    #  Does someone else have a lock on check at this time?  If so, we
    #  can't run a test until the lock is removed.
    #
    #  NOTE: the use of CHECKLOCK rather than check.pid was done so that
    #  people running check manually (rather than run.pcpqa running check)
    #  can have tests running between themselves.  This is better than
    #  having people waiting on one long series of tests passed to check
    #  and having spent 10 minutes waiting for nothing.

    #  Check that an instance of check who claims to have the lock actually
    #  exists!
    _check_lock

    #  Get (make) a lock
    echomessage=true
    for sleeptime in \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
      2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
      2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 0  #  10 minutes waiting time per test...
    do
    	if [ -f "$CHECKLOCK" ]
    	then
    	    #  the lock may vanish between the test and the read
    	    LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null` || continue

    	    if [ "$LOCKOWNER" != "$mypid" ]
    	    then
    	    	#  wait until lock disappears...
    	    	if [ "$sleeptime" = 0 ]
    	    	then
    	    	    #  We should leave... something's terribly wrong.
    	    	    echo ""
    	    	    return 1
    	    	else
    	    	    $echomessage && \
    	    	      $PCP_ECHO_PROG $PCP_ECHO_N " waiting for lock [owner pid=$LOCKOWNER]... ""$PCP_ECHO_C" && \
    	    	      echomessage=false
    	    	    sleep $sleeptime
    	    	fi
    	    else
    	    	#  already have lock
    	    	break
    	    fi
    	else
    	    #  make lock ... with noclobber (set -C) the redirection fails
    	    #  rather than overwriting a lock some other check created
    	    #  since the -f test above
    	    if ( set -C; echo "$mypid" >"$CHECKLOCK" ) 2>/dev/null
    	    then
    	    	chmod a+r "$CHECKLOCK"
    	    	break
    	    fi
    	    #  Not created.  If the file is there now we lost a race, so
    	    #  go around again and wait for the winner; if not, the lock
    	    #  cannot be created at all and waiting will not help.
    	    [ -f "$CHECKLOCK" ] || break
    	fi
    done

    #  However the loop ended, only claim success if the lock file
    #  really does name us as the owner.
    LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null`
    if [ "$LOCKOWNER" != "$mypid" ]
    then
    	#  finish the "waiting for lock" line, if one was started
    	$echomessage || echo ""
    	return 1
    fi
    $echomessage || echo "got it; proceeding: $seq"

    return 0
}

# Create $tmp.checkfiles (once), the list of critical configuration and
# control files that _checksums watches, one path per line.
# Defaults are applied for PCP_PMCDOPTIONS_PATH, PCP_PMLOGGERCONTROL_PATH
# and PCP_PMIECONTROL_PATH if they are not already set.
#
_make_checkfiles()
{
    if [ ! -f $tmp.checkfiles ]
    then
	[ -z "$PCP_PMCDOPTIONS_PATH" ] && \
		PCP_PMCDOPTIONS_PATH="$PCP_SYSCONF_DIR/pmcd/pmcd.options"
	[ -z "$PCP_PMLOGGERCONTROL_PATH" ] && \
		PCP_PMLOGGERCONTROL_PATH="$PCP_SYSCONF_DIR/pmlogger/control"
	[ -z "$PCP_PMIECONTROL_PATH" ] && \
		PCP_PMIECONTROL_PATH="$PCP_SYSCONF_DIR/pmie/control"
	# NOTE(review): the pmlogger control file used to be listed twice
	# here and the pmie control file (defaulted above) not at all ...
	# assumed to be a slip, confirm pmie control is meant to be watched
	_checkfiles="$PCP_PMCDCONF_PATH \
		$PCP_PMLOGGERCONTROL_PATH \
		$PCP_VAR_DIR/config/pmlogger/config.default \
		$PCP_PMIECONTROL_PATH \
		$PCP_PMCDOPTIONS_PATH \
		$PCP_DIR/etc/init.d/pcp \
		$PCP_DIR/etc/pcp.conf $PCP_DIR/etc/pcp.env \
		$PCP_PMDAS_DIR/sample/dsohelp.dir \
		$PCP_PMDAS_DIR/sample/dsohelp.pag \
		$PCP_PMDAS_DIR/sample/help.dir \
		$PCP_PMDAS_DIR/sample/help.pag \
		$PCP_PMDAS_DIR/simple/simple.conf"
	# write the list out, this is the file _checksums reads ...
	# $_checkfiles is deliberately unquoted so that it is split into
	# one argument per path
	_addfiles $tmp.checkfiles $_checkfiles
    fi
}

# Usage: _checksums get
#        _checksums check sumsfile
#
# get	for every file named in $tmp.checkfiles that exists, write a
#	"<path> <checksum>" line to stdout and keep a backup copy below
#	$tmp/checksums (if there is not one already)
# check	compare the files against sumsfile (saved "get" output) and for
#	each file that has gone missing or changed, report it on stdout
#	and restore it from the backup copy; a changed file is first
#	saved as <path>.$seq.O
#
# Returns 0, or 1 for any other command.
#
_checksums()
{
    cmd="$1"

    _make_checkfiles

    case "$cmd"
    in
	get)
	    mkdir -p $tmp/checksums
	    chmod a+w $tmp/checksums
	    for f in `cat $tmp.checkfiles`
	    do
		# backup copy is named after the path, with / => +
		buf=`echo $f | sed -e 's;/;+;g'`
		buf=$tmp/checksums/$buf
		if [ -f $f ]
		then
		    # Record the path ourselves and checksum stdin ...
		    # not every sum(1) prints the file name for a single
		    # file argument, and "check" needs the name to detect
		    # missing files and to tie a checksum to its file.
		    echo "$f `sum <$f`"
		    [ -f $buf ] || $sudo cp $f $buf
		fi
	    done
	    ;;

	check)
	    for f in `cat $tmp.checkfiles`
	    do
		buf=`echo $f | sed -e 's;/;+;g'`
		buf=$tmp/checksums/$buf
		if [ ! -f $f ]
		then
		    # missing only matters if it was there at "get" time
		    if cut -d' ' -f1 $2 2>/dev/null | grep -F -x -e "$f" >/dev/null 2>&1
		    then
			echo "    Missing: \"$f\""
			[ -f $buf ] && $sudo cp -f $buf $f
		    fi
		else
		    _cs="$f `sum <$f`"
		    if grep -F -x -e "$_cs" $2 >/dev/null 2>&1
		    then
			# unchanged, remove the leftover from an earlier
			# run of this test
			$sudo rm -f $f.$seq.O
		    else
			echo "    Changed: \"$f\""
			$sudo cp -f $f $f.$seq.O
			[ -f $buf ] && $sudo cp -f $buf $f
		    fi
		fi

	    done
	    ;;

	*)
	    echo "_checksums: unknown command \"$cmd\" (expected get or check)" >&2
	    return 1
	    ;;
    esac
    return 0
}


# If the file "triaged" exists, check if the current test ($seq)
# and this platform are mentioned there ... if so, assume this
# particular failure has already been triaged.
#
# Lines of interest in "triaged" are of the form
#	<seq><tab><regex><tab><comments>
# and one matches if <regex> (grep -E) matches the output of
# admin/whatami, which is saved in $tmp.whatami.
#
# Returns 0 if the failure is triaged, else 1 ... and with no "triaged"
# file nothing can be triaged, so that is 1 as well.
#
_is_triaged()
{
    [ -f triaged ] || return 1
    rm -f $tmp.ok
    [ ! -f $tmp.whatami ] && admin/whatami >$tmp.whatami
    # the while loop is part of a pipeline, so may be run in a subshell
    # where assignments would be lost ... $tmp.ok carries the result
    #
    sed -n <triaged \
	-e '/^'$seq'	/{
s///
s@\\@\\\\\\\\\\\\\\\\@g
p
}' \
    | while read _line
    do
	# _re is everything up to the first tab, _comments is the rest
	eval `echo "$_line" | sed -e 's/\//\\\\/g' -e 's/^/_re="/' -e 's/	.*/"/'`
	eval `echo "$_line" | sed -e 's/^[^	]*	/_comments="/' -e 's/$/"/'`
	if grep -E "$_re" $tmp.whatami >/dev/null 2>&1
	then
	    # triaged, don't treat as a Failure
	    #
	    touch $tmp.ok
	    break
	fi
    done

    if [ -f $tmp.ok ]
    then
	return 0
    else
	return 1
    fi
}

# On exit (0) and on signals 1, 2, 3 and 15, run _wrapup and then exit
# with whatever $status is at that time (the \$ defers the expansion
# until the trap is taken).  This is set up before ./common is sourced,
# so it is already in place while the initialization below runs.
#
trap "_wrapup; exit \$status" 0 1 2 3 15

# generic initialization... this may take a while to run, because (unless
# $__quick is true) make is run.
. ./common

# Suggested by Andreas Gerstmayr on 11 Mar 2022
#
# Decide if each test is to be run under timeout(1) ($use_timeout) and
# with what time limit ($max_timeout).  The lookup uses "command -v"
# rather than which(1), as the latter is not part of POSIX and when it
# is not installed timeouts would be silently turned off.
#
if command -v timeout >/dev/null 2>&1
then
    use_timeout=true
    # some VMs are _really_ slow ...
    #
    case `hostname`
    in
	bozo-vm)	max_timeout=6m
			;;
	*)		max_timeout=5m
			;;
    esac
    # but timeout(1) does not work on some platforms because the QA
    # tests may start daemons (like pmcd, and then the PMDAs) that
    # are children of the test's shell script (no systemd in play)
    # and then it looks like the script never ends while it has
    # children that are still alive
    #
    case $PCP_PLATFORM
    in
	freebsd)
	    use_timeout=false
	    ;;
    esac
else
    use_timeout=false
fi

# the history of test times must exist, and the per-run timings file is
# started afresh when timings were asked for
#
if [ ! -f check.time ]
then
    touch check.time
fi
if $__timings_file
then
    $PCP_ECHO_PROG $PCP_ECHO_N > check.timings
fi

# Remove automated PMDA install/remove that will interfere with tests
$sudo rm -f $PCP_PMDAS_DIR/*/.NeedInstall $PCP_PMDAS_DIR/*/.NeedRemove

# Remove automated pmlogger log checks that will interfere with tests
$sudo rm -f $PCP_PMLOGGER_DIR/.NeedRewrite

# pmcd has to be configured on
if [ "`_get_config pmcd`" != on ]
then
    _change_config pmcd on
fi

# number of tests asked for, and how many of them have been started
torun=`echo $__list | wc -w | tr -d ' '`
haverun=0

# for a real run (not just listing the tests), start a new dated section
# in check.log and ...
$__showme || {
    echo "" >>check.log
    date >>check.log
    # make sure everything is ship-shape from pmlogger_daily
    #
    ./daily-cleanup || echo "... you have been warned!"
}

# Main loop: run each test in $__list in turn (or just list them when
# $__showme is true).  For each test the observed output ($tmp.out) is
# compared with the expected output ($seq.out), some integrity checks
# are made, and the result is recorded as Pass, Fail or Triaged in the
# tallies that _wrapup reports.
#
for seq in $__list
do
    state="Pass"
    if $__showme
    then
	echo $seq
	continue
    fi
    if [ $torun -gt 9 ]
    then
	# progress, as the percentage of tests started before this one
	pct=`expr 100 \* $haverun / $torun`
	haverun=`expr $haverun + 1`
	$PCP_ECHO_PROG $PCP_ECHO_N "[$pct%] ""$PCP_ECHO_C"
    fi
    $PCP_ECHO_PROG $PCP_ECHO_N "$seq""$PCP_ECHO_C"
    if [ ! -f $seq ]
    then
	echo " [not run, missing]"
	notrun="$notrun $seq"
	continue
    else
	# really going to try and run this one
	#
	rm -f $seq.full
	# show how long this test took the last time it passed
	lasttime=`sed -n -e "/^$seq /s/.* //p" <check.time`
	[ "X$lasttime" != X ] && $PCP_ECHO_PROG $PCP_ECHO_N " ${lasttime}s ...""$PCP_ECHO_C"
	rm -f core $seq.notrun

    	# acquire lock
    	_get_lock
    	if [ $? != 0 ]
    	then
	    # diagnostics belong on stderr
	    echo "$myname: could not acquire lock; exiting" >&2
	    status=1
	    exit
    	fi

	if $__check_config
	then
	    # save checksums for critical conf and control files
	    [ ! -f $tmp.checksums ] && _checksums get >$tmp.checksums
	fi

	start=`_wallclock`
	$__timestamp && _timestamp
	$__timings_file && $PCP_ECHO_PROG $PCP_ECHO_N "$seq `date '+%s'` " >> check.timings

	# optional --precheck callback
	#
	rm -f $tmp.callback.pre
	if [ -x ./check.callback ]
	then
	    ./check.callback --precheck $seq >$tmp.callback.pre 2>&1
	fi

	if $use_timeout
	then
	    # -s 6 => SIGABRT ... see trap in common.rc
	    #
	    timeout -s 6 $max_timeout ./$seq >$tmp.out 2>&1
	    sts=$?
	else
	    ./$seq >$tmp.out 2>&1
	    sts=$?
	fi

	if $__check_config
	then
	    # check the saved checksums
	    _checksums check $tmp.checksums >$tmp.check
	    if [ -s $tmp.check ]
	    then
		echo "$myname: $seq: ERROR: test failed to restore the following config files:" >>$tmp.out
		cat $tmp.check >>$tmp.out
		$PCP_ECHO_PROG $PCP_ECHO_N " [config not restored]""$PCP_ECHO_C"
	    fi
	fi

	# remove the lock
	_release_lock

	if [ -f core ]
	then
	    $PCP_ECHO_PROG $PCP_ECHO_N " [dumped core]""$PCP_ECHO_C"
	    mv core $seq.core
	    state="Fail"
	fi

	$sudo rm -f $seq.out.bad
	rm -f $tmp.callback.abort
	if [ -f $seq.notrun ]
	then
	    [ $__color = true ] && tput bold && tput setaf 4 # blue
	    echo " [not run] `cat $seq.notrun`"
	    [ $__color = true ] && tput sgr0 # reset
	    notrun="$notrun $seq"
	else
	    if [ $sts -ne 0 ]
	    then
		$PCP_ECHO_PROG $PCP_ECHO_N " [failed, exit status $sts]""$PCP_ECHO_C"
		state="Fail"
	    fi
	    if [ ! -f $seq.out ]
	    then
		echo " - no qualified output"
		mv $tmp.out $seq.out.bad
		state="Fail"
	    else
		# we have $seq.out (expected) and $tmp.out (observed)
		# output files, first check if they match
		#
		# done_nl records if the output line for this test has
		# already been ended with a newline
		#
		if diff $seq.out $tmp.out >/dev/null 2>&1
		then
		    done_nl=false
		else
		    [ $__color = true ] && tput bold && tput setaf 1 # red
		    # "$PCP_ECHO_C" is needed here, as for all the other
		    # partial lines, for echos that use \c not -n
		    $PCP_ECHO_PROG $PCP_ECHO_N " - output mismatch (see $seq.out.bad)""$PCP_ECHO_C"
		    if _is_triaged
		    then
			[ $__color = true ] && tput bold && tput setaf 2 # green
			echo " - triaged"
			echo "[triaged]" >>$tmp.out
			state="Triaged"
			[ $__color = true ] && tput sgr0 # reset
		    else
			echo
			$__diff $seq.out $tmp.out
			state="Fail"
			[ $__color = true ] && tput sgr0 # reset
		    fi
		    done_nl=true
		fi

		# now some integrity checks to see if the test has
		# left things in a bad state ... if badness
		# found, append to $tmp.out
		#

		# make sure this test did not muck up the permissions or
		# ownership of key installed files and directories
		#
		sh ./994 --fix >$tmp.tmp
		if [ -s $tmp.tmp ]
		then
		    [ $__color = true ] && tput bold && tput setaf 1 # red
		    if $done_nl
		    then
			echo "$seq ... - also failed permissions check"
		    else
			$PCP_ECHO_PROG $PCP_ECHO_N " - failed permissions check""$PCP_ECHO_C"
		    fi
		    [ $__color = true ] && tput sgr0 # reset
		    echo >>$tmp.out
		    echo "*** Failed permissions/ownership checks ***" >>$tmp.out
		    cat $tmp.tmp >>$tmp.out
		    state="Fail"
		fi

		# optional callback
		#
		if [ -x ./check.callback ]
		then
		    if ./check.callback $seq >$tmp.callback.post 2>&1
		    then
			# script passes, so don't think about aborting
			# but output would indicate a problem
			# ...
			if [ -s $tmp.callback.post ]
			then
			    # output from check.callback, this means something
			    # bad has been detected ...
			    #
			    [ $__color = true ] && tput bold && tput setaf 1 # red
			    if $done_nl
			    then
				echo "$seq ... - also badness detected by check.callback"
			    else
				$PCP_ECHO_PROG $PCP_ECHO_N " - badness detected by check.callback""$PCP_ECHO_C"
			    fi
			    [ $__color = true ] && tput sgr0 # reset
			    echo >>$tmp.out
			    echo "+++ check.callback output +++" >>$tmp.out
			    [ -f $tmp.callback.pre ] && cat $tmp.callback.pre >>$tmp.out
			    cat $tmp.callback.post >>$tmp.out
			    state="Fail"
			fi
		    else
			# check.callback suggests we should abandon the QA run ...
			#
			[ $__color = true ] && tput bold && tput setaf 1 # red
			if $done_nl
			then
			    echo "$seq ... - also run aborted by check.callback"
			else
			    $PCP_ECHO_PROG $PCP_ECHO_N " - run aborted by check.callback""$PCP_ECHO_C"
			fi
			[ $__color = true ] && tput sgr0 # reset
			echo >>$tmp.out
			echo "+++ check.callback output +++" >>$tmp.out
			[ -f $tmp.callback.pre ] && cat $tmp.callback.pre >>$tmp.out
			cat $tmp.callback.post >>$tmp.out
			state="Fail"
			echo "Check! [after $seq] Run aborted by check.callback @ `date`" >>check.log
			# picked up at the end of the loop body
			touch $tmp.callback.abort
		    fi
		fi

		$__timestamp && _timestamp
		stop=`_wallclock`

		if [ "$state" != "Pass" ]
		then
		    mv $tmp.out $seq.out.bad
		else
		    # _wallclock counts seconds since midnight, so for a
		    # test that was running at midnight the difference is
		    # negative and a day's worth of seconds is added back
		    #
		    elapsed=`expr $stop - $start`
		    [ "$elapsed" -lt 0 ] && elapsed=`expr $elapsed + 86400`
		    echo "$seq $elapsed" >>$tmp.time
		fi

		$done_nl || echo
	    fi
	fi
	$__timings_file && date '+%s' >> check.timings
    fi

    # come here for each test that was run (not when $__showme is true
    # nor when the test script is missing)
    #
    if [ "$state" = "Fail" ]
    then
	bad="$bad $seq"
	n_bad=`expr $n_bad + 1`
	try=`expr $try + 1`
	__quick=false
	[ "$__diff" = true ] || echo "Check local PMCD is still alive ..."
	$OPTION_AGENTS && _haveagents
	$OPTION_LOGGER && _havelogger
    elif [ "$state" = "Triaged" ]
    then
	triaged="$triaged $seq"
	n_triaged=`expr $n_triaged + 1`
	try=`expr $try + 1`
    else
	# a test that declined to run is not counted as tried
	[ -f $seq.notrun ] || try=`expr $try + 1`
    fi
    rm -f $seq.notrun

    [ -f $tmp.callback.abort ] && break

done

# The loop is done, so unless check.callback asked for the run to be
# abandoned this is not an aborted run (_wrapup reports on $__aborted).
#
if [ -f $tmp.callback.abort ]
then
    __aborted=true
else
    __aborted=false
fi

# The exit status is the number of failed tests.  Only the low 8 bits of
# an exit status are seen by the parent, so without a ceiling 256
# failures would look like success.
#
status=$n_bad
if [ "$status" -gt 255 ]
then
    status=255
fi
exit