benchwarmer

#!/bin/bash

source ./config

# Take clients and rate limit from command line if passed, use defaults
# otherwise

# Clients
if [ ! -z "$1" ]; then
  CLIENTS="$1"
elif [ -z "$CLIENTS" ]; then
  CLIENTS=4
fi

# Rate limit
if [ -n "$2" ]; then
  RATE="$2"
  RATEOPT="-R $2 "
else
  RATE="NULL"
  RATEOPT=""
fi

# Query the test database and put its background writer statistics into
# a set of environment variables, probably for use inserting stats into
# the results database.
function get_bgwriter {
  BGW=`$TESTPSQL -A -t -F" " -c "select checkpoints_timed, checkpoints_req, buffers_checkpoint, buffers_clean, maxwritten_clean, buffers_backend, buffers_alloc, buffers_backend_fsync from pg_stat_bgwriter"`
  set -- $BGW
  checkpoints_timed=$1
  checkpoints_req=$2
  buffers_checkpoint=$3
  buffers_clean=$4
  maxwritten_clean=$5
  buffers_backend=$6
  buffers_alloc=$7
  buffers_backend_fsync=$8
}

# Figure out how many transactions per client, then recompute
# a new total; this copes with rounding issues
if [ -z "$TOTTRANS" ] ; then
  TOTTRANS=1000
fi
TRANS=`expr $TOTTRANS / $CLIENTS`
TOTTRANS=`expr $TRANS \* $CLIENTS`

# If a runtime has been specified instead, throw away the transaction counts
# use it instead
if [ -n "$RUNTIME" ] ; then
  TESTLEN="-T $RUNTIME"
  TOTTRANS=""
else
  TESTLEN="-t $TRANS"
fi

# Set WORKERS string so that the largest possible worker count
# up to MAX_WORKERS is used, while still dividing CLIENTS into an
# even number per worker.
WORKERS=""
NUM_WORKERS="1"

if [ -n "$MAX_WORKERS" ] ; then

  # Only bother with/allow adjustment to WORKERS if the max is >1.
  # That keeps up out of trouble if using a pgbench before 9.0,
  # where using any value for "-j" won't be allowed, as long as the
  # config file we're given isn't setup incorrectly.

  if [ "$MAX_WORKERS" -gt 1 ]; then
    NUM_WORKERS=$MAX_WORKERS

    while [ "$NUM_WORKERS" -gt 1 ]; do
      (( remainder=$CLIENTS % $NUM_WORKERS ))
      if [ $remainder -eq 0 ] ; then
        break
      fi
      (( NUM_WORKERS = $NUM_WORKERS - 1 ))
    done

    WORKERS="-j ${NUM_WORKERS}"
  fi
fi

# psql statements for the test database and the result database
TESTPSQL="psql -h $TESTHOST -U $TESTUSER -p $TESTPORT -d $TESTDB"
RESULTPSQL="psql -h $RESULTHOST -U $RESULTUSER -p $RESULTPORT -d $RESULTDB"
echo Running tests using:  $TESTPSQL
echo Storing results using:  $RESULTPSQL

# See if this database has all the standard pgbench tables in it
PGCOUNT=`$TESTPSQL -A -t -c "SELECT count(*)  FROM pg_stat_user_tables WHERE relname IN ('pgbench_history', 'pgbench_tellers', 'pgbench_accounts', 'pgbench_branches')"`
if [ "$PGCOUNT" -eq 4 ] ; then
  PGBENCH_TABLES=1
  echo Found standard pgbench tables
else
    echo Did not find standard pgbench tables
    PGBENCH_TABLES=0
fi

# Determine database scale
SCALE=0
if [ "$PGBENCH_TABLES" -eq "1" ] ; then
  SCALE=`$TESTPSQL -A -t -c "select count(*) from pgbench_branches"`
fi

# Confirm we have a useful pgbench to run
if [ ! -f $PGBENCHBIN ]; then
  echo ERROR:  cannot find pgbench binary $PGBENCHBIN , aborting
  exit
fi

# Use CURRENT_SET environment variable, typically set by runset
SET=$CURRENT_SET

# Cleanup pgbench tables, unless we've been told to skip that
if [ "$SKIPINIT" -ne "1" ]; then
  echo Cleaning up database $TESTDB
  if [ "$PGBENCH_TABLES" -eq "1" ] ; then
    $TESTPSQL -c "truncate table pgbench_history"
  fi

  # If the previous test ran recently and it did a cleanup, we should
  # be safe to skip this possibly lengthy step.  "Recently" is defined
  # as within 10 seconds of when the last test cleanup finished.  There
  # is some margin for error here, particulary if processing the results
  # latency log was very time consuming.  The price of making a mistake
  # is just some extra run-time though, doing a redundant vacuum.  On
  # databases sizes where the vacuum time is very long, its actually less
  # likely that the results log is large.  Tests against large database
  # tend to run slowly.
  CLEANED=`$RESULTPSQL -A -t -c "SELECT EXISTS(SELECT 1 FROM tests WHERE cleanup IS NOT NULL AND ((now() - end_time) + cleanup) < '100 seconds'::interval ORDER BY test DESC LIMIT 1)"`
  if [ "$CLEANED" = 'f' ] ; then
    # VACUUM VERBOSE -- additional verbosity gives at least some indication of progress
    $TESTPSQL -c "VACUUM"
  else
    echo "Skipping vacuum, it was recently ran by the last test"
  fi

  # We want clean stats from the pg_stat_bgwriter, but those won't show up
  # until after the checkpoint is done.  Wait a bit for the stats to update
  STARTCHECK=`$TESTPSQL -At -c "SELECT checkpoints_req FROM pg_stat_bgwriter"`
  $TESTPSQL -c "checkpoint"
  echo "Waiting for checkpoint statistics"
  while [ 1 ] ; do
    CHECK=`$TESTPSQL -At -c "SELECT checkpoints_req FROM pg_stat_bgwriter"`
    if [ "$CHECK" -gt "$STARTCHECK" ] ; then
      break
    fi
    sleep 1
  done

fi

# Create the tests record
DBSIZE=`$TESTPSQL -A -t -c "select pg_database_size('$TESTDB')"`
$RESULTPSQL -q -c "insert into tests (script, clients, workers, set, scale, dbsize, rate_limit) values('$SCRIPT','$CLIENTS','$NUM_WORKERS','$SET','$SCALE','$DBSIZE',$RATE)"
TEST=`$RESULTPSQL -A -t -c "select max(test) from tests where set = '$SET'"`
if [ "$?" -ne "0" ]; then
  echo ERROR  Can\'t read from tests table.  Was the test data installed?
  exit
fi

if [ -z "$TEST" ]; then
  echo ERROR:  Attempt to get a test number returned \"$TEST\", aborting
  exit
fi

# Grab starting values for statistics
get_bgwriter
START_WAL=`$TESTPSQL -A -t -F" " -c "select pg_current_xlog_location()"`
$RESULTPSQL -c "insert into test_bgwriter(test, checkpoints_timed, checkpoints_req, buffers_checkpoint, buffers_clean, maxwritten_clean, buffers_backend, buffers_alloc, buffers_backend_fsync) values('$TEST','$checkpoints_timed','$checkpoints_req','$buffers_checkpoint','$buffers_clean','$maxwritten_clean','$buffers_backend','$buffers_alloc','$buffers_backend_fsync')"

echo This is test $TEST

# Start background daemon collectors
if [ "$OSDATA" -eq "1" ]; then
  # Delete dstat output, just in case there is some left over from before
  # (dstat output to file is always additive)
  rm $BASEDIR/dstat.out &> /dev/null
  ./timed-os-stats dstat $DSTAT_DELAY_SECS &
  DSTAT=$!
fi

# Run the main pgbench test
echo Script $SCRIPT executing for $CLIENTS concurrent users... 1>&2
$PGBENCHBIN -f $BASEDIR/tests/$SCRIPT -s $SCALE -l --aggregate-interval=1 -n $TESTLEN -U $TESTUSER -h $TESTHOST -p $TESTPORT -c $CLIENTS $WORKERS $RATEOPT $TESTDB > results.txt &
P=$!
wait $P
$RESULTPSQL -q -c "update tests set end_time = now() where test = $TEST"

if [ "$OSDATA" -eq "1" ]; then
  # The PIDs we have will be the process ID of the Python timed-os-stats
  # process, but we must make sure to also kill the child processes (launched
  # via subprocess.Popen()) so they are not lingering forever.
  $BASEDIR/kill_pg "${DSTAT}"
  # Insert test into CSV representation, for the benefit of COPY
  cat $BASEDIR/dstat.out | grep -E '^[0-9]' | sed "s/^/$TEST,/g" > $BASEDIR/dstat.stat
  rm $BASEDIR/dstat.out
  $RESULTPSQL -c "copy temp_test_dstat from stdin with csv" < $BASEDIR/dstat.stat
  $RESULTPSQL -c "select convert_dstats('$TEST'::int)" &> /dev/null
  rm $BASEDIR/dstat.stat
fi

# Update bgwriter data with delta
get_bgwriter
$RESULTPSQL -c "update test_bgwriter set \
  checkpoints_timed = $checkpoints_timed - checkpoints_timed,\
  checkpoints_req = $checkpoints_req - checkpoints_req,\
  buffers_checkpoint = $buffers_checkpoint - buffers_checkpoint,\
  buffers_clean = $buffers_clean - buffers_clean,\
  maxwritten_clean = $maxwritten_clean - maxwritten_clean,\
  buffers_backend = $buffers_backend - buffers_backend,\
  buffers_alloc = $buffers_alloc - buffers_alloc,\
  buffers_backend_fsync = $buffers_backend_fsync - buffers_backend_fsync \
  where test = '$TEST'"

# WAL values are initially saved as text because they're pulled from the test
# database--but the functions to turn them into numeric types is available
# only in the results one.
END_WAL=`$TESTPSQL -A -t -F" " -c "select pg_current_xlog_location()"`
$RESULTPSQL -q -c "UPDATE tests SET wal_written = wal_lsn('$END_WAL') - wal_lsn('$START_WAL') WHERE test = $TEST"

cat results.txt

# Try to do the table cleanup before spoiling the database and OS cache by
# touching the results heavily.  That means before reading pgbench.log,
# which can be quite large on a fast server.
if [ "$SKIPINIT" -ne "1" ]; then
  echo Cleaning up database $TESTDB
  if [ "$PGBENCH_TABLES" -eq "1" ] ; then
    $TESTPSQL -c "truncate table pgbench_history"
  fi
  VACUUMSTART=`$RESULTPSQL -A -t -c "SELECT now()"`
  $TESTPSQL -c "VACUUM"
  $TESTPSQL -c "CHECKPOINT"
  $RESULTPSQL -q -c "UPDATE tests SET cleanup = now() - '$VACUUMSTART'::timestamptz WHERE test = $TEST"
fi

# Save pgbench log
# In the multi-threaded case, there may be a number of files, so copy
# them all in
cat pgbench_log.${P}* > pgbench.log

tps=`grep "(including connections establishing)" results.txt | cut -d " " -f 3`
trans=`grep "number of transactions actually processed:" results.txt | cut -d":" -f 2 | cut -d "/" -f 1`
$RESULTPSQL -q -c "update tests set tps = '$tps', trans = '$trans' where test = $TEST"

# Confirm we have an patched version of pgbench that has timestamps
TESTFORTS=`cat pgbench.log | head -n 1 | cut -d" " -f 6`
if [ -z "$TESTFORTS" ]; then
  echo
  echo "ERROR:  the pgbench used for this test is missing transaction"
  echo timestamps.  No latency information will be imported into
  echo the database, and no plots will be generated.
  echo

else

  # Import timestamp information
  $BASEDIR/log-to-csv $TEST < pgbench.log > timing.csv
  $RESULTPSQL -c "set timezone to 'utc'; copy timing from stdin with csv" < timing.csv
  # (This may be used within webreport)

  $RESULTPSQL -q -c "update tests set avg_latency=(select avg(latency_sum/num_of_transactions) / 1000 from timing where tests.test=timing.test), \
	  max_latency = (select max(max_latency) / 1000 from timing where tests.test = timing.test) \
  where tests.test='$TEST'"
fi

# Save some configuration information about the server
CLIENTHOST=`hostname`
SERVERHOST="$TESTHOST"
if [ "$SERVERHOST" = "localhost" ]; then
  SERVERHOST="$CLIENTHOST"
fi
SETTINGS="pg_settings.txt"

# Write out system and PostgreSQL installation
echo Test results: > $SETTINGS
$RESULTPSQL -c "select script, clients, round(tps) as tps, 1000 * round(avg_latency) / 1000 as avg_latency from tests where test = $TEST" | grep -v " row)" >> $SETTINGS
echo Server $SERVERHOST, client $CLIENTHOST >> $SETTINGS
echo >> $SETTINGS
echo "Server info and settings in postgresql.conf:" >> $SETTINGS
$TESTPSQL -At -c "select version();" | grep -v " row" >> $SETTINGS
$TESTPSQL -c "show data_directory" | grep -v " row" >> $SETTINGS
$TESTPSQL -c "select name, current_setting(name) from pg_settings where source ='configuration file' and not name in ('DateStyle', 'lc_messages', 'lc_monetary', 'lc_numeric', 'lc_time', 'listen_addresses', 'log_directory', 'log_rotation_age', 'log_rotation_size', 'log_truncate_on_rotation');" | grep -v " rows)" >> $SETTINGS

# Operating system information
echo >> $SETTINGS
echo "benchmark client OS Configuration (may not be the same as the server)" >> $SETTINGS
uname -a >> $SETTINGS

# Linux OS release is likely to be in one or more of these files
for f in `ls /etc/lsb-release /etc/debian_version /etc/redhat-release 2>/dev/null` ; do
  echo $f: >> $SETTINGS
  cat $f >> $SETTINGS
  echo >> $SETTINGS
done

# Save version info on Mac OS X
OSNAME=`uname`
if [ "$OSNAME" = "Darwin" ] ; then
  sw_vers >> $SETTINGS
fi
echo >> $SETTINGS

if [ -d /proc/sys/vm/ ] ; then
  for f in `ls /proc/sys/vm/dirty_*` ; do
    S=`cat $f`
    echo  $f=$S >> $SETTINGS
  done
  echo >> $SETTINGS
fi

mount >> $SETTINGS

# Finally, store settings file within database
CONTENTS=`cat $SETTINGS`
# Use Postgres dollar quotes here, as a good-enough form of escaping
$RESULTPSQL -q -c "update testset set settings= \$settings\$$CONTENTS\$settings\$ where set = '$SET'"

# Remove temporary files
rm pgbench_log.${P}*
rm pgbench.log
rm timing.csv
rm results.txt
rm $SETTINGS