benchmark: use t-test for comparing node versions

AndreasMadsen · AndreasMadsen · commit 855009af7f46 · 2016-07-26T13:21:53.000+02:00
The data sampling is done in node and the data processing is done in R. Only plyr was added as an R dependency and it is fairly standard. PR-URL: #7094 Reviewed-By: Trevor Norris <trev.norris@gmail.com> Reviewed-By: Jeremiah Senkpiel <fishrock123@rocketmail.com> Reviewed-By: Brian White <mscdex@mscdex.net> Reviewed-By: Anna Henningsen <anna@addaleax.net>
diff --git a/benchmark/_cli.R b/benchmark/_cli.R
@@ -0,0 +1,24 @@
+
+args = commandArgs(TRUE);
+
+args.options = list();
+
+temp.option.key = NULL;
+
+for (arg in args) {
+  # Optional arguments declaration
+  if (substring(arg, 1, 1) == '-') {
+    temp.option.key = substring(arg, 2);
+    if (substring(arg, 2, 2) == '-') {
+      temp.option.key = substring(arg, 3);
+    }
+
+    args.options[[temp.option.key]] = TRUE;
+  }
+  # Optional arguments value
+  else if (!is.null(temp.option.key)) {
+    args.options[[temp.option.key]] = arg;
+
+    temp.option.key = NULL;
+  }
+}
diff --git a/benchmark/compare.R b/benchmark/compare.R
@@ -0,0 +1,70 @@
+#!/usr/bin/env Rscript
+library(ggplot2);
+library(plyr);
+
+# get __dirname and load ./_cli.R
+args = commandArgs(trailingOnly = F);
+dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
+source(paste0(dirname, '/_cli.R'), chdir=T);
+
+if (!is.null(args.options$help) ||
+   (!is.null(args.options$plot) && args.options$plot == TRUE)) {
+  stop("usage: cat file.csv | Rscript compare.R
+  --help           show this message
+  --plot filename  save plot to filename");
+}
+
+plot.filename = args.options$plot;
+
+dat = read.csv(file('stdin'));
+dat = data.frame(dat);
+dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
+dat$name = paste0(dat$filename, dat$configuration);
+
+# Create a box plot
+if (!is.null(plot.filename)) {
+  p = ggplot(data=dat);
+  p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary));
+  p = p + ylab("rate of operations (higher is better)");
+  p = p + xlab("benchmark");
+  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
+  ggsave(plot.filename, p);
+}
+
+# Print a table with results
+statistics = ddply(dat, "name", function(subdat) {
+  # Perform a statistical test to see if there actually is a difference in
+  # performance.
+  w = t.test(rate ~ binary, data=subdat);
+
+  # Calculate improvement for the "new" binary compared with the "old" binary
+  new_mu = mean(subset(subdat, binary == "new")$rate);
+  old_mu = mean(subset(subdat, binary == "old")$rate);
+  improvement = sprintf("%.2f %%", ((new_mu - old_mu) / old_mu * 100));
+
+  # Add user friendly stars to the table. There should be at least one star
+  # before you can say that there is an improvement.
+  significant = '';
+  if (w$p.value < 0.001) {
+    significant = '***';
+  } else if (w$p.value < 0.01) {
+    significant = '**';
+  } else if (w$p.value < 0.05) {
+    significant = '*';
+  }
+
+  r = list(
+    improvement = improvement,
+    significant = significant,
+    p.value = w$p.value
+  );
+  return(data.frame(r));
+});
+
+
+# Set the benchmark names as the row.names to left align them in the print
+row.names(statistics) = statistics$name;
+statistics$name = NULL;
+
+options(width = 200);
+print(statistics);
diff --git a/benchmark/compare.js b/benchmark/compare.js
@@ -1,181 +1,86 @@
 'use strict';
-var usage = 'node benchmark/compare.js ' +
-            '<node-binary1> <node-binary2> ' +
-            '[--html] [--red|-r] [--green|-g] ' +
-            '[-- <type> [testFilter]]';
 
-var show = 'both';
-var nodes = [];
-var html = false;
-var benchmarks;
+const fork = require('child_process').fork;
+const path = require('path');
+const CLI = require('./_cli.js');
+
+//
+// Parse arguments
+//
+const cli = CLI(`usage: ./node compare.js [options] [--] <category> ...
+  Run each benchmark in the <category> directory many times using two diffrent
+  node versions. More than one <category> directory can be specified.
+  The output is formatted as csv, which can be processed using for
+  example 'compare.R'.
+
+  --new    ./new-node-binary  new node binary (required)
+  --old    ./old-node-binary  old node binary (required)
+  --runs   30                 number of samples
+  --filter pattern            string to filter benchmark scripts
+  --set    variable=value     set benchmark variable (can be repeated)
+`, {
+  arrayArgs: ['set']
+});
+
+if (!cli.optional.new || !cli.optional.old) {
+  cli.abort(cli.usage);
+  return;
+}
 
-for (var i = 2; i < process.argv.length; i++) {
-  var arg = process.argv[i];
-  switch (arg) {
-    case '--red': case '-r':
-      show = show === 'green' ? 'both' : 'red';
-      break;
-    case '--green': case '-g':
-      show = show === 'red' ? 'both' : 'green';
-      break;
-    case '--html':
-      html = true;
-      break;
-    case '-h': case '-?': case '--help':
-      console.log(usage);
-      process.exit(0);
-      break;
-    case '--':
-      benchmarks = [];
-      break;
-    default:
-      if (Array.isArray(benchmarks))
-        benchmarks.push(arg);
-      else
-        nodes.push(arg);
-      break;
-  }
+const binaries = ['old', 'new'];
+const runs = cli.optional.runs ? parseInt(cli.optional.runs, 10) : 30;
+const benchmarks = cli.benchmarks();
+
+if (benchmarks.length === 0) {
+  console.error('no benchmarks found');
+  process.exit(1);
 }
 
-var start, green, red, reset, end;
-if (!html) {
-  start = '';
-  green = '\u001b[1;32m';
-  red = '\u001b[1;31m';
-  reset = '\u001b[m';
-  end = '';
-} else {
-  start = '<pre style="background-color:#333;color:#eee">';
-  green = '<span style="background-color:#0f0;color:#000">';
-  red = '<span style="background-color:#f00;color:#fff">';
-  reset = '</span>';
-  end = '</pre>';
+// Create queue from the benchmarks list such that both node versions are
+// tested `runs` times each.
+const queue = [];
+for (let iter = 0; iter < runs; iter++) {
+  for (const filename of benchmarks) {
+    for (const binary of binaries) {
+      queue.push({ binary, filename, iter });
+    }
+  }
 }
 
-var runBench = process.env.NODE_BENCH || 'bench';
+// Print csv header
+console.log('"binary", "filename", "configuration", "rate", "time"');
 
-if (nodes.length !== 2)
-  return console.error('usage:\n  %s', usage);
+(function recursive(i) {
+  const job = queue[i];
 
-var spawn = require('child_process').spawn;
-var results = {};
-var toggle = 1;
-var r = (+process.env.NODE_BENCH_RUNS || 1) * 2;
+  const child = fork(path.resolve(__dirname, job.filename), cli.optional.set, {
+    execPath: cli.optional[job.binary]
+  });
 
-run();
-function run() {
-  if (--r < 0)
-    return compare();
-  toggle = ++toggle % 2;
+  child.on('message', function(data) {
+    // Construct configuration string, " A=a B=b ..."
+    let conf = '';
+    for (const key of Object.keys(data.conf)) {
+      conf += ' ' + key + '=' + JSON.stringify(data.conf[key]);
+    }
+    conf = conf.slice(1);
 
-  var node = nodes[toggle];
-  console.error('running %s', node);
-  var env = {};
-  for (var i in process.env)
-    env[i] = process.env[i];
-  env.NODE = node;
+    // Escape quotes (") for correct csv formatting
+    conf = conf.replace(/"/g, '""');
 
-  var out = '';
-  var child;
-  if (Array.isArray(benchmarks) && benchmarks.length) {
-    child = spawn(
-      node,
-      ['benchmark/run.js'].concat(benchmarks),
-      { env: env }
-    );
-  } else {
-    child = spawn('make', [runBench], { env: env });
-  }
-  child.stdout.setEncoding('utf8');
-  child.stdout.on('data', function(c) {
-    out += c;
+    console.log(`"${job.binary}", "${job.filename}", "${conf}", ` +
+                `${data.rate}, ${data.time}`);
   });
 
-  child.stderr.pipe(process.stderr);
-
-  child.on('close', function(code) {
+  child.once('close', function(code) {
     if (code) {
-      console.error('%s exited with code=%d', node, code);
       process.exit(code);
-    } else {
-      out.trim().split(/\r?\n/).forEach(function(line) {
-        line = line.trim();
-        if (!line)
-          return;
-
-        var s = line.split(':');
-        var num = +s.pop();
-        if (!num && num !== 0)
-          return;
-
-        line = s.join(':');
-        var res = results[line] = results[line] || {};
-        res[node] = res[node] || [];
-        res[node].push(num);
-      });
-
-      run();
-    }
-  });
-}
-
-function compare() {
-  // each result is an object with {"foo.js arg=bar":12345,...}
-  // compare each thing, and show which node did the best.
-  // node[0] is shown in green, node[1] shown in red.
-  var maxLen = -Infinity;
-  var util = require('util');
-  console.log(start);
-
-  Object.keys(results).map(function(bench) {
-    var res = results[bench];
-    var n0 = avg(res[nodes[0]]);
-    var n1 = avg(res[nodes[1]]);
-
-    var pct = ((n0 - n1) / n1 * 100).toFixed(2);
-
-    var g = n0 > n1 ? green : '';
-    var r = n0 > n1 ? '' : red;
-    var c = r || g;
-
-    if (show === 'green' && !g || show === 'red' && !r)
       return;
+    }
 
-    var r0 = util.format(
-      '%s%s: %d%s',
-      g,
-      nodes[0],
-      n0.toPrecision(5), g ? reset : ''
-    );
-    var r1 = util.format(
-      '%s%s: %d%s',
-      r,
-      nodes[1],
-      n1.toPrecision(5), r ? reset : ''
-    );
-    pct = c + pct + '%' + reset;
-    var l = util.format('%s: %s %s', bench, r0, r1);
-    maxLen = Math.max(l.length + pct.length, maxLen);
-    return [l, pct];
-  }).filter(function(l) {
-    return l;
-  }).forEach(function(line) {
-    var l = line[0];
-    var pct = line[1];
-    var dotLen = maxLen - l.length - pct.length + 2;
-    var dots = ' ' + new Array(Math.max(0, dotLen)).join('.') + ' ';
-    console.log(l + dots + pct);
+    // If there are more benchmarks execute the next
+    if (i + 1 < queue.length) {
+      recursive(i + 1);
+    }
   });
-  console.log(end);
-}
-
-function avg(list) {
-  if (list.length >= 3) {
-    list = list.sort();
-    var q = Math.floor(list.length / 4) || 1;
-    list = list.slice(q, -q);
-  }
-  return list.reduce(function(a, b) {
-    return a + b;
-  }, 0) / list.length;
-}
+})(0);