test: make crypto.timingSafeEqual test less flaky

not-an-aardvark · Fishrock123 · commit d0e61d4cc34c · 2016-09-13T17:04:45.000-07:00
The `crypto.timingSafeEqual` test still seems to be a bit flaky. This makes a few changes to the test: * Separates the basic usage and the benchmarking into different tests * Moves the timing-sensitive benchmark function into a separate module, and reparses the module on every iteration of the loop to avoid shared state between timing measurements. PR-URL: #8456 Reviewed-By: James M Snell <jasnell@gmail.com>
diff --git a/test/fixtures/crypto-timing-safe-equal-benchmark-func.js b/test/fixtures/crypto-timing-safe-equal-benchmark-func.js
@@ -0,0 +1,17 @@
+'use strict';
+
+const assert = require('assert');
+module.exports = (compareFunc, firstBufFill, secondBufFill, bufSize) => {
+  const firstBuffer = Buffer.alloc(bufSize, firstBufFill);
+  const secondBuffer = Buffer.alloc(bufSize, secondBufFill);
+
+  const startTime = process.hrtime();
+  const result = compareFunc(firstBuffer, secondBuffer);
+  const endTime = process.hrtime(startTime);
+
+  // Ensure that the result of the function call gets used, so it doesn't
+  // get discarded due to engine optimizations.
+  assert.strictEqual(result, firstBufFill === secondBufFill);
+
+  return endTime[0] * 1e9 + endTime[1];
+};
diff --git a/test/sequential/test-crypto-timing-safe-equal-benchmarks.js b/test/sequential/test-crypto-timing-safe-equal-benchmarks.js
@@ -0,0 +1,123 @@
+'use strict';
+const common = require('../common');
+const assert = require('assert');
+
+if (!common.hasCrypto) {
+  common.skip('missing crypto');
+  return;
+}
+
+if (!common.enoughTestMem) {
+  common.skip('memory-intensive test');
+  return;
+}
+
+const crypto = require('crypto');
+
+const BENCHMARK_FUNC_PATH =
+  `${common.fixturesDir}/crypto-timing-safe-equal-benchmark-func`;
+function runOneBenchmark(...args) {
+  const benchmarkFunc = require(BENCHMARK_FUNC_PATH);
+  const result = benchmarkFunc(...args);
+
+  // Don't let the comparison function get cached. This avoid a timing
+  // inconsistency due to V8 optimization where the function would take
+  // less time when called with a specific set of parameters.
+  delete require.cache[require.resolve(BENCHMARK_FUNC_PATH)];
+  return result;
+}
+
+function getTValue(compareFunc) {
+  const numTrials = 10000;
+  const bufSize = 10000;
+  // Perform benchmarks to verify that timingSafeEqual is actually timing-safe.
+
+  const rawEqualBenches = Array(numTrials);
+  const rawUnequalBenches = Array(numTrials);
+
+  for (let i = 0; i < numTrials; i++) {
+    if (Math.random() < 0.5) {
+      // First benchmark: comparing two equal buffers
+      rawEqualBenches[i] = runOneBenchmark(compareFunc, 'A', 'A', bufSize);
+      // Second benchmark: comparing two unequal buffers
+      rawUnequalBenches[i] = runOneBenchmark(compareFunc, 'B', 'C', bufSize);
+    } else {
+      // Flip the order of the benchmarks half of the time.
+      rawUnequalBenches[i] = runOneBenchmark(compareFunc, 'B', 'C', bufSize);
+      rawEqualBenches[i] = runOneBenchmark(compareFunc, 'A', 'A', bufSize);
+    }
+  }
+
+  const equalBenches = filterOutliers(rawEqualBenches);
+  const unequalBenches = filterOutliers(rawUnequalBenches);
+
+  // Use a two-sample t-test to determine whether the timing difference between
+  // the benchmarks is statistically significant.
+  // https://wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
+
+  const equalMean = mean(equalBenches);
+  const unequalMean = mean(unequalBenches);
+
+  const equalLen = equalBenches.length;
+  const unequalLen = unequalBenches.length;
+
+  const combinedStd = combinedStandardDeviation(equalBenches, unequalBenches);
+  const standardErr = combinedStd * Math.sqrt(1 / equalLen + 1 / unequalLen);
+
+  return (equalMean - unequalMean) / standardErr;
+}
+
+// Returns the mean of an array
+function mean(array) {
+  return array.reduce((sum, val) => sum + val, 0) / array.length;
+}
+
+// Returns the sample standard deviation of an array
+function standardDeviation(array) {
+  const arrMean = mean(array);
+  const total = array.reduce((sum, val) => sum + Math.pow(val - arrMean, 2), 0);
+  return Math.sqrt(total / (array.length - 1));
+}
+
+// Returns the common standard deviation of two arrays
+function combinedStandardDeviation(array1, array2) {
+  const sum1 = Math.pow(standardDeviation(array1), 2) * (array1.length - 1);
+  const sum2 = Math.pow(standardDeviation(array2), 2) * (array2.length - 1);
+  return Math.sqrt((sum1 + sum2) / (array1.length + array2.length - 2));
+}
+
+// Filter large outliers from an array. A 'large outlier' is a value that is at
+// least 50 times larger than the mean. This prevents the tests from failing
+// due to the standard deviation increase when a function unexpectedly takes
+// a very long time to execute.
+function filterOutliers(array) {
+  const arrMean = mean(array);
+  return array.filter((value) => value / arrMean < 50);
+}
+
+// t_(0.99995, ∞)
+// i.e. If a given comparison function is indeed timing-safe, the t-test result
+// has a 99.99% chance to be below this threshold. Unfortunately, this means
+// that this test will be a bit flakey and will fail 0.01% of the time even if
+// crypto.timingSafeEqual is working properly.
+// t-table ref: http://www.sjsu.edu/faculty/gerstman/StatPrimer/t-table.pdf
+// Note that in reality there are roughly `2 * numTrials - 2` degrees of
+// freedom, not ∞. However, assuming `numTrials` is large, this doesn't
+// significantly affect the threshold.
+const T_THRESHOLD = 3.892;
+
+const t = getTValue(crypto.timingSafeEqual);
+assert(
+  Math.abs(t) < T_THRESHOLD,
+  `timingSafeEqual should not leak information from its execution time (t=${t})`
+);
+
+// As a sanity check to make sure the statistical tests are working, run the
+// same benchmarks again, this time with an unsafe comparison function. In this
+// case the t-value should be above the threshold.
+const unsafeCompare = (bufA, bufB) => bufA.equals(bufB);
+const t2 = getTValue(unsafeCompare);
+assert(
+  Math.abs(t2) > T_THRESHOLD,
+  `Buffer#equals should leak information from its execution time (t=${t2})`
+);
diff --git a/test/sequential/test-crypto-timing-safe-equal.js b/test/sequential/test-crypto-timing-safe-equal.js
@@ -32,135 +32,3 @@ assert.throws(function() {
 assert.throws(function() {
   crypto.timingSafeEqual(Buffer.from([1, 2]), 'not a buffer');
 }, 'should throw if the second argument is not a buffer');
-
-function getTValue(compareFunc) {
-  const numTrials = 10000;
-  const testBufferSize = 10000;
-  // Perform benchmarks to verify that timingSafeEqual is actually timing-safe.
-
-  const rawEqualBenches = Array(numTrials);
-  const rawUnequalBenches = Array(numTrials);
-
-  for (let i = 0; i < numTrials; i++) {
-
-    // The `runEqualBenchmark` and `runUnequalBenchmark` functions are
-    // intentionally redefined on every iteration of this loop, to avoid
-    // timing inconsistency.
-    function runEqualBenchmark(compareFunc, bufferA, bufferB) {
-      const startTime = process.hrtime();
-      const result = compareFunc(bufferA, bufferB);
-      const endTime = process.hrtime(startTime);
-
-      // Ensure that the result of the function call gets used, so it doesn't
-      // get discarded due to engine optimizations.
-      assert.strictEqual(result, true);
-      return endTime[0] * 1e9 + endTime[1];
-    }
-
-    // This is almost the same as the runEqualBenchmark function, but it's
-    // duplicated to avoid timing issues with V8 optimization/inlining.
-    function runUnequalBenchmark(compareFunc, bufferA, bufferB) {
-      const startTime = process.hrtime();
-      const result = compareFunc(bufferA, bufferB);
-      const endTime = process.hrtime(startTime);
-
-      assert.strictEqual(result, false);
-      return endTime[0] * 1e9 + endTime[1];
-    }
-
-    if (i % 2) {
-      const bufferA1 = Buffer.alloc(testBufferSize, 'A');
-      const bufferB = Buffer.alloc(testBufferSize, 'B');
-      const bufferA2 = Buffer.alloc(testBufferSize, 'A');
-      const bufferC = Buffer.alloc(testBufferSize, 'C');
-
-      // First benchmark: comparing two equal buffers
-      rawEqualBenches[i] = runEqualBenchmark(compareFunc, bufferA1, bufferA2);
-
-      // Second benchmark: comparing two unequal buffers
-      rawUnequalBenches[i] = runUnequalBenchmark(compareFunc, bufferB, bufferC);
-    } else {
-      // Swap the order of the benchmarks every second iteration, to avoid any
-      // patterns caused by memory usage.
-      const bufferB = Buffer.alloc(testBufferSize, 'B');
-      const bufferA1 = Buffer.alloc(testBufferSize, 'A');
-      const bufferC = Buffer.alloc(testBufferSize, 'C');
-      const bufferA2 = Buffer.alloc(testBufferSize, 'A');
-      rawUnequalBenches[i] = runUnequalBenchmark(compareFunc, bufferB, bufferC);
-      rawEqualBenches[i] = runEqualBenchmark(compareFunc, bufferA1, bufferA2);
-    }
-  }
-
-  const equalBenches = filterOutliers(rawEqualBenches);
-  const unequalBenches = filterOutliers(rawUnequalBenches);
-
-  // Use a two-sample t-test to determine whether the timing difference between
-  // the benchmarks is statistically significant.
-  // https://wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
-
-  const equalMean = mean(equalBenches);
-  const unequalMean = mean(unequalBenches);
-
-  const equalLen = equalBenches.length;
-  const unequalLen = unequalBenches.length;
-
-  const combinedStd = combinedStandardDeviation(equalBenches, unequalBenches);
-  const standardErr = combinedStd * Math.sqrt(1 / equalLen + 1 / unequalLen);
-
-  return (equalMean - unequalMean) / standardErr;
-}
-
-// Returns the mean of an array
-function mean(array) {
-  return array.reduce((sum, val) => sum + val, 0) / array.length;
-}
-
-// Returns the sample standard deviation of an array
-function standardDeviation(array) {
-  const arrMean = mean(array);
-  const total = array.reduce((sum, val) => sum + Math.pow(val - arrMean, 2), 0);
-  return Math.sqrt(total / (array.length - 1));
-}
-
-// Returns the common standard deviation of two arrays
-function combinedStandardDeviation(array1, array2) {
-  const sum1 = Math.pow(standardDeviation(array1), 2) * (array1.length - 1);
-  const sum2 = Math.pow(standardDeviation(array2), 2) * (array2.length - 1);
-  return Math.sqrt((sum1 + sum2) / (array1.length + array2.length - 2));
-}
-
-// Filter large outliers from an array. A 'large outlier' is a value that is at
-// least 50 times larger than the mean. This prevents the tests from failing
-// due to the standard deviation increase when a function unexpectedly takes
-// a very long time to execute.
-function filterOutliers(array) {
-  const arrMean = mean(array);
-  return array.filter((value) => value / arrMean < 50);
-}
-
-// t_(0.99995, ∞)
-// i.e. If a given comparison function is indeed timing-safe, the t-test result
-// has a 99.99% chance to be below this threshold. Unfortunately, this means
-// that this test will be a bit flakey and will fail 0.01% of the time even if
-// crypto.timingSafeEqual is working properly.
-// t-table ref: http://www.sjsu.edu/faculty/gerstman/StatPrimer/t-table.pdf
-// Note that in reality there are roughly `2 * numTrials - 2` degrees of
-// freedom, not ∞. However, assuming `numTrials` is large, this doesn't
-// significantly affect the threshold.
-const T_THRESHOLD = 3.892;
-
-const t = getTValue(crypto.timingSafeEqual);
-assert(
-  Math.abs(t) < T_THRESHOLD,
-  `timingSafeEqual should not leak information from its execution time (t=${t})`
-);
-
-// As a sanity check to make sure the statistical tests are working, run the
-// same benchmarks again, this time with an unsafe comparison function. In this
-// case the t-value should be above the threshold.
-const unsafeCompare = (bufA, bufB) => bufA.equals(bufB);
-const t2 = getTValue(unsafeCompare);
-assert(
-  Math.abs(t2) > T_THRESHOLD,
-  `Buffer#equals should leak information from its execution time (t=${t2})`
-);