events: optimize arrayClone by copying forward #10571

Closed
wants to merge 1 commit into
from

Projects

None yet
@bmeurer
Contributor
bmeurer commented Jan 2, 2017 edited

It's slightly faster (and more readable) to copy array elements
in forward direction. This way it also avoids the ToBoolean and
the postfix count operation.

Checklist
  • make -j4 test (UNIX)
  • commit message follows commit guidelines
Affected core subsystem(s)

events

@bmeurer bmeurer events: Optimize arrayClone by copying in forward direction.
It's slightly faster (and more readable) to copy array elements
in forward direction. This way it also avoids the ToBoolean and
the postfix count operation.
d81e075
@mscdex mscdex added the performance label Jan 2, 2017
@lpinca
lpinca approved these changes Jan 2, 2017 View changes
@mscdex
Contributor
mscdex commented Jan 2, 2017

FWIW I just tested this locally with benchmark/events/ee-emit.js and saw that with this change small array (e.g. 10) copies are actually slower, but the copying only become slightly faster once you increase the size of the array substantially (e.g. 1000).

@bmeurer
Contributor
bmeurer commented Jan 2, 2017

I see improvements even for small arrays (size 10), on my Z620, i.e.

$ cat bench.js
"use strict";

function arrayCloneOld(a, l) {
  var c = new Array(l);
  while (l--) c[l] = a[l];
  return c;
}

function arrayCloneNew(a, l) {
  var c = new Array(l);
  for (var i = 0; i < l; ++i) c[i] = a[i];
  return c;
}

// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
  arrayCloneNew(a, a.length);
}

console.time('old');
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
}
console.timeEnd('old');

console.time('new');
for (var i = 0; i < 10000; ++i) {
  arrayCloneNew(a, a.length);
}
console.timeEnd('new');
$ node --version
v8.0.0-pre
$ node bench
old: 0.963ms
new: 0.598ms
@bmeurer
Contributor
bmeurer commented Jan 2, 2017

On my 2015 MBP it's even more visible (using Node.js master):

$ node bench.js
old: 1.126ms
new: 0.538ms
@vsemozhetbyt
Member
vsemozhetbyt commented Jan 2, 2017 edited

It seems the Warmup does not work as expected. With Node.js 7.3.0 on Windows 7 x64:

'use strict';

function arrayCloneOld(a, l) {
  var c = new Array(l);
  while (l--) c[l] = a[l];
  return c;
}

function arrayCloneNew(a, l) {
  var c = new Array(l);
  for (var i = 0; i < l; ++i) c[i] = a[i];
  return c;
}

// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
  arrayCloneOld(a, a.length);
}

console.time('old');
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
}
console.timeEnd('old');

console.time('old');
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
}
console.timeEnd('old');
old: 1.416ms
old: 0.853ms
'use strict';

function arrayCloneOld(a, l) {
  var c = new Array(l);
  while (l--) c[l] = a[l];
  return c;
}

function arrayCloneNew(a, l) {
  var c = new Array(l);
  for (var i = 0; i < l; ++i) c[i] = a[i];
  return c;
}

// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
  arrayCloneNew(a, a.length);
  arrayCloneNew(a, a.length);
}

console.time('new');
for (var i = 0; i < 10000; ++i) {
  arrayCloneNew(a, a.length);
}
console.timeEnd('new');

console.time('new');
for (var i = 0; i < 10000; ++i) {
  arrayCloneNew(a, a.length);
}
console.timeEnd('new');
new: 1.319ms
new: 0.781ms
@vsemozhetbyt
Member
"use strict";

function arrayCloneOld(a, l) {
  var c = new Array(l);
  while (l--) c[l] = a[l];
  return c;
}

function arrayCloneNew(a, l) {
  var c = new Array(l);
  for (var i = 0; i < l; ++i) c[i] = a[i];
  return c;
}

// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
  arrayCloneNew(a, a.length);
}

console.time('new');
for (var i = 0; i < 10000; ++i) {
  arrayCloneNew(a, a.length);
}
console.timeEnd('new');

console.time('old');
for (var i = 0; i < 10000; ++i) {
  arrayCloneOld(a, a.length);
}
console.timeEnd('old');
new: 1.597ms
old: 0.843ms
@bmeurer
Contributor
bmeurer commented Jan 2, 2017

Ah, good catch. There's some inlining going on here.

@mscdex
Contributor
mscdex commented Jan 2, 2017

I'm not sure why it does indeed seem to be faster on its own vs testing it implicitly via the emit() benchmark.

@vsemozhetbyt
Member
vsemozhetbyt commented Jan 2, 2017 edited

@bmeurer From my experience, the first cycle inside another cycle always run faster. So I've made this benchmark template for myself, trying firstly to get commensurably equal results for the same function. I use the benchmark.js for balanced tests and vanilla cycles for simplified tests. As you can see, the first ones do not need warm-up, the second ones do need.

Template code and its results (click me):
/******************************************************************************/
'use strict';
/******************************************************************************/
const functions = [
  function warmup() { String(1); },

  function test01() { String(1); },
  function test02() { String(1); },
];
/******************************************************************************/
console.log(`
  1. Balanced benchmark
`);
/******************************************************************************/
const suite = require('benchmark').Suite();

functions.forEach((func) => { suite.add(func.name, func); });
suite.on('cycle', (evt) => { console.log(String(evt.target)); }).run({ async: false });
/******************************************************************************/
console.log(`
  2. Simplified benchmark
`);
/******************************************************************************/
functions.forEach((func) => {
  const MS_IN_S = 1000;
  const numberOfCycles = 1e8;
  const start = Date.now();

  for (let i = 0; i < numberOfCycles; i++) func();

  console.log(`${func.name} x ${
    Math.round(numberOfCycles / ((Date.now() - start) / MS_IN_S)).toLocaleString()
  } ops/sec`);
});
  1. Balanced benchmark

warmup x 31,788,736 ops/sec ±1.09% (83 runs sampled)
test01 x 31,686,704 ops/sec ±0.97% (82 runs sampled)
test02 x 31,480,559 ops/sec ±1.04% (83 runs sampled)

  2. Simplified benchmark

warmup x 52,356,021 ops/sec
test01 x 39,510,075 ops/sec
test02 x 39,745,628 ops/sec

And here are the results for big and small arrays with this script (warmup data output is canceled for the simple cycles).

Clone array benchmarks (click me):
/******************************************************************************/
'use strict';
/******************************************************************************/
var smallArray = Array(1e1).fill(0).map((elm, i) =>  elm + i);
var largeArray = Array(1e4).fill(0).map((elm, i) =>  elm + i);

const functions = [
  function warmup() {},

  function smallArrayCloneOld() {
    var l = smallArray.length;
    var c = new Array(l);
    while (l--) c[l] = smallArray[l];
    return c;
  },

  function smallArrayCloneNew() {
    var l = smallArray.length;
    var c = new Array(l);
    for (var i = 0; i < l; ++i) c[i] = smallArray[i];
    return c;
  },

  function largeArrayCloneOld() {
    var l = largeArray.length;
    var c = new Array(l);
    while (l--) c[l] = largeArray[l];
    return c;
  },

  function largeArrayCloneNew() {
    var l = largeArray.length;
    var c = new Array(l);
    for (var i = 0; i < l; ++i) c[i] = largeArray[i];
    return c;
  },
];
/******************************************************************************/
console.log(`
  1. Balanced benchmark
`);
/******************************************************************************/
const suite = require('benchmark').Suite();

functions.forEach((func) => { if (func.name !== 'warmup') suite.add(func.name, func); });
suite.on('cycle', (evt) => { console.log(String(evt.target)); }).run({ async: false });
/******************************************************************************/
console.log(`
  2. Simplified benchmark
`);
/******************************************************************************/
functions.forEach((func) => {
  const MS_IN_S = 1000;
  const numberOfCycles = 1e6;
  const start = Date.now();

  for (let i = 0; i < numberOfCycles; i++) func();

  if (func.name !== 'warmup') {
    console.log(`${func.name} x ${
      Math.round(numberOfCycles / ((Date.now() - start) / MS_IN_S)).toLocaleString()
    } ops/sec`);
  }
});
  1. Balanced benchmark

smallArrayCloneOld x 12,660,628 ops/sec ±0.47% (87 runs sampled)
smallArrayCloneNew x 13,510,089 ops/sec ±1.32% (85 runs sampled)
largeArrayCloneOld x 21,524 ops/sec ±0.84% (87 runs sampled)
largeArrayCloneNew x 23,764 ops/sec ±0.99% (87 runs sampled)

  2. Simplified benchmark

smallArrayCloneOld x 12,820,513 ops/sec
smallArrayCloneNew x 16,129,032 ops/sec
largeArrayCloneOld x 22,063 ops/sec
largeArrayCloneNew x 24,241 ops/sec

However, in the real life the situation with `emit()` may be exposed to more different factors, as @mscdex notices.
@benjamingr
Member

I'm +1 on this change because it makes clearer code. I don't think we'll see actual considerable performance difference in the benchmarks.

@bmeurer
Contributor
bmeurer commented Jan 2, 2017

Thanks for the investigations. I was looking at it from the VMs perspective; in this particular case by investigating Ignition+TurboFan performance, where we currently suffer a 2x slowdown due to the ToNumber from the postfix count operation (the i--). While we do want to improve that of course, there's no real benefit in doing the while (i--) { } approach here IMHO.

@Trott
Trott approved these changes Jan 2, 2017 View changes

LGTM if CI is

@Trott
Member
Trott commented Jan 2, 2017

Nits on the commit message (but they can be fixed by whoever lands this if you don't get around to it):

  • s/Optimize/optimize/
  • no period/full-stop at the end of the first line
  • first line should be less than 50 chars
@cjihrig
cjihrig approved these changes Jan 2, 2017 View changes
@bmeurer bmeurer changed the title from events: Optimize arrayClone by copying in forward direction. to events: optimize arrayClone by copying in forward direction Jan 3, 2017
@bmeurer bmeurer changed the title from events: optimize arrayClone by copying in forward direction to events: optimize arrayClone by copying forward Jan 3, 2017
@jasnell
jasnell approved these changes Jan 3, 2017 View changes
@bmeurer
Contributor
bmeurer commented Jan 3, 2017

So, what about the test failures? I can't seem to figure out how they related to my PR?

@cjihrig
Member
cjihrig commented Jan 3, 2017

@bmeurer they likely aren't related. I didn't go through all the failures, but I see a couple red CI runs, and some processes left behind on the CI machines. We'll do another CI run once it's all cleaned up.

@Trott
Member
Trott commented Jan 4, 2017

Only Raspberry Pi failed, and it seems to have been build related. Let's try again.

arm-fanned CI: https://ci.nodejs.org/job/node-test-commit-arm-fanned/6279/

@jasnell
Member
jasnell commented Jan 4, 2017

New CI run on rpi looks good

@fhinkel
fhinkel approved these changes Jan 4, 2017 View changes
@fhinkel fhinkel added a commit that referenced this pull request Jan 4, 2017
@bmeurer @fhinkel bmeurer + fhinkel events: Optimize arrayClone.
Optimize arrayClone by copying forward.

It's slightly faster (and more readable) to copy array elements
in forward direction. This way it also avoids the ToBoolean and
the postfix count operation.

PR-URL: #10571
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com>
Reviewed-By: Rich Trott <rtrott@gmail.com>
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Franziska Hinkelmann <franziska.hinkelmann@gmail.com>
21ca9e7
@fhinkel fhinkel added a commit that referenced this pull request Jan 4, 2017
@bmeurer @fhinkel bmeurer + fhinkel events: optimize arrayClone by copying forward
Optimize arrayClone by copying forward.

It's slightly faster (and more readable) to copy array elements
in forward direction. This way it also avoids the ToBoolean and
the postfix count operation.

PR-URL: #10571
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com>
Reviewed-By: Rich Trott <rtrott@gmail.com>
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Franziska Hinkelmann <franziska.hinkelmann@gmail.com>
f2f997a
@fhinkel
Member
fhinkel commented Jan 4, 2017

Landed in f2f997a. @bmeurer Should this be backported to v4 and v6 or does it depend on newer V8 changes?

@fhinkel fhinkel closed this Jan 4, 2017
@evanlucas evanlucas added a commit that referenced this pull request Jan 4, 2017
@bmeurer @evanlucas bmeurer + evanlucas events: optimize arrayClone by copying forward
Optimize arrayClone by copying forward.

It's slightly faster (and more readable) to copy array elements
in forward direction. This way it also avoids the ToBoolean and
the postfix count operation.

PR-URL: #10571
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com>
Reviewed-By: Rich Trott <rtrott@gmail.com>
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Franziska Hinkelmann <franziska.hinkelmann@gmail.com>
66a9f01
@evanlucas evanlucas referenced this pull request Jan 4, 2017
Merged

v7.4.0 release proposal #10589

@bmeurer
Contributor
bmeurer commented Jan 4, 2017

It doesn't depend on a specific V8 version. For Crankshaft this is mostly identical or slightly faster ever since. For TurboFan the new version is better.

@MylesBorins
Member

I'm going to opt to hold off on this optimization for a bit. I didn't read through the entire thread, but are the perf gains significant?

@bmeurer
Contributor
bmeurer commented Jan 23, 2017

With the default configuration there are no significant performance gains. With TurboFan there are significant gains. So skipping this for now sounds perfectly reasonable.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment