From b9f488d8667e76aefb9c25d2ff51c81fd9f8539c Mon Sep 17 00:00:00 2001 From: Abdullah Atta Date: Wed, 3 Jan 2024 22:56:14 +0500 Subject: [PATCH] fix: Speed up cache reads (#255) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR speeds up `read.stream` and `read` by skipping `fs.stat` call if `size` was passed via `opts`. Currently, the only reason for doing a `stat` call is to get the size (and throw the size mismatch error if the size is different). This is unnecessary for 3 reasons: 1. In the case of `read.stream`, the stream already compares the sizes at the end and throws an error if there's a mismatch. 2. In the case of `read`, we can compare the sizes after reading the cache contents 3. In both cases we are already doing an integrity check which would automatically fail if there's a size difference since the hashes would be different. In this PR, the `stat` call is only made if the user does not pass a `size` property via `opts`. This makes sense because without knowing the `size`, the stream has to make an unnecessary `fs.read` call at the end before closing which has a significant cost (that cost is much, much greater than the cost of doing `fs.stat`). On my machine, the benchmarks with this change look like this: ``` ┌─────────┬─────────────────────┬─────────┬────────────────────┬───────────┬─────────┐ │ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │ ├─────────┼─────────────────────┼─────────┼────────────────────┼───────────┼─────────┤ │ 0 │ 'read.stream (new)' │ '4,643' │ 215352.03841424757 │ '±10.20%' │ 465 │ │ 1 │ 'read.stream (old)' │ '3,933' │ 254237.5665025663 │ '±7.17%' │ 394 │ │ 2 │ 'read (old)' │ '2,915' │ 343045.55719845917 │ '±13.42%' │ 292 │ │ 3 │ 'read (new)' │ '4,392' │ 227636.30011033904 │ '±12.14%' │ 449 │ └─────────┴─────────────────────┴─────────┴────────────────────┴───────────┴─────────┘ ``` That's a solid 16% improvement in the case of `read.stream` and 36% improvement in the case of `read`. --- lib/content/read.js | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/content/read.js b/lib/content/read.js index f41b539..a1fa8a0 100644 --- a/lib/content/read.js +++ b/lib/content/read.js @@ -13,18 +13,20 @@ async function read (cache, integrity, opts = {}) { const { size } = opts const { stat, cpath, sri } = await withContentSri(cache, integrity, async (cpath, sri) => { // get size - const stat = await fs.stat(cpath) + const stat = size ? { size } : await fs.stat(cpath) return { stat, cpath, sri } }) - if (typeof size === 'number' && stat.size !== size) { - throw sizeError(size, stat.size) - } if (stat.size > MAX_SINGLE_READ_SIZE) { return readPipeline(cpath, stat.size, sri, new Pipeline()).concat() } const data = await fs.readFile(cpath, { encoding: null }) + + if (stat.size !== data.length) { + throw sizeError(stat.size, data.length) + } + if (!ssri.checkData(data, sri)) { throw integrityError(sri, cpath) } @@ -55,13 +57,10 @@ function readStream (cache, integrity, opts = {}) { // Set all this up to run on the stream and then just return the stream Promise.resolve().then(async () => { const { stat, cpath, sri } = await withContentSri(cache, integrity, async (cpath, sri) => { - // just stat to ensure it exists - const stat = await fs.stat(cpath) + // get size + const stat = size ? { size } : await fs.stat(cpath) return { stat, cpath, sri } }) - if (typeof size === 'number' && size !== stat.size) { - return stream.emit('error', sizeError(size, stat.size)) - } return readPipeline(cpath, stat.size, sri, stream) }).catch(err => stream.emit('error', err))