From b9f488d8667e76aefb9c25d2ff51c81fd9f8539c Mon Sep 17 00:00:00 2001
From: Abdullah Atta <thecodrr@protonmail.com>
Date: Wed, 3 Jan 2024 22:56:14 +0500
Subject: [PATCH] fix: Speed up cache reads (#255)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR speeds up `read.stream` and `read` by skipping `fs.stat` call if
`size` was passed via `opts`. Currently, the only reason for doing a
`stat` call is to get the size (and throw the size mismatch error if the
size is different). This is unnecessary for 3 reasons:

1. In the case of `read.stream`, the stream already compares the sizes
at the end and throws an error if there's a mismatch.
2. In the case of `read`, we can compare the sizes after reading the
cache contents
3. In both cases we are already doing an integrity check which would
automatically fail if there's a size difference since the hashes would
be different.

In this PR, the `stat` call is only made if the user does not pass a
`size` property via `opts`. This makes sense because without knowing the
`size`, the stream has to make an unnecessary `fs.read` call at the end
before closing which has a significant cost (that cost is much, much
greater than the cost of doing `fs.stat`).

On my machine, the benchmarks with this change look like this:
```
┌─────────┬─────────────────────┬─────────┬────────────────────┬───────────┬─────────┐
│ (index) │      Task Name      │ ops/sec │ Average Time (ns)  │  Margin   │ Samples │
├─────────┼─────────────────────┼─────────┼────────────────────┼───────────┼─────────┤
│    0    │ 'read.stream (new)' │ '4,643' │ 215352.03841424757 │ '±10.20%' │   465   │
│    1    │ 'read.stream (old)' │ '3,933' │ 254237.5665025663  │ '±7.17%'  │   394   │
│    2    │    'read (old)'     │ '2,915' │ 343045.55719845917 │ '±13.42%' │   292   │
│    3    │    'read (new)'     │ '4,392' │ 227636.30011033904 │ '±12.14%' │   449   │
└─────────┴─────────────────────┴─────────┴────────────────────┴───────────┴─────────┘
```

That's a solid 16% improvement in the case of `read.stream` and 36%
improvement in the case of `read`.
---
 lib/content/read.js | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/lib/content/read.js b/lib/content/read.js
index f41b539..a1fa8a0 100644
--- a/lib/content/read.js
+++ b/lib/content/read.js
@@ -13,18 +13,20 @@ async function read (cache, integrity, opts = {}) {
   const { size } = opts
   const { stat, cpath, sri } = await withContentSri(cache, integrity, async (cpath, sri) => {
     // get size
-    const stat = await fs.stat(cpath)
+    const stat = size ? { size } : await fs.stat(cpath)
     return { stat, cpath, sri }
   })
-  if (typeof size === 'number' && stat.size !== size) {
-    throw sizeError(size, stat.size)
-  }
 
   if (stat.size > MAX_SINGLE_READ_SIZE) {
     return readPipeline(cpath, stat.size, sri, new Pipeline()).concat()
   }
 
   const data = await fs.readFile(cpath, { encoding: null })
+
+  if (stat.size !== data.length) {
+    throw sizeError(stat.size, data.length)
+  }
+
   if (!ssri.checkData(data, sri)) {
     throw integrityError(sri, cpath)
   }
@@ -55,13 +57,10 @@ function readStream (cache, integrity, opts = {}) {
   // Set all this up to run on the stream and then just return the stream
   Promise.resolve().then(async () => {
     const { stat, cpath, sri } = await withContentSri(cache, integrity, async (cpath, sri) => {
-      // just stat to ensure it exists
-      const stat = await fs.stat(cpath)
+      // get size
+      const stat = size ? { size } : await fs.stat(cpath)
       return { stat, cpath, sri }
     })
-    if (typeof size === 'number' && size !== stat.size) {
-      return stream.emit('error', sizeError(size, stat.size))
-    }
 
     return readPipeline(cpath, stat.size, sri, stream)
   }).catch(err => stream.emit('error', err))