
Commit 45ad77a

feat(verify): tested, working cache verifier/gc (#68)
Fixes: #3
1 parent d5d25ba commit 45ad77a

File tree: 6 files changed (+336, -127 lines)

README.md

Lines changed: 3 additions & 2 deletions
```diff
@@ -422,6 +422,7 @@ Completely resets the in-memory entry cache.
 Checks out and fixes up your cache:
 
 * Cleans up corrupted or invalid index entries.
+* Custom entry filtering options.
 * Garbage collects any content entries not referenced by the index.
 * Checks digests for all content entries and removes invalid content.
 * Fixes cache ownership.
@@ -440,8 +441,8 @@ reading/writing on the cache.
 
 * `opts.uid` - uid to assign to cache and its contents
 * `opts.gid` - gid to assign to cache and its contents
-* `opts.hashAlgorithm` - defaults to `'sha512'`. Hash to use for content checks.
-
+* `opts.filter` - receives a formatted entry. Return false to remove it.
+  Note: might be called more than once on the same entry.
 
 ##### Example
 
```
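As a usage sketch of the new `filter` option documented above (the cache path and `legacy:` key prefix are invented for illustration; the stats fields come from `lib/verify.js` below):

```js
const cacache = require('cacache')

cacache.verify('/tmp/my-cache', {
  // Keep an entry only if its key doesn't use the (made-up) 'legacy:' prefix.
  // Per the note above, the filter may see the same entry more than once.
  filter (entry) {
    return !entry.key.startsWith('legacy:')
  }
}).then(stats => {
  // Content left unreferenced by the surviving entries is garbage collected.
  console.log('verified:', stats.verifiedContent, 'reclaimed:', stats.reclaimedCount)
})
```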

lib/content/path.js

Lines changed: 7 additions & 3 deletions
```diff
@@ -13,8 +13,12 @@ function contentPath (cache, address, hashAlgorithm) {
   address = address && address.toLowerCase()
   hashAlgorithm = hashAlgorithm ? hashAlgorithm.toLowerCase() : 'sha512'
   return path.join.apply(path, [
-    cache,
-    `content-v${contentVer}`,
-    hashAlgorithm,
+    contentDir(cache),
+    hashAlgorithm
   ].concat(hashToSegments(address)))
 }
+
+module.exports._contentDir = contentDir
+function contentDir (cache) {
+  return path.join(cache, `content-v${contentVer}`)
+}
```
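A rough sketch of the layout this produces, assuming `hashToSegments` splits the (lowercased) address into two 2-character segments plus the remainder, which is what the three-segment join in `lib/verify.js` below relies on; the digest value and `content-v2` suffix are illustrative:

```js
const contentPath = require('./lib/content/path')

contentPath('/tmp/my-cache', 'DEADBEEF0123', 'sha512')
// => '/tmp/my-cache/content-v2/sha512/de/ad/beef0123'
//    (the address is lowercased before being segmented)

// The new helper exposes just the versioned content root:
contentPath._contentDir('/tmp/my-cache')
// => '/tmp/my-cache/content-v2'
```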

lib/entry-index.js

Lines changed: 1 addition & 0 deletions
```diff
@@ -160,6 +160,7 @@ function bucketEntries (cache, bucket, filter) {
   })
 }
 
+module.exports._bucketDir = bucketDir
 function bucketDir (cache) {
   return path.join(cache, `index-v${indexV}`)
 }
```
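The newly exported helper simply resolves the versioned index root, e.g. (the actual suffix depends on `indexV`):

```js
const index = require('./lib/entry-index')

index._bucketDir('/tmp/my-cache')
// => '/tmp/my-cache/index-v<indexV>'
```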

lib/verify.js

Lines changed: 166 additions & 121 deletions
```diff
@@ -2,121 +2,111 @@
 
 const Promise = require('bluebird')
 
-var checksumStream = require('checksum-stream')
-var fixOwner = require('./util/fix-owner')
-var fs = require('graceful-fs')
-var index = require('./entry-index')
-var lockfile = Promise.promisifyAll(require('lockfile'))
-var path = require('path')
-var pipe = Promise.promisify(require('mississippi').pipe)
-var rimraf = Promise.promisify(require('rimraf'))
+const checksumStream = require('checksum-stream')
+const contentPath = require('./content/path')
+const finished = Promise.promisify(require('mississippi').finished)
+const fixOwner = require('./util/fix-owner')
+const fs = require('graceful-fs')
+const glob = Promise.promisify(require('glob'))
+const index = require('./entry-index')
+const path = require('path')
+const pipe = Promise.promisify(require('mississippi').pipe)
+const rimraf = Promise.promisify(require('rimraf'))
 
 Promise.promisifyAll(fs)
 
+module.exports.lastRun = lastRun
+function lastRun (cache) {
+  return fs.readFileAsync(
+    path.join(cache, '_lastverified'), 'utf8'
+  ).then(data => new Date(+data))
+}
+
 module.exports = verify
 function verify (cache, opts) {
   opts = opts || {}
-  opts.log && opts.log.verbose('verify', 'verifying content cache at', cache)
-  const startTime = +(new Date())
-  return fixOwner.mkdirfix(
-    cache, opts.uid, opts.gid
-  ).then(() => {
-    const lockPath = path.join(cache, 'verify.lock')
-    const lock = lockfile.lockAsync(lockPath).disposer(() => {
-      return lockfile.unlock(lockPath)
-    })
-    return Promise.using(lock, () => {
-      return garbageCollect(cache, opts).then(gcStats => {
-        return tidyIndex(cache, opts).then(tidyStats => {
-          var stats = tidyStats
-          Object.keys(gcStats).forEach(function (key) {
-            stats[key] = gcStats[key]
-          })
-          return stats
-        })
-      }).then(stats => {
-        var verifile = path.join(cache, '_lastverified')
-        opts.log && opts.log.verbose('verify', 'writing verifile to ' + verifile)
-        return fs.writeFileAsync(
-          verifile, '' + (+(new Date()))
-        ).then(() => {
-          opts.log && opts.log.verbose('verify', 'fixing cache ownership')
-          return fixOwner.chownr(cache, opts.uid, opts.gid)
-        }).then(() => {
-          opts.log && opts.log.verbose('verify', 'clearing out tmp')
-          return rimraf(path.join(cache, 'tmp'))
-        }).then(() => stats)
+  opts.log && opts.log.silly('verify', 'verifying cache at', cache)
+  return Promise.reduce([
+    markStartTime,
+    fixPerms,
+    garbageCollect,
+    rebuildIndex,
+    cleanTmp,
+    writeVerifile,
+    markEndTime
+  ], (stats, step, i) => {
+    const label = step.name || `step #${i}`
+    const start = new Date()
+    return Promise.resolve(step(cache, opts)).then(s => {
+      s && Object.keys(s).forEach(k => {
+        stats[k] = s[k]
       })
+      const end = new Date()
+      if (!stats.runTime) { stats.runTime = {} }
+      stats.runTime[label] = end - start
+      return stats
     })
-  }).then(stats => {
-    stats.runTime = (+(new Date()) - startTime) / 1000
-    opts.log && opts.log.verbose('verify', 'final stats:', stats)
-    return stats
+  }, {}).tap(stats => {
+    stats.runTime.total = stats.endTime - stats.startTime
+    opts.log && opts.log.silly('verify', 'verification finished for', cache, 'in', `${stats.runTime.total}ms`)
   })
 }
 
-function tidyIndex (cache, opts) {
-  opts.log && opts.log.verbose('verify', 'tidying index')
-  return index.ls(cache).then(entries => {
-    return rimraf(path.join(cache, 'index')).then(() => {
-      var stats = {
-        entriesRemoved: 0,
-        digestMissing: 0,
-        totalEntries: 0
-      }
-      return Promise.reduce(Object.keys(entries), (stats, key) => {
-        var entry = entries[key]
-        if (!entry.digest) {
-          stats.digestMissing++
-          return stats
-        }
-        var content = path.join(cache, 'content', entries[key].digest)
-        return fs.statAsync(content).catch(err => {
-          if (err.code === 'ENOENT') {
-            stats.entriesRemoved++
-            return stats
-          }
-        }).then(() => {
-          stats.totalEntries++
-          return index.insert(cache, key, entry.digest, {
-            uid: opts.uid,
-            gid: opts.gid,
-            metadata: entry.metadata
-          }).then(() => stats)
-        })
-      }, stats)
-    })
-  })
+function markStartTime (cache, opts) {
+  return { startTime: new Date() }
+}
+
+function markEndTime (cache, opts) {
+  return { endTime: new Date() }
+}
+
+function fixPerms (cache, opts) {
+  opts.log && opts.log.silly('verify', 'fixing cache permissions')
+  return fixOwner.mkdirfix(cache, opts.uid, opts.gid).then(() => {
+    // TODO - fix file permissions too
+    fixOwner.chownr(cache, opts.uid, opts.gid)
+  }).then(() => null)
 }
 
+// Implements a naive mark-and-sweep tracing garbage collector.
+//
+// The algorithm is basically as follows:
+// 1. Read (and filter) all index entries ("pointers")
+// 2. Mark each algo/digest combo as "live"
+// 3. Read entire filesystem tree in `content-vX/` dir
+// 4. If content is live, verify its checksum and delete it if it fails
+// 5. If content is not marked as live, rimraf it.
+//
 function garbageCollect (cache, opts) {
-  opts.log && opts.log.verbose('verify', 'garbage collecting content')
-  return index.ls(cache).then(entries => {
-    var byDigest = {}
-    Object.keys(entries).forEach(function (k) {
-      byDigest[entries[k].digest] = entries[k]
-    })
-    var contentDir = path.join(cache, 'content')
-    return fs.readdirAsync(contentDir).catch(err => {
-      if (err.code === 'ENOENT') {
-        return
-      } else {
-        throw err
-      }
+  opts.log && opts.log.silly('verify', 'garbage collecting content')
+  const indexStream = index.lsStream(cache)
+  const liveContent = new Set()
+  indexStream.on('data', entry => {
+    if (opts && opts.filter && !opts.filter(entry)) { return }
+    liveContent.add(`${entry.hashAlgorithm}-${entry.digest}`)
+  })
+  return finished(indexStream).then(() => {
+    const contentDir = contentPath._contentDir(cache)
+    return glob(path.join(contentDir, '**'), {
+      follow: false,
+      nodir: true,
+      nosort: true
     }).then(files => {
-      var stats = {
+      return Promise.resolve({
         verifiedContent: 0,
-        collectedCount: 0,
+        reclaimedCount: 0,
         reclaimedSize: 0,
+        badContentCount: 0,
         keptSize: 0
-      }
-      return Promise.reduce(files, (stats, f) => {
-        var fullPath = path.join(contentDir, f)
-        if (byDigest[f]) {
-          var algo = opts.hashAlgorithm || 'sha512'
-          return verifyContent(fullPath, algo).then(info => {
+      }).tap((stats) => Promise.map(files, (f) => {
+        const split = f.split(/[/\\]/)
+        const digest = split.slice(split.length - 3).join('')
+        const algo = split[split.length - 4]
+        if (liveContent.has(`${algo}-${digest}`)) {
+          return verifyContent(f, digest, algo).then(info => {
             if (!info.valid) {
-              stats.collectedCount++
+              stats.reclaimedCount++
+              stats.badContentCount++
               stats.reclaimedSize += info.size
             } else {
               stats.verifiedContent++
@@ -125,44 +115,99 @@ function garbageCollect (cache, opts) {
             return stats
           })
         } else {
-          stats.collectedCount++
-          return fs.statAsync(fullPath).then(s => {
-            stats.reclaimedSize += s.size
-            return rimraf(path.join(contentDir, f)).then(() => stats)
+          // No entries refer to this content. We can delete.
+          stats.reclaimedCount++
+          return fs.statAsync(f).then(s => {
+            return rimraf(f).then(() => {
+              stats.reclaimedSize += s.size
+              return stats
+            })
           })
         }
-      }, stats)
+      }, {concurrency: opts.concurrency || 20}))
     })
   })
 }
 
-function verifyContent (filepath, algo) {
+function verifyContent (filepath, digest, algorithm) {
   return fs.statAsync(filepath).then(stat => {
-    var reader = fs.createReadStream(filepath)
-    var checksummer = checksumStream({
-      digest: path.basename(filepath),
-      algorithm: algo
-    })
-    var contentInfo = {
+    const reader = fs.createReadStream(filepath)
+    const checksummer = checksumStream({digest, algorithm})
+    const contentInfo = {
       size: stat.size,
       valid: true
     }
     checksummer.on('data', () => {})
-    return pipe(reader, checksummer).catch(err => {
-      if (err && err.code === 'EBADCHECKSUM') {
-        return rimraf(filepath).then(() => {
-          contentInfo.valid = false
-        })
-      } else {
-        throw err
-      }
+    return pipe(reader, checksummer).catch({code: 'EBADCHECKSUM'}, () => {
+      return rimraf(filepath).then(() => {
+        contentInfo.valid = false
+      })
     }).then(() => contentInfo)
+  }).catch({code: 'ENOENT'}, () => ({size: 0, valid: false}))
+}
+
+function rebuildIndex (cache, opts) {
+  opts.log && opts.log.silly('verify', 'rebuilding index')
+  return index.ls(cache).then(entries => {
+    const stats = {
+      missingContent: 0,
+      rejectedEntries: 0,
+      totalEntries: 0
+    }
+    const buckets = {}
+    for (let k in entries) {
+      if (entries.hasOwnProperty(k)) {
+        const hashed = index._hashKey(k)
+        const entry = entries[k]
+        const excluded = opts && opts.filter && !opts.filter(entry)
+        excluded && stats.rejectedEntries++
+        if (buckets[hashed] && !excluded) {
+          buckets[hashed].push(entry)
+        } else if (buckets[hashed] && excluded) {
+          // skip
+        } else if (excluded) {
+          buckets[hashed] = []
+          buckets[hashed]._path = index._bucketPath(cache, k)
+        } else {
+          buckets[hashed] = [entry]
+          buckets[hashed]._path = index._bucketPath(cache, k)
+        }
+      }
+    }
+    return Promise.map(Object.keys(buckets), key => {
+      return rebuildBucket(cache, buckets[key], stats, opts)
+    }, {concurrency: opts.concurrency || 20}).then(() => stats)
   })
 }
 
-module.exports.lastRun = lastRun
-function lastRun (cache) {
-  return fs.readFileAsync(
-    path.join(cache, '_lastverified'), 'utf8'
-  ).then(data => new Date(+data))
+function rebuildBucket (cache, bucket, stats, opts) {
+  return fs.truncateAsync(bucket._path).then(() => {
+    // This needs to be serialized because cacache explicitly
+    // lets very racy bucket conflicts clobber each other.
+    return Promise.mapSeries(bucket, entry => {
+      const content = contentPath(cache, entry.digest, entry.hashAlgorithm)
+      return fs.statAsync(content).then(() => {
+        return index.insert(cache, entry.key, entry.digest, {
+          uid: opts.uid,
+          gid: opts.gid,
+          hashAlgorithm: entry.hashAlgorithm,
+          metadata: entry.metadata
+        }).then(() => { stats.totalEntries++ })
+      }).catch({code: 'ENOENT'}, () => {
+        stats.rejectedEntries++
+        stats.missingContent++
+      })
+    })
+  })
+}
+
+function cleanTmp (cache, opts) {
+  opts.log && opts.log.silly('verify', 'cleaning tmp directory')
+  return rimraf(path.join(cache, 'tmp'))
+}
+
+function writeVerifile (cache, opts) {
+  const verifile = path.join(cache, '_lastverified')
+  opts.log && opts.log.silly('verify', 'writing verifile to ' + verifile)
+  return fs.writeFileAsync(verifile, '' + (+(new Date())))
 }
```
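A worked sketch of the path parsing inside `garbageCollect` above: since `contentPath` lays content out as `<content root>/<algo>/<seg1>/<seg2>/<rest>`, joining the last three path segments recovers the digest, and the fourth-from-last segment is the hash algorithm (the concrete path below is invented for illustration):

```js
const f = '/tmp/my-cache/content-v2/sha512/de/ad/beef0123'
const split = f.split(/[/\\]/)
// ['', 'tmp', 'my-cache', 'content-v2', 'sha512', 'de', 'ad', 'beef0123']
const digest = split.slice(split.length - 3).join('')  // 'deadbeef0123'
const algo = split[split.length - 4]                   // 'sha512'
// `${algo}-${digest}` is the key checked against the liveContent set.
```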

package.json

Lines changed: 0 additions & 1 deletion
```diff
@@ -52,7 +52,6 @@
     "chownr": "^1.0.1",
     "dezalgo": "^1.0.3",
     "graceful-fs": "^4.1.10",
-    "lockfile": "^1.0.2",
     "mississippi": "^1.2.0",
     "mkdirp": "^0.5.1",
     "once": "^1.4.0",
```
