2
2
3
3
const Promise = require ( 'bluebird' )
4
4
5
- var checksumStream = require ( 'checksum-stream' )
6
- var fixOwner = require ( './util/fix-owner' )
7
- var fs = require ( 'graceful-fs' )
8
- var index = require ( './entry-index' )
9
- var lockfile = Promise . promisifyAll ( require ( 'lockfile' ) )
10
- var path = require ( 'path' )
11
- var pipe = Promise . promisify ( require ( 'mississippi' ) . pipe )
12
- var rimraf = Promise . promisify ( require ( 'rimraf' ) )
5
+ const checksumStream = require ( 'checksum-stream' )
6
+ const contentPath = require ( './content/path' )
7
+ const finished = Promise . promisify ( require ( 'mississippi' ) . finished )
8
+ const fixOwner = require ( './util/fix-owner' )
9
+ const fs = require ( 'graceful-fs' )
10
+ const glob = Promise . promisify ( require ( 'glob' ) )
11
+ const index = require ( './entry-index' )
12
+ const path = require ( 'path' )
13
+ const pipe = Promise . promisify ( require ( 'mississippi' ) . pipe )
14
+ const rimraf = Promise . promisify ( require ( 'rimraf' ) )
13
15
14
16
Promise . promisifyAll ( fs )
15
17
18
module.exports.lastRun = lastRun
// Reads the `_lastverified` stamp written by a previous verify run and
// reports it as a Date. Rejects if the cache has never been verified.
function lastRun (cache) {
  const verifile = path.join(cache, '_lastverified')
  return fs.readFileAsync(verifile, 'utf8').then(stamp => {
    return new Date(Number(stamp))
  })
}
24
+
16
25
module.exports = verify
// The line above replaces the whole `module.exports` object, which would
// silently drop the `lastRun` export attached earlier in this file.
// Re-attach it so `require('./verify').lastRun` keeps working.
module.exports.lastRun = lastRun

// Runs the full verification pipeline over `cache`: fix permissions,
// garbage-collect unreferenced/corrupt content, rebuild the index, clean
// tmp, and stamp `_lastverified`. Resolves to a stats object that merges
// each step's partial stats plus per-step timings under `stats.runTime`.
function verify (cache, opts) {
  opts = opts || {}
  opts.log && opts.log.silly('verify', 'verifying cache at', cache)
  return Promise.reduce([
    markStartTime,
    fixPerms,
    garbageCollect,
    rebuildIndex,
    cleanTmp,
    writeVerifile,
    markEndTime
  ], (stats, step, i) => {
    const label = step.name || `step #${i}`
    const start = new Date()
    // Each step may return a partial stats object (or null); fold any
    // returned fields into the shared accumulator.
    return Promise.resolve(step(cache, opts)).then(s => {
      s && Object.keys(s).forEach(k => {
        stats[k] = s[k]
      })
      const end = new Date()
      if (!stats.runTime) { stats.runTime = {} }
      stats.runTime[label] = end - start
      return stats
    })
  }, {}).tap(stats => {
    // startTime/endTime come from the markStartTime/markEndTime steps.
    stats.runTime.total = stats.endTime - stats.startTime
    opts.log && opts.log.silly('verify', 'verification finished for', cache, 'in', `${stats.runTime.total}ms`)
  })
}
57
54
58
// Pipeline step: records the wall-clock moment the verify run began.
// Contributes `startTime` to the shared stats object.
function markStartTime (cache, opts) {
  const startTime = new Date()
  return { startTime }
}
58
// Pipeline step: records the wall-clock moment the verify run finished.
// Contributes `endTime` to the shared stats object.
function markEndTime (cache, opts) {
  const endTime = new Date()
  return { endTime }
}
62
+
63
// Pipeline step: ensures the cache root exists and is owned by the
// requested uid/gid. Resolves to null so it contributes no stats fields.
function fixPerms (cache, opts) {
  opts.log && opts.log.silly('verify', 'fixing cache permissions')
  return fixOwner.mkdirfix(cache, opts.uid, opts.gid).then(() => {
    // TODO - fix file permissions too
    // BUGFIX: this promise was previously not returned, so verify could
    // race ahead before chownr finished and chownr errors were swallowed.
    return fixOwner.chownr(cache, opts.uid, opts.gid)
  }).then(() => null)
}
91
70
71
// Implements a naive mark-and-sweep tracing garbage collector.
//
// The algorithm is basically as follows:
// 1. Read (and filter) all index entries ("pointers")
// 2. Mark each algo/digest combo as "live"
// 3. Read entire filesystem tree in `content-vX/` dir
// 4. If content is live, verify its checksum and delete it if it fails
// 5. If content is not marked as live, rimraf it.
//
function garbageCollect (cache, opts) {
  opts.log && opts.log.silly('verify', 'garbage collecting content')
  const indexStream = index.lsStream(cache)
  const liveContent = new Set()
  // Mark phase: every (possibly filtered) index entry keeps its content alive.
  indexStream.on('data', entry => {
    if (opts && opts.filter && !opts.filter(entry)) { return }
    liveContent.add(`${entry.hashAlgorithm}-${entry.digest}`)
  })
  return finished(indexStream).then(() => {
    const contentDir = contentPath._contentDir(cache)
    return glob(path.join(contentDir, '**'), {
      follow: false,
      nodir: true,
      nosort: true
    })
  }).then(files => {
    const stats = {
      verifiedContent: 0,
      reclaimedCount: 0,
      reclaimedSize: 0,
      badContentCount: 0,
      keptSize: 0
    }
    // Sweep phase, bounded concurrency.
    return Promise.map(files, f => {
      // Content paths end in <algo>/<aa>/<bb>/<rest-of-digest>.
      const parts = f.split(/[/\\]/)
      const digest = parts.slice(-3).join('')
      const algo = parts[parts.length - 4]
      if (liveContent.has(`${algo}-${digest}`)) {
        return verifyContent(f, digest, algo).then(info => {
          if (info.valid) {
            // NOTE(review): the valid-branch accounting was truncated in the
            // diff view; keptSize increment reconstructed — confirm upstream.
            stats.verifiedContent++
            stats.keptSize += info.size
          } else {
            stats.reclaimedCount++
            stats.badContentCount++
            stats.reclaimedSize += info.size
          }
          return stats
        })
      }
      // No entries refer to this content. We can delete.
      stats.reclaimedCount++
      return fs.statAsync(f).then(s => {
        return rimraf(f).then(() => {
          stats.reclaimedSize += s.size
          return stats
        })
      })
    }, { concurrency: opts.concurrency || 20 }).then(() => stats)
  })
}
138
131
139
// Checks one content file against its expected digest, deleting it on a
// checksum mismatch. Resolves to { size, valid }; a file that vanished
// before it could be checked resolves as { size: 0, valid: false }.
function verifyContent (filepath, digest, algorithm) {
  return fs.statAsync(filepath).then(stat => {
    const contentInfo = {
      size: stat.size,
      valid: true
    }
    const checker = checksumStream({ digest, algorithm })
    // Sink the data so the stream actually flows.
    checker.on('data', () => {})
    return pipe(fs.createReadStream(filepath), checker).catch({ code: 'EBADCHECKSUM' }, () => {
      // Corrupt content: reclaim the file and report it invalid.
      return rimraf(filepath).then(() => {
        contentInfo.valid = false
      })
    }).then(() => contentInfo)
  }).catch({ code: 'ENOENT' }, () => ({ size: 0, valid: false }))
}
148
+
149
// Re-groups all surviving index entries by bucket file and rewrites each
// bucket, dropping entries rejected by `opts.filter`. Resolves to
// { missingContent, rejectedEntries, totalEntries } stats.
function rebuildIndex (cache, opts) {
  opts.log && opts.log.silly('verify', 'rebuilding index')
  return index.ls(cache).then(entries => {
    const stats = {
      missingContent: 0,
      rejectedEntries: 0,
      totalEntries: 0
    }
    const buckets = {}
    Object.keys(entries).forEach(k => {
      const hashed = index._hashKey(k)
      const entry = entries[k]
      const excluded = opts && opts.filter && !opts.filter(entry)
      if (excluded) { stats.rejectedEntries++ }
      if (!buckets[hashed]) {
        // First entry seen for this bucket: remember where it lives on disk.
        buckets[hashed] = excluded ? [] : [entry]
        buckets[hashed]._path = index._bucketPath(cache, k)
      } else if (!excluded) {
        buckets[hashed].push(entry)
      }
      // excluded + existing bucket: skip.
    })
    return Promise.map(Object.keys(buckets), key => {
      return rebuildBucket(cache, buckets[key], stats, opts)
    }, { concurrency: opts.concurrency || 20 }).then(() => stats)
  })
}
162
182
163
// Rewrites a single index bucket file from its surviving entries, keeping
// only entries whose content still exists on disk.
function rebuildBucket (cache, bucket, stats, opts) {
  return fs.truncateAsync(bucket._path).then(() => {
    // This needs to be serialized because cacache explicitly
    // lets very racy bucket conflicts clobber each other.
    return Promise.mapSeries(bucket, entry => {
      const content = contentPath(cache, entry.digest, entry.hashAlgorithm)
      return fs.statAsync(content).then(() => {
        const entryOpts = {
          uid: opts.uid,
          gid: opts.gid,
          hashAlgorithm: entry.hashAlgorithm,
          metadata: entry.metadata
        }
        return index.insert(cache, entry.key, entry.digest, entryOpts).then(() => {
          stats.totalEntries++
        })
      }).catch({ code: 'ENOENT' }, () => {
        // Content is gone: drop the pointer instead of resurrecting it.
        stats.rejectedEntries++
        stats.missingContent++
      })
    })
  })
}
203
+
204
// Pipeline step: blows away the cache's tmp scratch directory.
function cleanTmp (cache, opts) {
  opts.log && opts.log.silly('verify', 'cleaning tmp directory')
  const tmpDir = path.join(cache, 'tmp')
  return rimraf(tmpDir)
}
208
+
209
// Pipeline step: stamps `_lastverified` with the current epoch-ms time,
// which `lastRun` reads back later.
function writeVerifile (cache, opts) {
  const verifile = path.join(cache, '_lastverified')
  opts.log && opts.log.silly('verify', 'writing verifile to ' + verifile)
  return fs.writeFileAsync(verifile, String(Date.now()))
}
0 commit comments