/
ZipWriteStream.class.st
600 lines (542 loc) · 18.4 KB
/
ZipWriteStream.class.st
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
"
A ZIP write stream
"
Class {
#name : #ZipWriteStream,
#superclass : #DeflateStream,
#instVars : [
'literals',
'distances',
'literalFreq',
'distanceFreq',
'litCount',
'matchCount',
'encoder',
'crc',
'crcPosition',
'bytesWritten'
],
#classVars : [
'VerboseLevel'
],
#pools : [
'ZipConstants'
],
#category : #'Compression-Streams'
}
{ #category : #accessing }
ZipWriteStream class >> baseDistance [
^BaseDistance
]
{ #category : #accessing }
ZipWriteStream class >> baseLength [
^BaseLength
]
{ #category : #accessing }
ZipWriteStream class >> distanceCodes [
^DistanceCodes
]
{ #category : #accessing }
ZipWriteStream class >> extraDistanceBits [
^ExtraDistanceBits
]
{ #category : #accessing }
ZipWriteStream class >> extraLengthBits [
^ExtraLengthBits
]
{ #category : #initialization }
ZipWriteStream class >> initialize [
"ZipWriteStream initialize"
VerboseLevel := 0
]
{ #category : #accessing }
ZipWriteStream class >> matchLengthCodes [
^MatchLengthCodes
]
{ #category : #accessing }
ZipWriteStream class >> maxDistanceCodes [
^MaxDistCodes
]
{ #category : #accessing }
ZipWriteStream class >> maxLiteralCodes [
^MaxLiteralCodes
]
{ #category : #'regression test' }
ZipWriteStream class >> printRegressionStats: stats from: fd [
| raw compressed numFiles |
raw := stats at: #rawSize ifAbsent: [ 0 ].
raw = 0
ifTrue: [ ^ self ].
compressed := stats at: #compressedSize ifAbsent: [ 0 ].
numFiles := stats at: #numFiles ifAbsent: [ 0 ].
SystemNotification signal: (String streamContents: [ :aStream |
aStream nextPutAll: fd fullName asString; cr.
aStream tab; nextPutAll: 'Files compressed: '; nextPutAll: numFiles asStringWithCommas; cr.
aStream tab; nextPutAll: 'Bytes compressed: '; nextPutAll: raw asStringWithCommas; cr.
aStream tab; nextPutAll: 'Avg. compression ratio: '; nextPutAll: (compressed / raw asFloat * 100.0 truncateTo: 0.01) asString ])
]
{ #category : #'regression test' }
ZipWriteStream class >> regressionCompress: aFile into: tempFile notifiying: progressBar stats: stats [
"Compress aFile into tempFile"
| zip encoded buffer |
aFile binary.
aFile position: 0.
tempFile binary.
buffer := ByteArray new: 4096.
zip := self on: (ByteArray new: 10000).
encoded := zip encodedStream.
[aFile atEnd] whileFalse:[
progressBar current: aFile position.
zip nextPutAll: (aFile nextInto: buffer).
encoded position > 0 ifTrue:[
tempFile nextPutAll: encoded contents.
encoded position: 0]].
zip close.
tempFile nextPutAll: encoded contents.
^true
]
{ #category : #crc }
ZipWriteStream class >> updateCrc: oldCrc from: start to: stop in: aCollection [
^ CRC update: oldCrc from: start to: stop in: aCollection
]
{ #category : #'open/close' }
ZipWriteStream >> close [
self deflateBlock.
self flushBlock: true.
encoder close.
]
{ #category : #accessing }
ZipWriteStream >> crc [
^crc
]
{ #category : #deflating }
ZipWriteStream >> deflateBlock: lastIndex chainLength: chainLength goodMatch: goodMatch [
"^DeflatePlugin doPrimitive:#primitiveDeflateBlock"
<primitive: 'primitiveDeflateBlock' module: 'ZipPlugin'>
^super deflateBlock: lastIndex chainLength: chainLength goodMatch: goodMatch
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> dynamicBlockSizeFor: lTree and: dTree using: blTree and: blFreq [
"Compute the length for the current block using dynamic huffman trees"
| bits index extra treeBits freq |
bits := 3 "block type" + 5 "literal codes length" + 5 "distance codes length".
"Compute the # of bits for sending the bit length tree"
treeBits := 4. "Max index for bit length tree"
index := MaxBitLengthCodes.
[index >= 4] whileTrue:[
(index = 4 or:[(blFreq at: (BitLengthOrder at: index)+1) > 0])
ifTrue:[treeBits := treeBits + (index * 3).
index := -1]
ifFalse:[index := index - 1]].
"Compute the # of bits for sending the literal/distance tree.
Note: The frequency are already stored in the blTree"
0 to: 15 do:[:i| "First, the non-repeating values"
freq := blFreq at: i+1.
freq > 0 ifTrue:[treeBits := treeBits + (freq * (blTree bitLengthAt: i))]].
"Now the repeating values"
(Repeat3To6 to: Repeat11To138) with: #(2 3 7) do:[:i :addl|
freq := blFreq at: i+1.
freq > 0 ifTrue:[
treeBits := treeBits + (freq * ((blTree bitLengthAt: i) + addl "addl bits"))]].
VerboseLevel > 1 ifTrue:[
SystemNotification signal: ('[', treeBits asString, ' bits for dynamic tree]')].
bits := bits + treeBits.
"Compute the size of the compressed block"
0 to: NumLiterals do:[:i| "encoding of literals"
freq := literalFreq at: i+1.
freq > 0 ifTrue:[bits := bits + (freq * (lTree bitLengthAt: i))]].
NumLiterals+1 to: lTree maxCode do:[:i| "encoding of match lengths"
freq := literalFreq at: i+1.
extra := ExtraLengthBits at: i-NumLiterals.
freq > 0 ifTrue:[bits := bits + (freq * ((lTree bitLengthAt: i) + extra))]].
0 to: dTree maxCode do:[:i| "encoding of distances"
freq := distanceFreq at: i+1.
extra := ExtraDistanceBits at: i+1.
freq > 0 ifTrue:[bits := bits + (freq * ((dTree bitLengthAt: i) + extra))]].
^bits
]
{ #category : #encoding }
ZipWriteStream >> encodeLiteral: lit [
"Encode the given literal"
litCount := litCount + 1.
literals at: litCount put: lit.
distances at: litCount put: 0.
literalFreq at: lit+1 put: (literalFreq at: lit+1) + 1.
^self shouldFlush
]
{ #category : #encoding }
ZipWriteStream >> encodeMatch: length distance: dist [
"Encode the given match of length length starting at dist bytes ahead"
| literal distance |
dist > 0
ifFalse:[^self error:'Distance must be positive'].
length < MinMatch
ifTrue:[^self error:'Match length must be at least ', MinMatch printString].
litCount := litCount + 1.
matchCount := matchCount + 1.
literals at: litCount put: length - MinMatch.
distances at: litCount put: dist.
literal := (MatchLengthCodes at: length - MinMatch + 1).
literalFreq at: literal+1 put: (literalFreq at: literal+1) + 1.
dist < 257
ifTrue:[distance := DistanceCodes at: dist]
ifFalse:[distance := DistanceCodes at: 257 + (dist - 1 bitShift: -7)].
distanceFreq at: distance+1 put: (distanceFreq at: distance+1) + 1.
^self shouldFlush
]
{ #category : #accessing }
ZipWriteStream >> encodedStream [
^encoder encodedStream
]
{ #category : #initialization }
ZipWriteStream >> finish [
"Finish pending operation. Do not close output stream."
self deflateBlock.
self flushBlock: true.
encoder flush.
]
{ #category : #'fixed blocks' }
ZipWriteStream >> fixedBlockSizeFor: lTree and: dTree [
"Compute the length for the current block using fixed huffman trees"
| bits extra |
bits := 3 "block type".
"Compute the size of the compressed block"
0 to: NumLiterals do:[:i| "encoding of literals"
bits := bits + ((literalFreq at: i+1) * (FixedLiteralTree bitLengthAt: i))].
NumLiterals+1 to: lTree maxCode+1 do:[:i| "Encoding of match lengths"
extra := ExtraLengthBits at: i-NumLiterals.
bits := bits + ((literalFreq at: i+1) * ((FixedLiteralTree bitLengthAt: i) + extra))].
0 to: dTree maxCode do:[:i| "encoding of distances"
extra := ExtraDistanceBits at: i+1.
bits := bits + ((distanceFreq at: i+1) * ((FixedDistanceTree bitLengthAt: i) + extra))].
^bits
]
{ #category : #encoding }
ZipWriteStream >> flushBlock [
^self flushBlock: false
]
{ #category : #encoding }
ZipWriteStream >> flushBlock: lastBlock [
"Send the current block"
| lastFlag bitsRequired method bitsSent storedLength fixedLength dynamicLength blTree lTree dTree blBits blFreq |
lastFlag := lastBlock
ifTrue: [ 1 ]
ifFalse: [ 0 ].
"Compute the literal/length and distance tree"
lTree := ZipEncoderTree buildTreeFrom: literalFreq maxDepth: MaxBits.
dTree := ZipEncoderTree buildTreeFrom: distanceFreq maxDepth: MaxBits.
"Compute the bit length tree"
blBits := lTree bitLengths , dTree bitLengths.
blFreq := WordArray new: MaxBitLengthCodes.
self scanBitLengths: blBits into: blFreq.
blTree := ZipEncoderTree
buildTreeFrom: blFreq
maxDepth: MaxBitLengthBits.
"Compute the bit length for the current block.
Note: Most of this could be computed on the fly but it's getting
really ugly in this case so we do it afterwards."
storedLength := self storedBlockSize.
fixedLength := self fixedBlockSizeFor: lTree and: dTree.
dynamicLength := self
dynamicBlockSizeFor: lTree
and: dTree
using: blTree
and: blFreq.
VerboseLevel > 1
ifTrue: [
SystemNotification signal:
(String
streamContents: [ :stream |
stream
nextPutAll: 'Block sizes (S/F/D):' space;
print: storedLength // 8;
nextPut: $/;
print: fixedLength // 8;
nextPut: $/;
print: dynamicLength // 8;
space;
endEntry ]) ].
"Check which method to use"
method := self forcedMethod.
method
ifNil: [ method := (storedLength < fixedLength
and: [ storedLength < dynamicLength ])
ifTrue: [ #stored ]
ifFalse: [ fixedLength < dynamicLength
ifTrue: [ #fixed ]
ifFalse: [ #dynamic ] ] ].
(method == #stored and: [ blockStart < 0 ])
ifTrue: [ "Cannot use #stored if the block is not available"
method := fixedLength < dynamicLength
ifTrue: [ #fixed ]
ifFalse: [ #dynamic ] ].
bitsSent := encoder bitPosition. "# of bits sent before this block"
bitsRequired := nil.
method == #stored
ifTrue: [ VerboseLevel > 0
ifTrue: [ SystemNotification signal: 'S' ].
bitsRequired := storedLength.
encoder nextBits: 3 put: (StoredBlock << 1) + lastFlag.
self sendStoredBlock ].
method == #fixed
ifTrue: [ VerboseLevel > 0
ifTrue: [ SystemNotification signal: 'F' ].
bitsRequired := fixedLength.
encoder nextBits: 3 put: (FixedBlock << 1) + lastFlag.
self sendFixedBlock ].
method == #dynamic
ifTrue: [ VerboseLevel > 0
ifTrue: [ SystemNotification signal: 'D' ].
bitsRequired := dynamicLength.
encoder nextBits: 3 put: (DynamicBlock << 1) + lastFlag.
self
sendDynamicBlock: blTree
literalTree: lTree
distanceTree: dTree
bitLengths: blBits ].
bitsRequired = (encoder bitPosition - bitsSent)
ifFalse: [ self error: 'Bits size mismatch' ].
lastBlock
ifTrue: [ self release ]
ifFalse: [ self initializeNewBlock ]
]
{ #category : #accessing }
ZipWriteStream >> forcedMethod [
"Return a symbol describing an enforced method or nil if the method should
be chosen adaptively. Valid symbols are
#stored - store blocks (do not compress)
#fixed - use fixed huffman trees
#dynamic - use dynamic huffman trees."
^nil
]
{ #category : #initialization }
ZipWriteStream >> initialize [
super initialize.
literals := ByteArray new: WindowSize.
distances := WordArray new: WindowSize.
literalFreq := WordArray new: MaxLiteralCodes.
distanceFreq := WordArray new: MaxDistCodes.
self initializeNewBlock.
]
{ #category : #initialization }
ZipWriteStream >> initializeNewBlock [
"Initialize the encoder for a new block of data"
literalFreq atAllPut: 0.
distanceFreq atAllPut: 0.
literalFreq at: EndBlock+1 put: 1.
litCount := 0.
matchCount := 0.
]
{ #category : #private }
ZipWriteStream >> moveContentsToFront [
"Need to update crc here"
self updateCrc.
super moveContentsToFront.
crcPosition := position + 1.
]
{ #category : #initialization }
ZipWriteStream >> on: aCollectionOrStream [
crc := 16rFFFFFFFF.
crcPosition := 1.
bytesWritten := 0.
encoder := ZipEncoder on: aCollectionOrStream.
encoder isBinary
ifTrue:[super on: ByteArray new]
ifFalse:[super on: String new].
self writeHeader.
]
{ #category : #initialization }
ZipWriteStream >> release [
"We're done with compression. Do some cleanup."
literals := distances := literalFreq := distanceFreq := nil.
self updateCrc.
encoder flushBits.
self writeFooter.
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> scanBitLength: bitLength repeatCount: repeatCount into: anArray [
"Update the frequency for the aTree based on the given values"
| count |
count := repeatCount.
bitLength = 0 ifTrue:[
[count >= 11] whileTrue:[
anArray at: Repeat11To138+1 put: (anArray at: Repeat11To138+1) + 1.
count := (count - 138) max: 0].
[count >= 3] whileTrue:[
anArray at: Repeat3To10+1 put: (anArray at: Repeat3To10+1) + 1.
count := (count - 10) max: 0].
count > 0 ifTrue:[anArray at: bitLength+1 put: (anArray at: bitLength+1) + count].
] ifFalse:[
anArray at: bitLength+1 put: (anArray at: bitLength+1) + 1.
count := count - 1.
[count >= 3] whileTrue:[
anArray at: Repeat3To6+1 put: (anArray at: Repeat3To6+1) + 1.
count := (count - 6) max: 0].
count > 0 ifTrue:[anArray at: bitLength+1 put: (anArray at: bitLength+1) + count].
].
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> scanBitLengths: bits into: anArray [
"Scan the trees and determine the frequency of the bit lengths.
For repeating codes, emit a repeat count."
| lastValue lastCount value |
bits size = 0 ifTrue:[^self].
lastValue := bits at: 1.
lastCount := 1.
2 to: bits size do:[:i|
value := bits at: i.
value = lastValue
ifTrue:[lastCount := lastCount + 1]
ifFalse:[self scanBitLength: lastValue repeatCount: lastCount into: anArray.
lastValue := value.
lastCount := 1]].
self scanBitLength: lastValue repeatCount: lastCount into: anArray.
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> sendBitLength: bitLength repeatCount: repeatCount tree: aTree [
"Send the given bitLength, repeating repeatCount times"
| count |
count := repeatCount.
bitLength = 0 ifTrue:[
[count >= 11] whileTrue:[
self sendBitLength: Repeat11To138 tree: aTree.
encoder nextBits: 7 put: (count min: 138) - 11.
count := (count - 138) max: 0].
[count >= 3] whileTrue:[
self sendBitLength: Repeat3To10 tree: aTree.
encoder nextBits: 3 put: (count min: 10) - 3.
count := (count - 10) max: 0].
count timesRepeat:[self sendBitLength: bitLength tree: aTree].
] ifFalse:[
self sendBitLength: bitLength tree: aTree.
count := count - 1.
[count >= 3] whileTrue:[
self sendBitLength: Repeat3To6 tree: aTree.
encoder nextBits: 2 put: (count min: 6) - 3.
count := (count - 6) max: 0].
count timesRepeat:[self sendBitLength: bitLength tree: aTree].
].
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> sendBitLength: bitLength tree: aTree [
"Send the given bitLength"
encoder nextBits: (aTree bitLengthAt: bitLength)
put: (aTree codeAt: bitLength).
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> sendBitLengthTree: blTree [
"Send the bit length tree"
| blIndex bitLength |
MaxBitLengthCodes to: 4 by: -1 do:[:maxIndex|
blIndex := BitLengthOrder at: maxIndex.
bitLength := blIndex <= blTree maxCode
ifTrue:[blTree bitLengthAt: blIndex] ifFalse:[0].
(maxIndex = 4 or:[bitLength > 0]) ifTrue:[
encoder nextBits: 4 put: maxIndex - 4.
1 to: maxIndex do:[:j|
blIndex := BitLengthOrder at: j.
bitLength := blIndex <= blTree maxCode
ifTrue:[blTree bitLengthAt: blIndex] ifFalse:[0].
encoder nextBits: 3 put: bitLength].
^self]].
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> sendCompressedBlock: litTree with: distTree [
"Send the current block using the encodings from the given literal/length and distance tree"
| sum |
sum := encoder
sendBlock: (ReadStream on: literals from: 1 to: litCount)
with: (ReadStream on: distances from: 1 to: litCount)
with: litTree
with: distTree.
sum = (blockPosition - blockStart) ifFalse:[self error:'Wrong number of bytes'].
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> sendDynamicBlock: blTree literalTree: lTree distanceTree: dTree bitLengths: bits [
"Send a block using dynamic huffman trees"
self sendLiteralTree: lTree distanceTree: dTree using: blTree bitLengths: bits.
self sendCompressedBlock: lTree with: dTree.
]
{ #category : #'fixed blocks' }
ZipWriteStream >> sendFixedBlock [
"Send a block using fixed huffman trees"
self sendCompressedBlock: FixedLiteralTree with: FixedDistanceTree.
]
{ #category : #'dynamic blocks' }
ZipWriteStream >> sendLiteralTree: lTree distanceTree: dTree using: blTree bitLengths: bits [
"Send all the trees needed for dynamic huffman tree encoding"
| lastValue lastCount value |
encoder nextBits: 5 put: (lTree maxCode - 256).
encoder nextBits: 5 put: (dTree maxCode).
self sendBitLengthTree: blTree.
bits size = 0 ifTrue:[^self].
lastValue := bits at: 1.
lastCount := 1.
2 to: bits size do:[:i|
value := bits at: i.
value = lastValue
ifTrue:[lastCount := lastCount + 1]
ifFalse:[self sendBitLength: lastValue repeatCount: lastCount tree: blTree.
lastValue := value.
lastCount := 1]].
self sendBitLength: lastValue repeatCount: lastCount tree: blTree.
]
{ #category : #'stored blocks' }
ZipWriteStream >> sendStoredBlock [
"Send an uncompressed block"
| inBytes |
inBytes := blockPosition - blockStart.
encoder flushBits. "Skip to byte boundary"
encoder nextBits: 16 put: inBytes.
encoder nextBits: 16 put: (inBytes bitXor: 16rFFFF).
encoder flushBits.
1 to: inBytes do:[:i|
encoder nextBytePut: (collection byteAt: blockStart+i)].
]
{ #category : #encoding }
ZipWriteStream >> shouldFlush [
"Check if we should flush the current block.
Flushing can be useful if the input characteristics change."
| nLits |
litCount = literals size ifTrue:[^true]. "We *must* flush"
(litCount bitAnd: 16rFFF) = 0 ifFalse:[^false]. "Only check every N kbytes"
matchCount * 10 <= litCount ifTrue:[
"This is basically random data.
There is no need to flush early since the overhead
for encoding the trees will add to the overall size"
^false].
"Try to adapt to the input data.
We flush if the ratio between matches and literals
changes beyound a certain threshold"
nLits := litCount - matchCount.
nLits <= matchCount ifTrue:[^false]. "whow! so many matches"
^nLits * 4 <= matchCount
]
{ #category : #'stored blocks' }
ZipWriteStream >> storedBlockSize [
"Compute the length for the current block when stored as is"
^3 "block type bits"
+ (8 - (encoder bitPosition + 3 bitAnd: 7) bitAnd: 7)"skipped bits to byte boundary"
+ 32 "byte length + chksum"
+ (blockPosition - blockStart * 8) "actual data bits".
]
{ #category : #private }
ZipWriteStream >> updateCrc [
crcPosition <= position ifTrue:[
bytesWritten := bytesWritten + position - crcPosition + 1.
crc := self updateCrc: crc from: crcPosition to: position in: collection.
crcPosition := position + 1].
]
{ #category : #private }
ZipWriteStream >> updateCrc: oldCrc from: start to: stop in: aCollection [
^self class updateCrc: oldCrc from: start to: stop in: aCollection
]
{ #category : #initialization }
ZipWriteStream >> writeFooter [
"Write footer information if necessary"
crc := crc bitXor: 16rFFFFFFFF.
]
{ #category : #initialization }
ZipWriteStream >> writeHeader [
"Write header information if necessary"
]