-
Notifications
You must be signed in to change notification settings - Fork 8
/
deflate_interlaced.c
executable file
·2282 lines (1987 loc) · 61.2 KB
/
deflate_interlaced.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* This code implements "interlaced deflate", and is *based* on a
* simplistic implementation of the Deflate algorithm.
* See http://www.ietf.org/rfc/rfc1951.txt for details on the original
* Deflate algorithm.
*
* It differs from RFC1951 in two important ways:
*
* 1) It only supports the huffman encoding step and does not attempt
* to do any LZ-style string matching to generate distance codes.
* (These generally do not improve data compression for our desired
* use.)
*
* 2) It optionally allows interleaving of multiple huffman trees for
* a single data stream. NB: when multiple codes are used this is
* incompatible with RFC1951.
*
* It has been written here, instead of using zlib, so that we can separate
* out the encoding of the huffman tree from the compression of the data
* stream into separate memory sections with the intent to optimise
* compression of very small blocks of data by sharing one set of frequency
* tables (ie huffman tree) with multiple sets of compressed data blocks.
*
* James Bonfield, 2007
*/
#define NDEBUG /* disable asserts for production use */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <assert.h>
#include <unistd.h>
#include "io_lib/deflate_interlaced.h"
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
#ifndef MAX
# define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
/* #define TEST_MAIN */
/*
* ---------------------------------------------------------------------------
* Local structs & defines
*/
/* Used in tree construction only */
typedef struct node {
int count;
int sym; /* char or SYM_EOF */
struct node *parent;
struct node *next;
} node_t;
#define SYM_EOF 256
static void output_code_set(FILE *fp, huffman_codes_t *codes);
static void output_code_set2(FILE *fp, huffman_codes_t *codes);
int next_symbol(block_t *in, int *htab);
/*
* ---------------------------------------------------------------------------
* Our standard precomputed tables, for DNA, English text, etc.
*/
/* DNA */
/*
* A 00
* C 01
* G 110
* T 10
* N 1110
* EOF 11110
* ? 11111*
*/
static huffman_code_t codes_dna[] = {
{'A', 2}, {'C', 2}, {'T', 2}, {'G', 3}, {'N', 4}, { 0, 5},
{SYM_EOF, 6},
{ 1, 13}, { 2, 13}, { 3, 13}, { 4, 13}, { 5, 13}, { 6, 13},
{ 7, 14}, { 8, 14}, { 9, 14}, { 10, 14}, { 11, 14}, { 12, 14},
{ 13, 14}, { 14, 14}, { 15, 14}, { 16, 14}, { 17, 14}, { 18, 14},
{ 19, 14}, { 20, 14}, { 21, 14}, { 22, 14}, { 23, 14}, { 24, 14},
{ 25, 14}, { 26, 14}, { 27, 14}, { 28, 14}, { 29, 14}, { 30, 14},
{ 31, 14}, { 32, 14}, { 33, 14}, { 34, 14}, { 35, 14}, { 36, 14},
{ 37, 14}, { 38, 14}, { 39, 14}, { 40, 14}, { 41, 14}, { 42, 14},
{ 43, 14}, { 44, 14}, { 45, 14}, { 46, 14}, { 47, 14}, {'0', 14},
{'1', 14}, {'2', 14}, {'3', 14}, {'4', 14}, {'5', 14}, {'6', 14},
{'7', 14}, {'8', 14}, {'9', 14}, { 58, 14}, { 59, 14}, { 60, 14},
{ 61, 14}, { 62, 14}, { 63, 14}, { 64, 14}, {'B', 14}, {'D', 14},
{'E', 14}, {'F', 14}, {'H', 14}, {'I', 14}, {'J', 14}, {'K', 14},
{'L', 14}, {'M', 14}, {'O', 14}, {'P', 14}, {'Q', 14}, {'R', 14},
{'S', 14}, {'U', 14}, {'V', 14}, {'W', 14}, {'X', 14}, {'Y', 14},
{'Z', 14}, { 91, 14}, { 92, 14}, { 93, 14}, { 94, 14}, { 95, 14},
{ 96, 14}, {'a', 14}, {'b', 14}, {'c', 14}, {'d', 14}, {'e', 14},
{'f', 14}, {'g', 14}, {'h', 14}, {'i', 14}, {'j', 14}, {'k', 14},
{'l', 14}, {'m', 14}, {'n', 14}, {'o', 14}, {'p', 14}, {'q', 14},
{'r', 14}, {'s', 14}, {'t', 14}, {'u', 14}, {'v', 14}, {'w', 14},
{'x', 14}, {'y', 14}, {'z', 14}, {123, 14}, {124, 14}, {125, 14},
{126, 14}, {127, 14}, {128, 14}, {129, 14}, {130, 14}, {131, 14},
{132, 14}, {133, 14}, {134, 14}, {135, 14}, {136, 14}, {137, 14},
{138, 14}, {139, 14}, {140, 14}, {141, 14}, {142, 14}, {143, 14},
{144, 14}, {145, 14}, {146, 14}, {147, 14}, {148, 14}, {149, 14},
{150, 14}, {151, 14}, {152, 14}, {153, 14}, {154, 14}, {155, 14},
{156, 14}, {157, 14}, {158, 14}, {159, 14}, {160, 14}, {161, 14},
{162, 14}, {163, 14}, {164, 14}, {165, 14}, {166, 14}, {167, 14},
{168, 14}, {169, 14}, {170, 14}, {171, 14}, {172, 14}, {173, 14},
{174, 14}, {175, 14}, {176, 14}, {177, 14}, {178, 14}, {179, 14},
{180, 14}, {181, 14}, {182, 14}, {183, 14}, {184, 14}, {185, 14},
{186, 14}, {187, 14}, {188, 14}, {189, 14}, {190, 14}, {191, 14},
{192, 14}, {193, 14}, {194, 14}, {195, 14}, {196, 14}, {197, 14},
{198, 14}, {199, 14}, {200, 14}, {201, 14}, {202, 14}, {203, 14},
{204, 14}, {205, 14}, {206, 14}, {207, 14}, {208, 14}, {209, 14},
{210, 14}, {211, 14}, {212, 14}, {213, 14}, {214, 14}, {215, 14},
{216, 14}, {217, 14}, {218, 14}, {219, 14}, {220, 14}, {221, 14},
{222, 14}, {223, 14}, {224, 14}, {225, 14}, {226, 14}, {227, 14},
{228, 14}, {229, 14}, {230, 14}, {231, 14}, {232, 14}, {233, 14},
{234, 14}, {235, 14}, {236, 14}, {237, 14}, {238, 14}, {239, 14},
{240, 14}, {241, 14}, {242, 14}, {243, 14}, {244, 14}, {245, 14},
{246, 14}, {247, 14}, {248, 14}, {249, 14}, {250, 14}, {251, 14},
{252, 14}, {253, 14}, {254, 14}, {255, 14},
};
/* DNA with a few ambiguity codes */
static huffman_code_t codes_dna_ambig[] = {
{'A', 2}, {'C', 2}, {'T', 2}, {'G', 3}, {'N', 4}, { 0, 7},
{ 45, 7}, {'B', 8}, {'D', 8}, {'H', 8}, {'K', 8}, {'M', 8},
{'R', 8}, {'S', 8}, {'V', 8}, {'W', 8}, {'Y', 8}, {SYM_EOF, 11},
{226, 14}, { 1, 15}, { 2, 15}, { 3, 15}, { 4, 15}, { 5, 15},
{ 6, 15}, { 7, 15}, { 8, 15}, { 9, 15}, { 10, 15}, { 11, 15},
{ 12, 15}, { 13, 15}, { 14, 15}, { 15, 15}, { 16, 15}, { 17, 15},
{ 18, 15}, { 19, 15}, { 20, 15}, { 21, 15}, { 22, 15}, { 23, 15},
{ 24, 15}, { 25, 15}, { 26, 15}, { 27, 15}, { 28, 15}, { 29, 15},
{ 30, 15}, { 31, 15}, { 32, 15}, { 33, 15}, { 34, 15}, { 35, 15},
{ 36, 15}, { 37, 15}, { 38, 15}, { 39, 15}, { 40, 15}, { 41, 15},
{ 42, 15}, { 43, 15}, { 44, 15}, { 46, 15}, { 47, 15}, {'0', 15},
{'1', 15}, {'2', 15}, {'3', 15}, {'4', 15}, {'5', 15}, {'6', 15},
{'7', 15}, {'8', 15}, {'9', 15}, { 58, 15}, { 59, 15}, { 60, 15},
{ 61, 15}, { 62, 15}, { 63, 15}, { 64, 15}, {'E', 15}, {'F', 15},
{'I', 15}, {'J', 15}, {'L', 15}, {'O', 15}, {'P', 15}, {'Q', 15},
{'U', 15}, {'X', 15}, {'Z', 15}, { 91, 15}, { 92, 15}, { 93, 15},
{ 94, 15}, { 95, 15}, { 96, 15}, {'a', 15}, {'b', 15}, {'c', 15},
{'d', 15}, {'e', 15}, {'f', 15}, {'g', 15}, {'h', 15}, {'i', 15},
{'j', 15}, {'k', 15}, {'l', 15}, {'m', 15}, {'n', 15}, {'o', 15},
{'p', 15}, {'q', 15}, {'r', 15}, {'s', 15}, {'t', 15}, {'u', 15},
{'v', 15}, {'w', 15}, {'x', 15}, {'y', 15}, {'z', 15}, {123, 15},
{124, 15}, {125, 15}, {126, 15}, {127, 15}, {128, 15}, {129, 15},
{130, 15}, {131, 15}, {132, 15}, {133, 15}, {134, 15}, {135, 15},
{136, 15}, {137, 15}, {138, 15}, {139, 15}, {140, 15}, {141, 15},
{142, 15}, {143, 15}, {144, 15}, {145, 15}, {146, 15}, {147, 15},
{148, 15}, {149, 15}, {150, 15}, {151, 15}, {152, 15}, {153, 15},
{154, 15}, {155, 15}, {156, 15}, {157, 15}, {158, 15}, {159, 15},
{160, 15}, {161, 15}, {162, 15}, {163, 15}, {164, 15}, {165, 15},
{166, 15}, {167, 15}, {168, 15}, {169, 15}, {170, 15}, {171, 15},
{172, 15}, {173, 15}, {174, 15}, {175, 15}, {176, 15}, {177, 15},
{178, 15}, {179, 15}, {180, 15}, {181, 15}, {182, 15}, {183, 15},
{184, 15}, {185, 15}, {186, 15}, {187, 15}, {188, 15}, {189, 15},
{190, 15}, {191, 15}, {192, 15}, {193, 15}, {194, 15}, {195, 15},
{196, 15}, {197, 15}, {198, 15}, {199, 15}, {200, 15}, {201, 15},
{202, 15}, {203, 15}, {204, 15}, {205, 15}, {206, 15}, {207, 15},
{208, 15}, {209, 15}, {210, 15}, {211, 15}, {212, 15}, {213, 15},
{214, 15}, {215, 15}, {216, 15}, {217, 15}, {218, 15}, {219, 15},
{220, 15}, {221, 15}, {222, 15}, {223, 15}, {224, 15}, {225, 15},
{227, 15}, {228, 15}, {229, 15}, {230, 15}, {231, 15}, {232, 15},
{233, 15}, {234, 15}, {235, 15}, {236, 15}, {237, 15}, {238, 15},
{239, 15}, {240, 15}, {241, 15}, {242, 15}, {243, 15}, {244, 15},
{245, 15}, {246, 15}, {247, 15}, {248, 15}, {249, 15}, {250, 15},
{251, 15}, {252, 15}, {253, 15}, {254, 15}, {255, 15},
};
/* English text */
static huffman_code_t codes_english[] = {
{ 32, 3}, {'e', 3}, {'a', 4}, {'i', 4}, {'n', 4}, {'o', 4},
{'s', 4}, {'t', 4}, {'d', 5}, {'h', 5}, {'l', 5}, {'r', 5},
{'u', 5}, { 10, 6}, { 13, 6}, { 44, 6}, {'c', 6}, {'f', 6},
{'g', 6}, {'m', 6}, {'p', 6}, {'w', 6}, {'y', 6}, { 46, 7},
{'b', 7}, {'v', 7}, { 34, 8}, {'I', 8}, {'k', 8}, { 45, 9},
{'A', 9}, {'N', 9}, {'T', 9}, { 39, 10}, { 59, 10}, { 63, 10},
{'B', 10}, {'C', 10}, {'E', 10}, {'H', 10}, {'M', 10}, {'S', 10},
{'W', 10}, {'x', 10}, { 33, 11}, {'0', 11}, {'1', 11}, {'F', 11},
{'G', 11}, { 0, 15}, { 1, 15}, { 2, 15}, { 3, 15}, { 4, 15},
{ 5, 15}, { 6, 15}, { 7, 15}, { 8, 15}, { 9, 15}, { 11, 15},
{ 12, 15}, { 14, 15}, { 15, 15}, { 16, 15}, { 17, 15}, { 18, 15},
{ 19, 15}, { 20, 15}, { 21, 15}, { 22, 15}, { 23, 15}, { 24, 15},
{ 25, 15}, { 26, 15}, { 27, 15}, { 28, 15}, { 29, 15}, { 30, 15},
{ 31, 15}, { 35, 15}, { 36, 15}, { 37, 15}, { 38, 15}, { 40, 15},
{ 41, 15}, { 42, 15}, { 43, 15}, { 47, 15}, {'2', 15}, {'3', 15},
{'4', 15}, {'5', 15}, {'6', 15}, {'7', 15}, {'8', 15}, {'9', 15},
{ 58, 15}, { 60, 15}, { 61, 15}, { 62, 15}, { 64, 15}, {'D', 15},
{'J', 15}, {'K', 15}, {'L', 15}, {'O', 15}, {'P', 15}, {'Q', 15},
{'R', 15}, {'U', 15}, {'V', 15}, {'X', 15}, {'Y', 15}, {'Z', 15},
{ 91, 15}, { 92, 15}, { 93, 15}, { 94, 15}, { 95, 15}, { 96, 15},
{'j', 15}, {'q', 15}, {'z', 15}, {123, 15}, {124, 15}, {125, 15},
{126, 15}, {127, 15}, {128, 15}, {129, 15}, {130, 15}, {131, 15},
{132, 15}, {133, 15}, {134, 15}, {135, 15}, {136, 15}, {137, 15},
{138, 15}, {139, 15}, {140, 15}, {141, 15}, {142, 15}, {143, 15},
{144, 15}, {145, 15}, {146, 15}, {147, 15}, {148, 15}, {149, 15},
{150, 15}, {151, 15}, {152, 15}, {153, 15}, {154, 15}, {155, 15},
{156, 15}, {157, 15}, {158, 15}, {159, 15}, {160, 15}, {161, 15},
{162, 15}, {163, 15}, {164, 15}, {165, 15}, {166, 15}, {167, 15},
{168, 15}, {169, 15}, {170, 15}, {171, 15}, {172, 15}, {173, 15},
{174, 15}, {175, 15}, {176, 15}, {177, 15}, {178, 15}, {179, 15},
{180, 15}, {181, 15}, {182, 15}, {183, 15}, {184, 15}, {185, 15},
{186, 15}, {187, 15}, {188, 15}, {189, 15}, {190, 15}, {191, 15},
{192, 15}, {193, 15}, {194, 15}, {195, 15}, {196, 15}, {197, 15},
{198, 15}, {199, 15}, {200, 15}, {201, 15}, {202, 15}, {203, 15},
{204, 15}, {205, 15}, {206, 15}, {207, 15}, {208, 15}, {209, 15},
{210, 15}, {211, 15}, {212, 15}, {213, 15}, {214, 15}, {215, 15},
{216, 15}, {217, 15}, {218, 15}, {219, 15}, {220, 15}, {221, 15},
{222, 15}, {223, 15}, {224, 15}, {225, 15}, {226, 15}, {227, 15},
{228, 15}, {229, 15}, {230, 15}, {231, 15}, {232, 15}, {233, 15},
{234, 15}, {235, 15}, {236, 15}, {237, 15}, {238, 15}, {239, 15},
{240, 15}, {241, 15}, {242, 15}, {243, 15}, {244, 15}, {245, 15},
{246, 15}, {247, 15}, {248, 15}, {249, 15}, {250, 15}, {251, 15},
{252, 15}, {253, 15}, {254, 15}, {255, 15}, {SYM_EOF, 15},
};
static huffman_codeset_t *static_codeset[NCODES_STATIC];
/*
* ---------------------------------------------------------------------------
* Block_t structure support
*/
/*
* Allocates and returns a new block_t struct of a specified default size.
* A default 'data' pointer may be passed in, in which it must have
* been created using malloc(size). Otherwise if data is NULL then
* size indicates the amount of memory to allocate. Size maybe zero to
* defer allocation.
*
* Returns newly created block_t* on success
* NULL on failure
*/
block_t *block_create(unsigned char *data, size_t size) {
block_t *b = (block_t *)malloc(sizeof(*b));
if (!b)
return NULL;
b->data = data;
b->alloc = size;
b->byte = 0;
b->bit = 0;
if (size && !data && NULL == (b->data = calloc(size, 1))) {
free(b);
return NULL;
}
return b;
}
/*
* Deallocates memory created by block_create().
* keep_data is a boolean which if true requests that the data held within
* the block should not be deallocated as it is in use elsewhere.
*/
void block_destroy(block_t *blk, int keep_data) {
if (!blk)
return;
if (!keep_data && blk->data)
free(blk->data);
free(blk);
}
/*
* Ensures a block_t holds at least 'size' bytes.
* Newly allocated data is initialised to zero.
*
* Returns 0 on success
* -1 on failure, leaving block pointing to the existing data
*/
int block_resize(block_t *blk, size_t size) {
unsigned char *newp = NULL;
if (!blk)
return -1;
/* Grow size to next power of 2, if we're growing */
if (size > blk->alloc) {
size--;
size |= size >> 1;
size |= size >> 2;
size |= size >> 4;
size |= size >> 8;
size |= size >> 16;
size++;
}
if (NULL == (newp = realloc(blk->data, size)))
return -1;
else
blk->data = newp;
if (size > blk->alloc)
memset(&blk->data[blk->alloc], 0, size - blk->alloc);
blk->alloc = size;
return 0;
}
/*
* ---------------------------------------------------------------------------
* Tree building and code generation functions
*/
/*
* Reverses the order of bits in the bottom nbits of val.
* Returns the bit-reverse value.
*/
unsigned int bit_reverse(unsigned int val, int nbits) {
unsigned int new = 0, i;
for (i = 0; i < nbits; i++) {
new = (new << 1) | (val & 1);
val >>= 1;
}
return new;
}
/*
* Generates canonical huffman codes given a set of symbol bit lengths.
* The results are stored within the supplied huffman_codes_t struct.
*
* Returns 0 on success
* -1 on failure
*/
static int canonical_codes(huffman_codes_t *c) {
int i, j;
unsigned int code, last_len;
int clens[33];
int offs[33];
huffman_code_t ctmp[258];
signed int symtab[258];
/* Sort by bit-length, subfield symbol - much faster than qsort() */
for (i = 0; i < 258; i++)
symtab[i] = -1;
for (i = 0; i < c->ncodes; i++)
symtab[c->codes[i].symbol] = i;
for (i = 0; i <= 32; i++)
offs[i] = clens[i] = 0;
for (i = 0; i < c->ncodes; i++)
clens[c->codes[i].nbits]++;
for (i = 1; i <= 32; i++)
offs[i] = offs[i-1] + clens[i-1];
for (i = 0; i < 258; i++) {
if (symtab[i] != -1)
ctmp[offs[c->codes[symtab[i]].nbits]++] = c->codes[symtab[i]];
}
memcpy(c->codes, ctmp, c->ncodes * sizeof(huffman_code_t));
/*
* Force all codes to be <= max_code_len. This is needed due to the
* 15-bit length limitation of Deflate literal codes and the 7-bit
* limit of the code bit-length table.
*/
/* Find first point of failure */
for (i = 0; i < c->ncodes; i++) {
if (c->codes[i].nbits > c->max_code_len)
break;
}
/*
* From here on we shrink the length of the current code by increasing
* the length of an earlier symbol, at last_code.
*/
if (i != c->ncodes) {
int delta = 0;
/*
fprintf(stderr, "=== REORDERING %d ===\n", c->code_set);
output_code_set(stderr, c);
output_code_set2(stderr, c);
*/
for (; i < c->ncodes; i++) {
int k, cur_len;
c->codes[i].nbits -= delta;
if (c->codes[i].nbits <= c->max_code_len)
continue;
for (j = i; j >= 0 && c->codes[j].nbits >= c->max_code_len; j--)
;
if (j < 0) {
fprintf(stderr,
"Too many symbols to fit in bit-length requirements\n");
fprintf(stderr, "=== FAILING ===\n");
output_code_set(stderr, c);
output_code_set2(stderr, c);
abort();
}
/*
fprintf(stderr, "Changing code %d/%d to len %d\n",
c->codes[i].symbol, c->codes[j].symbol,
c->codes[j].nbits+1);
*/
cur_len = c->codes[i].nbits;
c->codes[i].nbits = ++c->codes[j].nbits;
/*
* Shrink the next code by one, or if none at that bit-length
* the next 2, and so on
*/
delta = 1;
for (k = i+1; delta && k < c->ncodes; k++) {
while (c->codes[k].nbits > cur_len) {
delta *= 2;
cur_len++;
}
c->codes[k].nbits--;
delta--;
}
assert(delta == 0);
}
/*
fprintf(stderr, "=== REORDERED TO %d ===\n", c->code_set);
output_code_set(stderr, c);
output_code_set2(stderr, c);
*/
/* Ordering is shot - regenerate via brute force way */
return canonical_codes(c);
}
/* Generate codes */
code = last_len = 0; /* stop warning */
for (i = 0; i < c->ncodes; i++) {
int nbits = c->codes[i].nbits;
if (i == 0) {
code = 0;
last_len = nbits;
} else {
code++;
}
if (nbits > last_len) {
code <<= (nbits - last_len);
last_len = nbits;
}
c->codes[i].code = bit_reverse(code, nbits);
}
/* Reindex so the symbol is the primary index into codes */
for (i = 0; i <= 257; i++) {
c->lookup[i].nbits = 0;
}
for (i = 0; i < c->ncodes; i++) {
c->lookup[c->codes[i].symbol] = c->codes[i];
}
return 0;
}
static int node_compar2(const void *vp1, const void *vp2) {
const node_t *n1 = *(const node_t **)vp1;
const node_t *n2 = *(const node_t **)vp2;
/*
* The sort order is vital here. This needs to return the same collating
* order on all systems so that differing qsort() functions will not
* swap around symbols with the same bit lengths, hence we sort by both
* fields to force a unique stable ordering.
*/
if (n1->count != n2->count)
return n1->count - n2->count;
else
return n2->sym - n1->sym;
}
/*
* Computes the huffman bit-lengths for a data set. We don't care
* about the actual tree, just how deep the symbols end up.
*
* Huffman trees are constructed by constructing a set of nodes
* initially containing the symbol and it's frequency. We then merge
* the two least used nodes to produce a new node with a combined
* frequency. Repeat until one root node is left.
*
* data/len is the input data to analyse.
*
* 'eof' is a boolean to indicate whether the EOF symbol should be included
* in the symbols produced.
*
* all_codes is a boolean to indicate whether we should include symbols not
* found in the input data set. (This was used to create the static lookup
* tables.)
*
* Returns huffman_codes_t* on success
* NULL on failure
*/
huffman_codes_t *calc_bit_lengths(unsigned char *data, int len,
int eof, int max_code_len, int all_codes,
int start, int skip) {
int i, ncodes;
node_t nodes[258+257], *head, *new = &nodes[258];
node_t *n2[258+257];
int map[258];
huffman_codes_t *c;
int hist[256];
if (NULL == (c = (huffman_codes_t *)malloc(sizeof(*c))))
return NULL;
c->codes_static = 0;
c->max_code_len = max_code_len;
/* Count frequencies of symbols */
memset(hist, 0, 256*sizeof(*hist));
/* Calc freqs */
for (i = start; i < len; i+=skip) {
hist[data[i]]++;
}
/*
* Initialise nodes. We build a map of ASCII character code to node
* number. (By default it's a simple 1:1 mapping unless legal_chars is
* defined.)
*/
ncodes = 0;
if (eof) {
nodes[ncodes].sym = SYM_EOF;
nodes[ncodes].count = eof;
nodes[ncodes].parent = NULL;
n2[ncodes] = &nodes[ncodes];
map[SYM_EOF] = ncodes++;
}
/* All 256 chars existing at a minimal level */
if (all_codes) {
for (i = 0; i < 256; i++) {
nodes[ncodes].sym = i;
nodes[ncodes].count = hist[i];
nodes[ncodes].parent = NULL;
n2[ncodes] = &nodes[ncodes];
map[i] = ncodes++;
}
} else {
/* Only include non-zero symbols */
for (i = 0; i < 256; i++) {
if (hist[i] == 0)
continue;
nodes[ncodes].sym = i;
nodes[ncodes].count = hist[i];
nodes[ncodes].parent = NULL;
n2[ncodes] = &nodes[ncodes];
map[i] = ncodes++;
}
}
/* Sort by counts, smallest first and form a sorted linked list */
qsort(n2, ncodes, sizeof(*n2), node_compar2);
/* Skip symbols that do not occur, unless all_codes is true */
for (i = 0; i < ncodes; i++) {
n2[i]->next = i+1 < ncodes ? n2[i+1] : NULL;
}
/* Repeatedly merge two smallest values */
head = n2[0];
while (head && head->next) {
node_t *after = head->next, *n;
int sum = head->count + head->next->count;
for (n = head->next->next; n; after = n, n = n->next) {
if (sum < n->count)
break;
}
/* Produce a new summation node and link it in place */
after->next = new;
new->next = n;
new->sym = '?';
new->count = sum;
new->parent = NULL;
head->parent = new;
head->next->parent = new;
head = head->next->next;
new++;
}
/* Walk up tree computing the bit-lengths for our symbols */
c->ncodes = ncodes;
c->codes = (huffman_code_t *)malloc(c->ncodes * sizeof(*c->codes));
if (NULL == c->codes) {
free(c);
return NULL;
}
for (i = 0; i < ncodes; i++) {
int len = 0;
node_t *n;
for (n = n2[i]->parent; n; n = n->parent) {
len++;
}
c->codes[i].symbol = n2[i]->sym;
c->codes[i].freq = n2[i]->count;
c->codes[i].nbits = len ? len : 1; /* special case, nul input */
}
if (0 != canonical_codes(c)) {
free(c);
return NULL;
}
return c;
}
/*
* A special case of the generate_code_set() function below, but for
* creating predefined code sets from bit-length arrays. Useful for
* code that wants to use a predetermined huffman tree.
*
*
* Returns huffman_codes_t* on success; free using huffman_codes_destroy().
* NULL on failure.
*/
huffman_codeset_t *codes2codeset(huffman_code_t *codes, int ncodes,
int code_num) {
huffman_codeset_t *cs;
huffman_codes_t *c;
if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs))))
return NULL;
cs->codes = (huffman_codes_t **)malloc(sizeof(*cs->codes));
cs->codes[0] = c;
cs->ncodes = 1;
cs->code_set = code_num;
cs->blk = NULL;
cs->bit_num = 0;
cs->decode_t = NULL;
cs->decode_J4 = NULL;
if (NULL == (c = (huffman_codes_t *)malloc(sizeof(*c))))
return NULL;
c->codes_static = 1;
c->max_code_len = MAX_CODE_LEN;
c->codes = codes;
c->ncodes = ncodes;
cs->bit_num = 0; /* FIXME: need to know this */
canonical_codes(c);
return cs;
}
/*
* Initialises and returns a huffman_codes_t struct from a specified code_set.
* If code_set is not one of the standard predefined values then the
* input data is analysed using calc_bit_lengths() above to produce the
* optimal set of huffman codes, otherwise we return predefined values.
*
* 'eof' is a boolean to indicate whether the EOF symbol should be included
* in the symbols produced.
*
* all_codes is a boolean to indicate whether we should include symbols not
* found in the input data set. (This was used to create the static lookup
* tables.)
*
* Returns huffman_codes_t* on success; free using huffman_codes_destroy().
* NULL on failure.
*/
huffman_codeset_t *generate_code_set(int code_set, int ncodes,
unsigned char *data, int len,
int eof, int max_code_len,
int all_codes) {
huffman_codeset_t *cs;
/*
* Either INLINE or a CODE_USER+ set of codes.
* => analyse the data and compute a new set of bit-lengths & codes.
*/
if (code_set >= 128 || code_set == CODE_INLINE) {
int i;
if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs))))
return NULL;
cs->code_set = code_set;
cs->ncodes = ncodes;
cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes));
cs->blk = NULL;
cs->bit_num = 0;
cs->decode_t = NULL;
cs->decode_J4 = NULL;
for (i = 0; i < ncodes; i++) {
/*
* If requested, include EOF all code sets, but at a
* frequency of only '1' occurrance where we predict it
* not to be needed.
*/
if (eof && (len+i)%ncodes)
eof = 1;
cs->codes[i] = calc_bit_lengths(data, len, eof, max_code_len,
all_codes, i, ncodes);
cs->codes[i]->codes_static = 0;
if (NULL == cs->codes[i]) {
/* FIXME: tidy up */
return NULL;
}
canonical_codes(cs->codes[i]);
}
/*
* Otherwise we use the determined codes at the top of this file, such
* as codes_dna and codes_english.
*/
} else {
if (code_set < 1 || code_set >= NCODES_STATIC) {
fprintf(stderr, "Unknown huffman code set '%d'\n", code_set);
return NULL;
}
/* If our global codeset hasn't been initialised yet, do so */
if (!static_codeset[code_set]) {
huffman_codes_t *c = (huffman_codes_t *)malloc(sizeof(*c));
if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs))))
return NULL;
cs->codes = (huffman_codes_t **)malloc(sizeof(*cs->codes));
cs->codes[0] = c;
cs->ncodes = 1;
cs->code_set = code_set;
cs->blk = NULL;
cs->bit_num = 0;
cs->decode_t = NULL;
cs->decode_J4 = NULL;
c->codes_static = 1;
c->max_code_len = MAX_CODE_LEN;
switch(code_set) {
case CODE_DNA:
c->codes = codes_dna;
c->ncodes = sizeof(codes_dna)/sizeof(*c->codes);
cs->bit_num = 5;
break;
case CODE_DNA_AMBIG:
c->codes = codes_dna_ambig;
c->ncodes = sizeof(codes_dna_ambig)/sizeof(*c->codes);
cs->bit_num = 1;
break;
case CODE_ENGLISH:
c->codes = codes_english;
c->ncodes = sizeof(codes_english)/sizeof(*c->codes);
cs->bit_num = 1;
break;
default:
fprintf(stderr, "Unknown huffman code set '%d'\n", code_set);
return NULL;
}
canonical_codes(c);
static_codeset[code_set] = cs;
}
cs = static_codeset[code_set];
}
return cs;
}
void huffman_codes_destroy(huffman_codes_t *c) {
if (!c)
return;
if (!c->codes_static && c->codes)
free(c->codes);
free(c);
}
void huffman_codeset_destroy(huffman_codeset_t *cs) {
int i;
if (!cs)
return;
/* If this codeset is one of the predefined global ones we do nothing */
if (cs->ncodes == 1 && cs->codes[0]->codes_static)
return;
for (i = 0; i < cs->ncodes; i++)
huffman_codes_destroy(cs->codes[i]);
if (cs->codes)
free(cs->codes);
if (cs->blk)
block_destroy(cs->blk, 0);
if (cs->decode_t)
free(cs->decode_t);
if (cs->decode_J4)
free(cs->decode_J4);
free(cs);
}
/*
* ---------------------------------------------------------------------------
* Encoding and decoding related functions
*/
/*
* Can store up to 24-bits worth of data encoded in an integer value
* Possibly we'd want to have a less optimal store_bits function when dealing
* with nbits > 24, but for now we assume the codes generated are never
* that big. (Given this is only possible with 121392 or more
* characters with exactly the correct frequency distribution we check
* for it elsewhere.)
*/
static void store_bits(block_t *block, unsigned int val, int nbits) {
/* fprintf(stderr, " store_bits: %02x %d\n", val, nbits); */
#if 1
{
unsigned int curr = block->data[block->byte];
curr |= (val & ((1 << nbits)-1)) << block->bit;
block->bit += nbits;
while (block->bit >= 8) {
block->data[block->byte++] = curr & 0xff;
curr >>= 8;
block->bit -= 8;
}
block->data[block->byte] = curr & 0xff;
}
return;
#else
{
/* Slow, but simple */
unsigned int mask = 1;
int bit = 1 << block->bit;
do {
if (val & mask)
block->data[block->byte] |= bit;
/*
* Data should be zeroed anyway, so this is not needed.
*
* else
* block->data[block->byte] &= ~bit;
*/
if (++block->bit == 8) {
block->bit = 0;
block->byte++;
block->data[block->byte] = 0;
bit = 1;
} else {
bit <<= 1;
}
mask <<= 1;
} while(--nbits);
}
#endif
}
/*
* Reads up to 24-bits worth of data and returns. Updates the block
* byte and bit values to indicate the current 'read' position.
*
* Returns unsigned value on success (>=0)
* -1 on failure
*/
static signed int get_bits(block_t *block, int nbits) {
unsigned int val, bnum = 0;
if (block->byte*8 + block->bit + nbits > block->alloc * 8)
return -1;
/* Fetch the partial byte of data */
val = (block->data[block->byte]) >> block->bit;
bnum = 8 - block->bit;
/* And additional entire bytes worth as required */
while (bnum <= nbits) {
val |= block->data[++block->byte] << bnum;
bnum += 8;
}
block->bit = (block->bit + nbits) % 8;
return val & ((1 << nbits) - 1);
}
/* stores nbytes bytes, padding to align on the next byte boundary */
void store_bytes(block_t *block, unsigned char *val, int nbytes) {
/* Align */
if (block->bit != 0) {
block->byte++;
block->bit = 0;
}
/* Resize */
block_resize(block, block->byte + nbytes + 1);
/* Store */
memcpy(&block->data[block->byte], val, nbytes);
block->byte += nbytes;
}
/*
* Encodes the huffman symbol bit-lengths as a serialised block of data
* suitable for storing in a ZTR "ZLBH" chunk. This uses the Deflate
* storage format defined in RFC1951.
*
* Returns: 0 on success
* -1 on failure
*/
int store_codes_single(block_t *out, huffman_codes_t *c) {
int i;
unsigned char bl_code[257]; /* bit-length codes and for codes 16-18 */
unsigned char bl_opt[257]; /* the operand to the blcode */
unsigned char sorted_codes[258];
int bl_freq[19]; /* frequency of bit-length codes produced */
int bl_count;
huffman_codes_t *bl_cds = NULL;
int hclen_order[] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
int hlit, hdist, hclen, hcmap[19];
/* output_code_set (stderr, c); */
if (out->alloc < out->byte + 1000) {
out->alloc = out->byte + 1000;
if (NULL == (out->data = realloc(out->data, out->alloc)))
return -1;
}
/*
*-----------------------------------------------------------------
* Reformat the dynamic code bit-lengths into an alphabet of 19
* "code length" symbols as defined in RFC1951.
*/
memset(sorted_codes, 0, 258);
for (i = 0; i < c->ncodes; i++) {
sorted_codes[c->codes[i].symbol] = c->codes[i].nbits;
}
for (i = 0; i < 19; i++)
bl_freq[i] = 0;
bl_count = 0;
for (i = 0; i < 257; ) {
int j = i+1, n;
int v = sorted_codes[i];
while (j < 257 && sorted_codes[j] == v)
j++;
n = j-i; /* n = run-length */
/* fprintf(stderr, "value=%d, run_len=%d\n", v, n); */
if (v == 0) {
/* bit-len zero => no code and uses code 17/18 for run len */
while (n > 0) {
while (n >= 11) {
bl_freq[18]++;
bl_code[bl_count] = 18;
bl_opt[bl_count] = MIN(n, 138)-11;
n -= bl_opt[bl_count]+11;
bl_count++;
}
while (n >= 3) {
bl_freq[17]++;
bl_code[bl_count] = 17;
bl_opt[bl_count] = MIN(n, 10)-3;
n -= bl_opt[bl_count]+3;
bl_count++;
}
switch (n) {
case 2: bl_code[bl_count++] = 0; bl_freq[0]++; n--;
case 1: bl_code[bl_count++] = 0; bl_freq[0]++; n--;