/
maxvid_decode.c
4090 lines (3382 loc) · 155 KB
/
maxvid_decode.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// maxvid module
//
// License terms defined in License.txt.
//
// This module defines a runtime execution speed optimized video decoder library for iOS.
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <math.h>
#include <assert.h>
#include <limits.h>
#include <unistd.h>
// Define EXTRA_CHECKS to enable assert checks in the decoder
//#define EXTRA_CHECKS
//#define MAXVID_ALWAYS_ASSERT_EXTRA_CHECKS
/*
#ifndef __OPTIMIZE__
// Automatically define EXTRA_CHECKS when not optimizing (in debug mode)
# define EXTRA_CHECKS
#endif // DEBUG
*/
/*
#if TARGET_IPHONE_SIMULATOR
// Automatically define EXTRA_CHECKS when running in the simulator
# define EXTRA_CHECKS
#endif // DEBUG
*/
#if defined(MAXVID_EXTRA_CHECKS) && !defined(EXTRA_CHECKS)
# define EXTRA_CHECKS
#endif
// Note that ASM logic needs to be defined after optional EXTRA_CHECKS
#if defined(__arm__)
# define COMPILE_ARM 1
# if defined(__thumb__)
# define COMPILE_ARM_THUMB_ASM 1
# else
# define COMPILE_ARM_ASM 1
# endif
#endif
// Xcode 4.2 supports clang only, but the ARM asm integration depends on specifics
// of register allocation and as a result only works when compiled with gcc.
#if defined(__clang__)
# define COMPILE_CLANG 1
#endif // defined(__clang__)
// For CLANG build on ARM, skip this entire module and use custom ARM asm imp instead.
// It is possible that clang is compiling in Thumb2 mode, just use the already generated
// ARM code and do not generate an error in this case.
#if defined(COMPILE_CLANG) && defined(COMPILE_ARM)
#define USE_GENERATED_ARM_ASM 1
#endif // SKIP __clang__ && ARM
// GCC 4.2 and newer seems to allocate registers in a way that breaks the inline
// arm asm in maxvid_decode.c, so use the ARM asm in this case.
#if defined(__GNUC__) && !defined(__clang__) && defined(COMPILE_ARM)
# define __GNUC_PREREQ(maj, min) \
((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
# if __GNUC_PREREQ(4,2)
# define USE_GENERATED_ARM_ASM 1
# endif
#endif
// This inline asm flag would only be used if USE_GENERATED_ARM_ASM was not defined
#if defined(COMPILE_ARM)
# define USE_INLINE_ARM_ASM 1
#endif
// It is possible one might want to actually compile the C code on an
// ARM system and simply not use the inline ASM blocks and let the
// compiler generate ARM code automatically. Set the argument
// value for this if to 1 to enable build on ARM without inline ASM.
#if 0 && defined(USE_GENERATED_ARM_ASM)
#undef USE_GENERATED_ARM_ASM
#undef USE_INLINE_ARM_ASM
#endif
#if defined(USE_GENERATED_ARM_ASM)
// No-op, skip compilation of this entire module!
#else // defined(USE_GENERATED_ARM_ASM)
#ifdef COMPILE_ARM_ASM
#define ASM_NOP __asm__ __volatile__ ("nop");
#else
#define ASM_NOP
#endif
// Generate a compile time error if compiled in Thumb mode. This module includes ARM specific
// ASM code, so it can't be compiled in Thumb mode.
#if defined(COMPILE_ARM_THUMB_ASM)
#error "Module should not be compiled in Thumb mode, enable ARM mode by adding -mno-thumb to file specific target flags"
#endif
#if defined(MAXVID_MODULE_PREFIX)
# define MODULE_PREFIX MAXVID_MODULE_PREFIX
# define MAXVID_NON_DEFAULT_MODULE_PREFIX
#else
# define MODULE_PREFIX maxvid_
# define MAXVID_DEFAULT_MODULE_PREFIX
#endif
// The header defines only the default exported function names. If this module is being pulled into
// the test module, then the default symbols are not declared to avoid accidentally using them.
#include "maxvid_decode.h"
// Fancy macro expansion so that FUNCTION_NAME(MODULE_PREFIX, decode_sample16) -> maxvid_decode_sample16
#define MAKE_FN_NAME(mprefix, x) mprefix ## x
#define FUNCTION_NAME(mprefix, fname) MAKE_FN_NAME(mprefix, fname)
//#ifdef EXTRA_CHECKS
// Combine two 16 bit pixels into a single uint32_t word
#define HwToWord(pixel) ((((uint32_t) pixel) << 16) | (uint32_t)pixel)
#define MAX_5_BITS MV_MAX_5_BITS
#define MAX_11_BITS MV_MAX_11_BITS
// The ARM docs indicate that stm is faster when the output buffer is 64 bit
// aligned since 2 words can be written with each cycle. If the cache line
// can be filled with 1 write, that will perform better. The cache line
// size for a Cortex-A8 is 16 words (64 bytes), the cache line size for
// a Cortex-A9 is 8 words (32 bytes). Old iPhone 3G Arm 11 CPUs also used
// a cache line that was 8 words. Since there is no method to attempt to
// detect device capabilities, use the common 8 word bound size
// and assume that writing 8 word blocks will fill one cache line for 2/3
// known ARM CPUs.
//
// While reads do need to be 32 bit aligned, it is not as critical for the
// reads to be 64 bit aligned in a memcpy type read/write loop.
#define MV_CACHE_LINE_SIZE 8
#define BOUNDSIZE (MV_CACHE_LINE_SIZE * sizeof(uint32_t))
// Note that the following code can only be built with EXTRA_CHECKS in debug mode, because the
// inlined ASM uses the stack frame register.
#undef EXTRA_CHECKS
// Note that this module can't be compiled with debug symbols without also enabling
// EXTRA_CHECKS. The result is that execution time is significantly slowed down
// when debug mode is enabled. But, gcc will bomb out with a register allocation
// error if EXTRA_CHECKS is not enabled, so there is little choice.
#ifndef __OPTIMIZE__
// Automatically define EXTRA_CHECKS when not optimizing (in debug mode)
# define EXTRA_CHECKS
#endif // __OPTIMIZE__
// "No stack" safe assert macro, invokes inlined function call and restores
// the registers. This macro is needed to use ASSERT logic in a function with no free registers.
#if defined(USE_INLINE_ARM_ASM)
#ifdef EXTRA_CHECKS
static uint32_t r0r1r2r3r4r5r6r8r10r11r12r14[12];
#endif
#endif // USE_INLINE_ARM_ASM
#ifndef __OPTIMIZE__
//__attribute__ ((noinline))
// Inlined assert helper invoked by MAXVID_ASSERT. When the condition is
// zero it deliberately faults by storing through a NULL pointer, so that
// a debugger stops at the exact failure point. The ARM-asm variant avoids
// touching the stack so it can be used in functions with no free registers.
static inline
void maxvid_test_assert_util_c4(int cond) {
#if defined(USE_INLINE_ARM_ASM)
#if 0 && defined(EXTRA_CHECKS)
// Disabled variant (the "#if 0" keeps it out of the build): restores the
// register set previously saved in r0r1r2r3r4r5r6r8r10r11r12r14 before
// faulting. Kept for reference alongside the MAXVID_ASSERT save/restore.
__asm__ __volatile__ (
"cmp r0, #0\n\t"
"mov r9, %[ptr]\n\t"
"ldm r9, {r0, r1, r2, r3, r4, r5, r6, r8, r10, r11, r12, r14}\n\t"
"pop {r9}\n\t"
"moveq r0, #0\n\t"
"streq r0, [r0]\n\t"
:
:
[ptr] "l" (r0r1r2r3r4r5r6r8r10r11r12r14)
);
#else
// NOTE(review): this asm tests r0 directly instead of naming 'cond' as an
// input operand; it presumably relies on the AAPCS placing the first
// argument in r0 and on this call not being optimized away -- confirm.
// If r0 == 0: moveq sets r0 to 0 and streq stores through it, faulting.
__asm__ __volatile__ (
"cmp r0, #0\n\t"
"moveq r0, #0\n\t"
"streq r0, [r0]\n\t"
);
#endif // EXTRA_CHECKS
#else // USE_INLINE_ARM_ASM
// Portable C path: fault via a volatile NULL write on assert failure.
if (cond == 0) {
// This is handy so that one can set a breakpoint on this assert call
*((volatile uint32_t*)NULL) = 0;
}
#endif // USE_INLINE_ARM_ASM
return;
}
#endif // __OPTIMIZE__
// "No stack" safe assert macro. The ARM variant spills the live register
// set around the call to maxvid_test_assert_util_c4() and restores it
// afterwards, so asserting does not disturb register allocation in code
// that has no free registers. Note that 'cstr' is a descriptive label
// only: neither expansion actually evaluates it.
#undef MAXVID_ASSERT
#if defined(USE_INLINE_ARM_ASM)
// Sequence: (1) push r9 plus r0-r6/r8/r10-r12/r14 onto the stack;
// (2) pop them back and stm them into the static save area
//     r0r1r2r3r4r5r6r8r10r11r12r14, using r9 as the base pointer;
// (3) run the assert helper; (4) ldm the saved values back from the
// save area so every register holds its pre-assert contents.
# define MAXVID_ASSERT(cond, cstr) \
__asm__ __volatile__ ( \
"nop\n\t" \
"push {r9}\n\t" \
"push {r0, r1, r2, r3, r4, r5, r6, r8, r10, r11, r12, r14}\n\t" \
: \
: \
); \
__asm__ __volatile__ ( \
"mov r9, %[ptr]\n\t" \
"pop {r0, r1, r2, r3, r4, r5, r6, r8, r10, r11, r12, r14}\n\t" \
"stm r9, {r0, r1, r2, r3, r4, r5, r6, r8, r10, r11, r12, r14}\n\t" \
"pop {r9}\n\t" \
: \
: \
[ptr] "l" (r0r1r2r3r4r5r6r8r10r11r12r14) \
); \
maxvid_test_assert_util_c4(cond); \
__asm__ __volatile__ ( \
"push {r9}\n\t" \
"mov r9, %[ptr]\n\t" \
"ldm r9, {r0, r1, r2, r3, r4, r5, r6, r8, r10, r11, r12, r14}\n\t" \
"pop {r9}\n\t" \
"nop\n\t" \
: \
: \
[ptr] "l" (r0r1r2r3r4r5r6r8r10r11r12r14) \
);
#else
// No inline ARM asm: nothing to preserve, just invoke the helper directly.
# define MAXVID_ASSERT(cond, cstr) maxvid_test_assert_util_c4(cond);
#endif // USE_INLINE_ARM_ASM
// Create optimized impl
// This template is used to create a test and an optimized version of the c4 decode sample logic.
// This can't be implemented with the inline trick used elsewhere because the compiler runs
// out of registers while inlining static functions.
// maxvid_decode_c4_sample16
// Decode input RLE, input data is already validated
__attribute__ ((noinline))
uint32_t
FUNCTION_NAME(MODULE_PREFIX, decode_c4_sample16) (
uint16_t * restrict frameBuffer16Arg,
const uint32_t * restrict inputBuffer32Arg,
const uint32_t inputBuffer32NumWords,
const uint32_t frameBufferSize)
{
// Usable registers:
// r0 -> r3 (scratch, compiler will write over these registers at sneaky times)
// r4 -> r10 (r7 in thumb mode is the frame pointer, gdb uses r7 in arm mode)
// r11 is the frame pointer in ARM mode
// r12 tmp register (can't seem to bind to this register)
// r13 stack pointer (only usable if no stack use in function)
// r14 link register (gcc runs out of registers if you use this one)
// r15 is the program counter (don't use)
#if !defined(USE_INLINE_ARM_ASM) || defined(EXTRA_CHECKS)
const uint32_t copyOnePixelHighHalfWord = (((uint32_t)COPY) << 14 | 0x1);
const uint32_t dupTwoPixelsHighHalfWord = (((uint32_t)DUP) << 14 | 0x2);
const uint32_t extractNumPixelsHighHalfWord = 0x3FFF;
#endif // !USE_INLINE_ARM_ASM || EXTRA_CHECKS
#if defined(EXTRA_CHECKS)
// Double check that half word constants are not just totally wrong
{
//uint32_t expectedDup2Word = maxvid16_c4_code(DUP, 2, 0xFFFF);
uint32_t expectedDup2Word = 0x4002FFFF;
uint16_t expectedDup2HighHalfWord = (expectedDup2Word >> 16);
if (dupTwoPixelsHighHalfWord != expectedDup2HighHalfWord) {
MAXVID_ASSERT(dupTwoPixelsHighHalfWord == expectedDup2HighHalfWord, "dupTwoPixelsHighHalfWord");
}
}
{
//uint32_t expectedCopy1Word = maxvid16_c4_code(COPY, 1, 0xFFFF);
uint32_t expectedCopy1Word = 0x8001ffff;
uint16_t expectedCopy1HighHalfWord = (expectedCopy1Word >> 16);
if (copyOnePixelHighHalfWord != expectedCopy1HighHalfWord) {
MAXVID_ASSERT(copyOnePixelHighHalfWord == expectedCopy1HighHalfWord, "copyOnePixelHighHalfWord");
}
}
#endif
#if 1 && defined(USE_INLINE_ARM_ASM)
register uint32_t * restrict inputBuffer32 __asm__ ("r9") = (uint32_t * restrict) inputBuffer32Arg;
register uint16_t * restrict frameBuffer16 __asm__ ("r10") = frameBuffer16Arg;
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 == inputBuffer32Arg, "inputBuffer32Arg");
MAXVID_ASSERT(frameBuffer16 == frameBuffer16Arg, "frameBuffer16Arg");
#endif
// This register holds the input word, it is clobbered by the 8 word loop
register uint32_t inW1 __asm__ ("r8"); // AKA WR8
// This register holds the op code during the initial parse, it is clobbered by the 8 word loop
register uint32_t opCode __asm__ ("r6"); // AKA WR7
// This register holds the numPixels value during a COPY or DUP operation.
// It does not get clobbered by the 8 word loop.
register uint32_t numPixels __asm__ ("r12");
// This register counts down the numWords during a COPY or DUP operation,
// it does not get clobbered by the 8 word loop.
register uint32_t numWords __asm__ ("r14");
// This register holds a one pixel constant value, it is not clobbered by the word 8 loop.
#ifdef EXTRA_CHECKS
// Frame and stack pointers needed in debug mode
uint32_t copyOnePixelHighHalfWordConstRegister;
#else // EXTRA_CHECKS
register uint32_t copyOnePixelHighHalfWordConstRegister __asm__ ("r11");
#endif // EXTRA_CHECKS
// These alias vars is used to hold a constant value for use in the DECODE block. Clobbered by the word8 loop.
register uint32_t extractNumPixelsHighHalfWordConstRegister __asm__ ("r4"); // AKA WR5
register uint32_t dupTwoPixelsHighHalfWordConstRegister __asm__ ("r5"); // AKA WR6
// Explicitly define the registers outside the r0 -> r3 range
// These registers are used with ldm and stm instructions.
// During a write loop, these values could write over other
// values mapped to the same registers. Note that we skip r7
// since gdb uses it for debugging. Also be aware that gcc
// could secretly write over the value in r0 to r3 in
// debug mode.
register uint32_t WR1 __asm__ ("r0");
register uint32_t WR2 __asm__ ("r1");
register uint32_t WR3 __asm__ ("r2");
register uint32_t WR4 __asm__ ("r3");
register uint32_t WR5 __asm__ ("r4");
register uint32_t WR6 __asm__ ("r5");
register uint32_t WR7 __asm__ ("r6");
register uint32_t WR8 __asm__ ("r8");
// gcc is buggy when it comes to initializing register variables. Explicitly initialize the
// registers with inline ASM. This is required to avoid problems with the optimizer removing
// init code because it incorrectly thinks assignments are aliases.
__asm__ __volatile__ (
"mov %[inW1], #0\n\t"
:
[inW1] "+l" (inW1)
);
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == 0, "inW1");
#endif
#else // USE_INLINE_ARM_ASM
register uint32_t * restrict inputBuffer32 = (uint32_t * restrict) inputBuffer32Arg;
register uint16_t * restrict frameBuffer16 = frameBuffer16Arg;
register uint32_t inW1 = 0;
uint32_t opCode;
register uint32_t numPixels;
register uint32_t numWords;
register uint32_t WR1 = 0;
register uint32_t WR2;
register uint32_t WR3;
register uint32_t WR4 = 0;
register uint32_t WR5;
#endif // USE_INLINE_ARM_ASM
// Init constants in registers
// FIXME: If DUP were 2 and COPY were 1, only a single right shift would be needed
// to convert from COPY1 to DUP2. Could also do 0xC003 with an xor. But, all this
// could be done in 1 immediate compare is ccode and numPixels were switchedin the word
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
"mov %[constReg1], #1\n\t"
"mvn %[constReg2], #0xC000\n\t"
"orr %[constReg1], %[constReg1], %[constReg1], lsl #15\n\t"
"mov %[constReg3], #2\n\t"
"orr %[constReg3], %[constReg3], %[constReg1], lsr #1\n\t"
:
[constReg1] "+l" (copyOnePixelHighHalfWordConstRegister),
[constReg2] "+l" (extractNumPixelsHighHalfWordConstRegister),
[constReg3] "+l" (dupTwoPixelsHighHalfWordConstRegister)
);
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(copyOnePixelHighHalfWordConstRegister == copyOnePixelHighHalfWord, "copyOnePixelHighHalfWordConstRegister");
MAXVID_ASSERT(dupTwoPixelsHighHalfWordConstRegister == dupTwoPixelsHighHalfWord, "dupTwoPixelsHighHalfWordConstRegister");
MAXVID_ASSERT(extractNumPixelsHighHalfWordConstRegister == ((0xFFFF << 16) | extractNumPixelsHighHalfWord), "extractNumPixelsHighHalfConstRegister");
#endif // EXTRA_CHECKS
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
const int pagesize = getpagesize();
#if __LP64__
MAXVID_ASSERT((MV_PAGESIZE % pagesize) == 0, "pagesize");
#else
MAXVID_ASSERT(pagesize == MV_PAGESIZE/4, "pagesize");
#endif // __LP64__
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
// The input buffer must be word aligned
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 initial alignment");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
// The framebuffer must be word aligned to start out with
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint32_t)) == 0, "frameBuffer16 initial alignment");
// In addition, the framebuffer must begin on a page boundry
MAXVID_ASSERT(UINTMOD(frameBuffer16, pagesize) == 0, "frameBuffer16 initial page alignment");
MAXVID_ASSERT(frameBufferSize > 0, "frameBufferSize");
uint16_t * restrict inframeBuffer16 = frameBuffer16;
uint16_t * restrict frameBuffer16Max = frameBuffer16 + frameBufferSize;
uint32_t * restrict inInputBuffer32 = (uint32_t *)inputBuffer32;
uint32_t * restrict inputBuffer32Max = inInputBuffer32 + inputBuffer32NumWords;
// inputBuffer32 - inInputBuffer32 gives the input word offset
MAXVID_ASSERT(inInputBuffer32 != NULL, "inInputBuffer32");
// Init to phony value
uint32_t * restrict prevInputBuffer32 = inInputBuffer32 - 1;
// Verify that the DONE code appears at the end of the input, followed by a zero word.
MAXVID_ASSERT(*(inputBuffer32Max - 1) == (DONE << 30), "DONE");
// These stack values save the expected contents of registers on the stack, to double check that
// the values were not between the time they were set and when they were used.
uint32_t opCodeSaved;
uint32_t numPixelsSaved;
uint32_t numWordsSaved;
uint32_t inW1Saved;
uint32_t pixel32Saved;
#endif
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
// inputBuffer32 should be 1 word ahead of the previous read (ignored in COPY case)
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
prevInputBuffer32 = inputBuffer32;
#endif
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
// inW1 = *inputBuffer32++
"ldr %[inW1], [%[inputBuffer32]], #4\n\t"
:
[inW1] "+l" (inW1),
[inputBuffer32] "+l" (inputBuffer32)
);
#else
inW1 = *inputBuffer32++;
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
inW1Saved = inW1;
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
#endif
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
"@ goto DECODE_16BPP\n\t"
:
);
#endif // USE_INLINE_ARM_ASM
goto DECODE_16BPP;
DUP_16BPP:
// This block is just here to work around what appears to be a compiler bug related to a label.
{
}
#ifdef USE_INLINE_ARM_ASM
__asm__ __volatile__ (
"@ DUP\n\t"
);
#endif // USE_INLINE_ARM_ASM
// Word align the framebuffer, if needed.
// Note that DUP2 was handled inline already.
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
MAXVID_ASSERT(numPixels != 0, "numPixels != 0");
MAXVID_ASSERT(numPixels != 1, "numPixels != 1");
MAXVID_ASSERT(numPixels > 2, "numPixels > 2");
#endif
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
MAXVID_ASSERT((((frameBuffer16 + numPixels - 1) - inframeBuffer16) < frameBufferSize), "MV16_CODE_COPY past end of framebuffer");
#endif
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
#endif // EXTRA_CHECKS
// Duplicate the 16 bit pixel as a pair of 32 bit pixels in the first write register
#define pixel32Alias WR1
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(((frameBuffer16 - inframeBuffer16) < frameBufferSize), "word align: already past end of framebuffer");
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
#endif // EXTRA_CHECKS
#if defined(USE_INLINE_ARM_ASM)
// Duplicate the 16 bit pixel as a pair of 32 bit pixels in the first write register.
// This logic is mixed into the framebuffer align between the compare and the
// conditional instruction in an attempt to get better pipeline results.
__asm__ __volatile__ (
"tst %[frameBuffer16], #3\n\t"
"pkhbt %[pixel32], %[inW1], %[inW1], lsl #16\n\t"
"subne %[numPixels], %[numPixels], #1\n\t"
"strneh %[inW1], [%[frameBuffer16]], #2\n\t"
:
[frameBuffer16] "+l" (frameBuffer16),
[numPixels] "+l" (numPixels),
[pixel32] "+l" (pixel32Alias)
:
[inW1] "l" (inW1)
);
#else // USE_INLINE_ARM_ASM
if (UINTMOD(frameBuffer16, 4) != 0) {
// Framebuffer is half word aligned, write 16 bit pixel in the low half word
*frameBuffer16++ = inW1;
numPixels--;
}
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
numPixelsSaved = numPixels;
pixel32Saved = pixel32Alias;
MAXVID_ASSERT(pixel32Alias == pixel32Saved, "pixel32Saved");
// DUP numPixels min was 3, now min is 2 because of word alignment
MAXVID_ASSERT(numPixels >= 2, "numPixels >= 2");
#endif // EXTRA_CHECKS
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
MAXVID_ASSERT(opCode == opCodeSaved, "opCodeSaved");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
#endif // EXTRA_CHECKS
// numWords is numPixels/2, counts down to zero in the word8 loop.
// num is a 14 bit number that indicates the number of pixels to copy.
// This logic must appear after the framebuffer has been word aligned
// since that logic can decrement the numPixels by 1 in the
// unaligned case.
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
#endif
#ifdef USE_INLINE_ARM_ASM
__asm__ __volatile__ (
"mov %[numWords], %[numPixels], lsr #1\n\t"
:
[numWords] "+l" (numWords)
:
[numPixels] "l" (numPixels)
);
#else // USE_INLINE_ARM_ASM
// Note that the inline ASM above is needed to avoid stack use in conditional case
numWords = (numPixels >> 1);
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
numWordsSaved = numWords;
MAXVID_ASSERT(numWordsSaved == (numPixels >> 1), "numWordsSaved");
MAXVID_ASSERT(numPixels > numWords, "numPixels > numPixels");
// numPixels is a 14 bit number, so numWords can't be larger than 0x3FFF / 2
MAXVID_ASSERT(numWords <= (0x3FFF >> 1), "numWords");
// The min num pixels at this point is 2, so min number of words is 1, zero is not a valid value
MAXVID_ASSERT(numWords >= 1, "numWords >= 1");
#endif
// pixel32
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
#endif // EXTRA_CHECKS
#if defined(USE_INLINE_ARM_ASM)
// Copy the low half word into the high half with with 1 ASM instruction, instead of 2
// PKHBT r0, r3, r5, LSL #16 ; combine the bottom halfword of r3 with the bottom halfword of r5
// __asm__ __volatile__ (
// "pkhbt %[pixel32], %[inW1], %[inW1], lsl #16\n\t"
// :
// [pixel32] "+l" (pixel32Alias)
// :
// [inW1] "l" (inW1)
// );
#else // USE_INLINE_ARM_ASM
pixel32Alias = (uint16_t) inW1;
pixel32Alias |= (inW1 << 16);
#ifdef EXTRA_CHECKS
pixel32Saved = pixel32Alias;
#endif // EXTRA_CHECKS
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(pixel32Alias == pixel32Saved, "pixel32Saved");
MAXVID_ASSERT(pixel32Alias == (((uint16_t) inW1) | (inW1 << 16)), "pixel32");
#endif // EXTRA_CHECKS
// Read next word into inW1, this is with enough latency that fall through to DECODE will not be delayed
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
// inputBuffer32 should be 1 word ahead of the previous read (ignored in COPY case)
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
prevInputBuffer32 = inputBuffer32;
#endif
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
// inW1 = *inputBuffer32++
"ldr %[inW1], [%[inputBuffer32]], #4\n\t"
:
[inW1] "+l" (inW1),
[inputBuffer32] "+l" (inputBuffer32)
);
#else
inW1 = *inputBuffer32++;
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
inW1Saved = inW1;
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
#endif
// DUPBIG_16BPP : branch forward to handle the case of a large number of words to DUP.
// The code in this path is optimized for 6 words or fewer. (12 pixels)
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numWords == numWordsSaved, "numWordsSaved");
MAXVID_ASSERT(numWords > 0, "numWords");
#endif
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
"@ if (numWords > 6) goto DUPBIG_16BPP\n\t"
:
);
#endif // USE_INLINE_ARM_ASM
if (numWords > 6) {
goto DUPBIG_16BPP;
}
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
"@ DUPSMALL_16BPP\n\t"
);
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numWords == numWordsSaved, "numWordsSaved");
MAXVID_ASSERT(numWords >= 1, "numWords");
MAXVID_ASSERT(numWords <= 6, "numWords");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint32_t)) == 0, "frameBuffer16 alignment");
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
MAXVID_ASSERT(numWords == numWordsSaved, "numWordsSaved");
uint16_t *expectedDUPSmallPost8FrameBuffer16 = frameBuffer16;
expectedDUPSmallPost8FrameBuffer16 += (numWords * 2);
uint16_t *expectedDUPSmallFinalFrameBuffer16 = frameBuffer16 + numPixels;
MAXVID_ASSERT((expectedDUPSmallFinalFrameBuffer16 == expectedDUPSmallPost8FrameBuffer16) ||
(expectedDUPSmallFinalFrameBuffer16 == expectedDUPSmallPost8FrameBuffer16+1), "expected pointers");
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
MAXVID_ASSERT(numWords == numWordsSaved, "numWordsSaved");
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
MAXVID_ASSERT(numWords == numWordsSaved, "numWordsSaved");
if (numWords >= 3) {
MAXVID_ASSERT((numWords - 3) <= 3, "numWords - 3");
}
#endif
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
"mov %[wr2], %[wr1]\n\t"
// if (numWords >= 3) then write 3 words
"cmp %[numWords], #2\n\t"
"mov %[wr3], %[wr1]\n\t"
"subgt %[numWords], %[numWords], #3\n\t"
"stmgt %[frameBuffer16]!, {%[wr1], %[wr2], %[wr3]}\n\t"
// if (numWords >= 2) then write 2 words
"cmp %[numWords], #1\n\t"
"stmgt %[frameBuffer16], {%[wr1], %[wr2]}\n\t"
// frameBuffer32 += numPixels;
"add %[frameBuffer16], %[frameBuffer16], %[numWords], lsl #2\n\t"
// if (numWords == 1 || numWords == 3) then write 1 word
"tst %[numWords], #0x1\n\t"
"strne %[wr1], [%[frameBuffer16], #-4]\n\t"
:
[frameBuffer16] "+l" (frameBuffer16),
[numWords] "+l" (numWords),
[wr1] "+l" (WR1),
[wr2] "+l" (WR2),
[wr3] "+l" (WR3)
);
#else // USE_INLINE_ARM_ASM
{
if (numWords >= 3) {
*((uint32_t*)frameBuffer16) = pixel32Alias;
*(((uint32_t*)frameBuffer16) + 1) = pixel32Alias;
*(((uint32_t*)frameBuffer16) + 2) = pixel32Alias;
frameBuffer16 += (3 * 2);
numWords -= 3;
}
if (numWords >= 2) {
*((uint32_t*)frameBuffer16) = pixel32Alias;
*(((uint32_t*)frameBuffer16) + 1) = pixel32Alias;
}
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numWords >=0 && numWords <= 3, "numWords must be in range 0 to 3 here");
#endif
frameBuffer16 += (numWords << 1);
if (numWords & 0x1) {
*(((uint32_t*)frameBuffer16) - 1) = pixel32Alias;
}
}
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint32_t)) == 0, "frameBuffer16 alignment");
MAXVID_ASSERT(frameBuffer16 == expectedDUPSmallPost8FrameBuffer16, "frameBuffer16 post8");
MAXVID_ASSERT(numWords >= 0 && numWords <= 3, "numWords");
#endif
// Emit trailing single pixel, if needed
#if defined(USE_INLINE_ARM_ASM)
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(numPixels == numPixelsSaved, "numPixelsSaved");
if (numPixels & 0x1) {
MAXVID_ASSERT(((frameBuffer16 - inframeBuffer16) < frameBufferSize), "DUP already past end of framebuffer");
}
#endif // EXTRA_CHECKS
__asm__ __volatile__ (
"tst %[numPixels], #1\n\t"
"strneh %[pixel32], [%[outPtr]], #2\n\t"
:
[outPtr] "+l" (frameBuffer16),
[numPixels] "+l" (numPixels),
[pixel32] "+l" (pixel32Alias)
);
#else // USE_INLINE_ARM_ASM
// By default, gcc would emit a conditional branch backwards,
// then the half word assign followed by an unconditional
// branch backwards. Putting the NOP asm in makes gcc
// emit the one conditional instruction folowed by an
// unconditional branch backwards.
if (numPixels & 0x1) {
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(((frameBuffer16 - inframeBuffer16) < frameBufferSize), "DUP already past end of framebuffer");
#endif // EXTRA_CHECKS
*frameBuffer16++ = pixel32Alias;
}
ASM_NOP
#endif // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
MAXVID_ASSERT(frameBuffer16 == expectedDUPSmallFinalFrameBuffer16, "frameBuffer16 final");
#endif
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
#endif // EXTRA_CHECKS
// Regen constants in registers (not needed in small DUP case)
#if defined(USE_INLINE_ARM_ASM)
// __asm__ __volatile__ (
// "mov %[constReg3], #2\n\t"
// "mvn %[constReg2], #0xC000\n\t"
// "orr %[constReg3], %[constReg3], %[constReg1], lsr #1\n\t"
// :
// [constReg1] "+l" (copyOnePixelHighHalfWordConstRegister),
// [constReg2] "+l" (extractNumPixelsHighHalfWordConstRegister),
// [constReg3] "+l" (dupTwoPixelsHighHalfWordConstRegister)
// );
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(copyOnePixelHighHalfWordConstRegister == copyOnePixelHighHalfWord, "copyOnePixelHighHalfWordConstRegister");
MAXVID_ASSERT(dupTwoPixelsHighHalfWordConstRegister == dupTwoPixelsHighHalfWord, "dupTwoPixelsHighHalfWordConstRegister");
MAXVID_ASSERT(extractNumPixelsHighHalfWordConstRegister == ((0xFFFF << 16) | extractNumPixelsHighHalfWord), "extractNumPixelsHighHalfConstRegister");
#endif // EXTRA_CHECKS
#endif // USE_INLINE_ARM_ASM
#if defined(USE_INLINE_ARM_ASM)
__asm__ __volatile__ (
"@ fall through to DECODE\n\t"
:
);
#endif // USE_INLINE_ARM_ASM
DECODE_16BPP:
#ifdef USE_INLINE_ARM_ASM
// These checks are done before the read, after the DECODE_16BPP label
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
// inputBuffer32 should be 1 word ahead of the previous read (ignored in COPY case)
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
#endif
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
MAXVID_ASSERT(copyOnePixelHighHalfWordConstRegister == copyOnePixelHighHalfWord, "copyOnePixelHighHalfWordConstRegister");
MAXVID_ASSERT(dupTwoPixelsHighHalfWordConstRegister == dupTwoPixelsHighHalfWord, "dupTwoPixelsHighHalfWordConstRegister");
#endif // EXTRA_CHECKS
// This impl of the inline ASM no longer matches the optimized implementation. Too much
// of the structure has changed to keep this inline block and the optimized ARM asm in sync.
// The optimized assembly does forward branches instead of conditional instrs as it is faster.
__asm__ __volatile__ (
"@ DECODE_16BPP\n\t"
"2:\n\t"
"@ if ((opCode = (inW1 >> 30)) == SKIP) ...\n\t"
"movs %[opCode], %[inW1], lsr #30\n\t"
"addeq %[frameBuffer16], %[frameBuffer16], %[inW1], lsl #1\n\t"
"ldreq %[inW1], [%[inputBuffer32]], #4\n\t"
"beq 2b\n\t"
"@ if (COPY1 == (inW1 >> 16)) ...\n\t"
"cmp %[copyOnePixelHighHalfWordConstRegister], %[inW1], lsr #16\n\t"
"streqh %[inW1], [%[frameBuffer16]], #2\n\t"
"ldreq %[inW1], [%[inputBuffer32]], #4\n\t"
"beq 2b\n\t"
"@ if (DUP2 == (inW1 >> 16)) ...\n\t"
"cmp %[dupTwoPixelsHighHalfWordConstRegister], %[inW1], lsr #16\n\t"
"streqh %[inW1], [%[frameBuffer16]], #2\n\t"
"streqh %[inW1], [%[frameBuffer16]], #2\n\t"
"ldreq %[inW1], [%[inputBuffer32]], #4\n\t"
"beq 2b\n\t"
:
[inputBuffer32] "+l" (inputBuffer32),
[inW1] "+l" (inW1),
[frameBuffer16] "+l" (frameBuffer16),
[opCode] "+l" (opCode),
[copyOnePixelHighHalfWordConstRegister] "+l" (copyOnePixelHighHalfWordConstRegister),
[dupTwoPixelsHighHalfWordConstRegister] "+l" (dupTwoPixelsHighHalfWordConstRegister)
);
#ifdef EXTRA_CHECKS
prevInputBuffer32 = inputBuffer32 - 1;
inW1Saved = inW1;
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
#endif
#ifdef EXTRA_CHECKS
opCodeSaved = opCode;
#endif // EXTRA_CHECKS
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
MAXVID_ASSERT(copyOnePixelHighHalfWordConstRegister == copyOnePixelHighHalfWord, "copyOnePixelHighHalfWordConstRegister");
MAXVID_ASSERT(dupTwoPixelsHighHalfWordConstRegister == dupTwoPixelsHighHalfWord, "dupTwoPixelsHighHalfWordConstRegister");
#endif // EXTRA_CHECKS
#else // USE_INLINE_ARM_ASM
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
#endif
if ((opCode = (inW1 >> 30)) == SKIP) {
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(((frameBuffer16 - inframeBuffer16) < frameBufferSize), "SKIP already past end of framebuffer");
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
#endif // EXTRA_CHECKS
// SKIP over N 16bit pixels
frameBuffer16 += inW1;
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(((frameBuffer16 - inframeBuffer16) <= frameBufferSize), "post SKIP now past end of framebuffer");
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
// inputBuffer32 should be 1 word ahead of the previous read (ignored in COPY case)
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
prevInputBuffer32 = inputBuffer32;
#endif
inW1 = *inputBuffer32++;
#ifdef EXTRA_CHECKS
inW1Saved = inW1;
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
#endif
goto DECODE_16BPP;
}
#ifdef EXTRA_CHECKS
opCodeSaved = opCode;
#endif // EXTRA_CHECKS
// FIXME: if the code were on the low part of the word, would only need to
// shift by 2 to get the number. Instead of this having to compare to
// a big constant. Could be a compare to 0x5 in this case.
// Use WR2 = r1 as a scratch tmp to check for COPY1
WR2 = copyOnePixelHighHalfWord;
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
MAXVID_ASSERT(WR2 == copyOnePixelHighHalfWord, "copyOnePixelHighHalfWord");
#endif // EXTRA_CHECKS
if (WR2 == (inW1 >> 16))
// if (num == 1 && opCode == COPY)
{
// Special case where a COPY operation operates on only one 16 bit pixel.
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(((frameBuffer16 - inframeBuffer16) < frameBufferSize), "COPY already past end of framebuffer");
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
#endif // EXTRA_CHECKS
*frameBuffer16++ = inW1;
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inputBuffer32 != NULL, "inputBuffer32");
MAXVID_ASSERT(UINTMOD(inputBuffer32, sizeof(uint32_t)) == 0, "inputBuffer32 alignment");
MAXVID_ASSERT(frameBuffer16 != NULL, "frameBuffer16");
MAXVID_ASSERT(UINTMOD(frameBuffer16, sizeof(uint16_t)) == 0, "frameBuffer16 alignment");
// inputBuffer32 should be 1 word ahead of the previous read (ignored in COPY case)
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
prevInputBuffer32 = inputBuffer32;
#endif
inW1 = *inputBuffer32++;
#ifdef EXTRA_CHECKS
inW1Saved = inW1;
MAXVID_ASSERT(inputBuffer32 == (prevInputBuffer32 + 1), "inputBuffer32 != previous");
#endif
goto DECODE_16BPP;
}
// Use WR2 = r1 as a scratch tmp to check for DUP2
WR2 = dupTwoPixelsHighHalfWord;
#ifdef EXTRA_CHECKS
MAXVID_ASSERT(inW1 == inW1Saved, "inW1Saved");
MAXVID_ASSERT(WR2 == dupTwoPixelsHighHalfWord, "dupTwoPixelsHighHalfWord");
#endif // EXTRA_CHECKS
if (WR2 == (inW1 >> 16))
// if (num == 2 && opCode == DUP)
{