From fc023c880ce1d6c908fb78ccc25f5d5fd910ccc5 Mon Sep 17 00:00:00 2001
From: mayeut
Date: Tue, 12 Jan 2016 22:37:45 +0100
Subject: [PATCH] NEON SIMD implementation of Huffman encoding

Compression speedups relative to libjpeg-turbo master 2016-01-12
(f3a8684cd1c28e557d394470962a7a224c76ddbc), measured with
nightshot_iso_100.ppm, Q95, 4:4:4 subsampling:

  iPhone 5S arm:   26%
  iPhone 5S arm64: 17%

The speed-up on arm64 is limited by functions other than Huffman encoding.
Refer to libjpeg-turbo/libjpeg-turbo#44 for more details.
---
 simd/jsimd.h            |   6 +-
 simd/jsimd_arm.c        |  12 +-
 simd/jsimd_arm64.c      |  12 +-
 simd/jsimd_arm64_neon.S | 265 ++++++++++++++++++++++++
 simd/jsimd_arm_neon.S   | 435 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 727 insertions(+), 3 deletions(-)

diff --git a/simd/jsimd.h b/simd/jsimd.h
index 04277fc69..e259feab8 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -5,7 +5,7 @@
  * Copyright (C) 2011, 2014-2016 D. R. Commander
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
  * Copyright (C) 2014 Linaro Limited
- * Copyright (C) 2015 Matthieu Darbois
+ * Copyright (C) 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -835,3 +835,7 @@ extern const int jconst_huff_encode_one_block[];
 EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
   (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
    c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
+  (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index e715291e4..acb1f7e81 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2009-2011, 2013-2014 D. R. Commander
- * Copyright 2015 Matthieu Darbois
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -711,6 +711,14 @@ jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 GLOBAL(int)
 jsimd_can_huff_encode_one_block (void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
   return 0;
 }
 
@@ -719,5 +727,7 @@ jsimd_huff_encode_one_block (void * state, JOCTET *buffer, JCOEFPTR block,
                              int last_dc_val, c_derived_tbl *dctbl,
                              c_derived_tbl *actbl)
 {
+  if (simd_support & JSIMD_ARM_NEON)
+    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, dctbl, actbl);
   return NULL;
 }
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 585feeb97..3f5d128be 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2009-2011, 2013-2014 D. R. Commander
- * Copyright 2015 Matthieu Darbois
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -547,6 +547,14 @@ jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 GLOBAL(int)
 jsimd_can_huff_encode_one_block (void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
   return 0;
 }
 
@@ -555,5 +563,7 @@ jsimd_huff_encode_one_block (void * state, JOCTET *buffer, JCOEFPTR block,
                              int last_dc_val, c_derived_tbl *dctbl,
                              c_derived_tbl *actbl)
 {
+  if (simd_support & JSIMD_ARM_NEON)
+    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, dctbl, actbl);
   return NULL;
 }
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 89fc66031..67395694c 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -7,6 +7,7 @@
  * Copyright (C) 2013-2014, Linaro Limited
  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
  * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty. In no event will the authors be held liable for any damages
@@ -1055,6 +1056,7 @@ asm_function jsimd_idct_ifast_neon
     .unreq TMP2
     .unreq TMP3
     .unreq TMP4
+    .unreq TMP5
 
 /*****************************************************************************/
 
@@ -1859,3 +1861,266 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .
 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b
 .purgem do_load
 .purgem do_store
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer,
+ *                              JCOEFPTR block, int last_dc_val,
+ *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+.macro emit_byte
+    sub x7, x7, #0x8            /* put_bits -= 8 */
+    lsr x19, x6, x7
+    uxtb w19, w19
+    strb w19, [x1, #1]!
+    cmp w19, #0xff
+    bne 14f
+    strb wzr, [x1, #1]!
+14:
+.endm
+.macro put_bits CODE, SIZE
+    lsl x6, x6, \SIZE
+    add x7, x7, \SIZE
+    orr x6, x6, \CODE
+.endm
+.macro checkbuf31
+    cmp x7, #0x20
+    blt 31f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+31:
+.endm
+.macro checkbuf47
+    cmp x7, #0x30
+    blt 47f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+47:
+.endm
+
+.balign 16
+jsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte 0, 1, 2, 3, 16, 17, 32, 33, 18, 19, 4, 5, 6, 7, 20, 21              /* L0 => L3 : 4 lines OK */
+    .byte 34, 35, 48, 49, 255, 255, 50, 51, 36, 37, 22, 23, 8, 9, 10, 11      /* L0 => L3 : 4 lines OK */
+    .byte 8, 9, 22, 23, 36, 37, 50, 51, 255, 255, 255, 255, 255, 255, 52, 53  /* L1 => L4 : 4 lines OK */
+    .byte 54, 55, 40, 41, 26, 27, 12, 13, 14, 15, 28, 29, 42, 43, 56, 57      /* L0 => L3 : 4 lines OK */
+    .byte 6, 7, 20, 21, 34, 35, 48, 49, 50, 51, 36, 37, 22, 23, 8, 9          /* L4 => L7 : 4 lines OK */
+    .byte 42, 43, 28, 29, 14, 15, 30, 31, 44, 45, 58, 59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte 255, 255, 255, 255, 56, 57, 42, 43, 28, 29, 14, 15, 30, 31, 44, 45  /* L3 => L6 : 4 lines OK */
+    .byte 26, 27, 40, 41, 42, 43, 28, 29, 14, 15, 30, 31, 44, 45, 46, 47      /* L5 => L7 : 3 lines OK */
+
+    .byte 255, 255, 255, 255, 0, 1, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
+    .byte 255, 255, 255, 255, 255, 255, 255, 255, 0, 1, 16, 17, 2, 3, 255, 255        /* L5 => L6 : 2 lines OK */
+    .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 8, 9, 22, 23    /* L5 => L6 : 2 lines OK */
+    .byte 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255      /* L7 : 1 line OK */
+
+asm_function jsimd_huff_encode_one_block_neon
+    sub sp, sp, 272
+    /* Save ARM registers */
+    sub x1, x1, #0x1            /* x1 = buffer-- */
+    stp x19, x20, [sp], 16
+    adr x15, jsimd_huff_encode_one_block_neon_consts
+    ldr x6, [x0, #0x10]         /* x6 = put_buffer */
+    mov w13, #0x10
+    ldr w7, [x0, #0x18]         /* x7 = put_bits */
+    mov w14, #0x1
+    ldrsh w12, [x2]
+    mov x11, sp
+    /* prepare data */
+    ld1 {v20.8b}, [x15], #8
+    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+    ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+    ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+    sub x2, x2, #128
+    dup v21.8h, w13
+    dup v22.8h, w14
+    eor v23.16b, v23.16b, v23.16b
+    sub w12, w12, w3            /* last_dc_val, not used afterwards */
+    tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+    tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+    tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+    tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+    tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+    tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+    tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+    tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+    ins v0.h[0], w12
+    tbx v1.16b, {v28.16b}, v16.16b
+    tbx v2.16b, {v29.16b, v30.16b}, v17.16b
+    tbx v5.16b, {v29.16b, v30.16b}, v18.16b
+    tbx v6.16b, {v31.16b}, v19.16b
+    eor v16.16b, v16.16b, v16.16b
+    cmgt v24.8h, v23.8h, v0.8h
+    cmgt v25.8h, v16.8h, v1.8h
+    cmgt v26.8h, v23.8h, v2.8h
+    cmgt v27.8h, v16.8h, v3.8h
+    cmgt v28.8h, v23.8h, v4.8h
+    cmgt v29.8h, v16.8h, v5.8h
+    cmgt v30.8h, v23.8h, v6.8h
+    cmgt v31.8h, v16.8h, v7.8h
+    abs v0.8h, v0.8h
+    abs v1.8h, v1.8h
+    abs v2.8h, v2.8h
+    abs v3.8h, v3.8h
+    abs v4.8h, v4.8h
+    abs v5.8h, v5.8h
+    abs v6.8h, v6.8h
+    abs v7.8h, v7.8h
+    eor v24.16b, v24.16b, v0.16b
+    eor v25.16b, v25.16b, v1.16b
+    eor v26.16b, v26.16b, v2.16b
+    eor v27.16b, v27.16b, v3.16b
+    eor v28.16b, v28.16b, v4.16b
+    eor v29.16b, v29.16b, v5.16b
+    eor v30.16b, v30.16b, v6.16b
+    eor v31.16b, v31.16b, v7.16b
+    clz v0.8h, v0.8h
+    clz v1.8h, v1.8h
+    clz v2.8h, v2.8h
+    clz v3.8h, v3.8h
+    clz v4.8h, v4.8h
+    clz v5.8h, v5.8h
+    clz v6.8h, v6.8h
+    clz v7.8h, v7.8h
+    sub v0.8h, v21.8h, v0.8h
+    sub v1.8h, v21.8h, v1.8h
+    sub v2.8h, v21.8h, v2.8h
+    sub v3.8h, v21.8h, v3.8h
+    sub v4.8h, v21.8h, v4.8h
+    sub v5.8h, v21.8h, v5.8h
+    sub v6.8h, v21.8h, v6.8h
+    sub v7.8h, v21.8h, v7.8h
+    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    sshl v0.8h, v22.8h, v0.8h
+    sshl v1.8h, v22.8h, v1.8h
+    sshl v2.8h, v22.8h, v2.8h
+    sshl v3.8h, v22.8h, v3.8h
+    sshl v4.8h, v22.8h, v4.8h
+    sshl v5.8h, v22.8h, v5.8h
+    sshl v6.8h, v22.8h, v6.8h
+    sshl v7.8h, v22.8h, v7.8h
+    sub v0.8h, v0.8h, v22.8h
+    sub v1.8h, v1.8h, v22.8h
+    sub v2.8h, v2.8h, v22.8h
+    sub v3.8h, v3.8h, v22.8h
+    sub v4.8h, v4.8h, v22.8h
+    sub v5.8h, v5.8h, v22.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v22.8h
+    and v24.16b, v24.16b, v0.16b
+    and v25.16b, v25.16b, v1.16b
+    and v26.16b, v26.16b, v2.16b
+    and v27.16b, v27.16b, v3.16b
+    and v28.16b, v28.16b, v4.16b
+    and v29.16b, v29.16b, v5.16b
+    and v30.16b, v30.16b, v6.16b
+    and v31.16b, v31.16b, v7.16b
+    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+    add x3, x4, #0x400          /* x3 = dctbl->ehufsi */
+    add x15, sp, #0x80          /* x15 = t2 */
+    ldrh w12, [sp]
+    ldrh w13, [x15]
+    ldr w10, [x4, x12, lsl #2]
+    ldrb w11, [x3, x12]
+    checkbuf47
+    put_bits x10, x11
+    checkbuf47
+    put_bits x13, x12
+    cmeq v0.8h, v0.8h, v23.8h
+    cmeq v1.8h, v1.8h, v23.8h
+    cmeq v2.8h, v2.8h, v23.8h
+    cmeq v3.8h, v3.8h, v23.8h
+    cmeq v4.8h, v4.8h, v23.8h
+    cmeq v5.8h, v5.8h, v23.8h
+    cmeq v6.8h, v6.8h, v23.8h
+    cmeq v7.8h, v7.8h, v23.8h
+    xtn v0.8b, v0.8h
+    xtn v1.8b, v1.8h
+    xtn v2.8b, v2.8h
+    xtn v3.8b, v3.8h
+    xtn v4.8b, v4.8h
+    xtn v5.8b, v5.8h
+    xtn v6.8b, v6.8h
+    xtn v7.8b, v7.8h
+    and v0.8b, v0.8b, v20.8b
+    and v1.8b, v1.8b, v20.8b
+    and v2.8b, v2.8b, v20.8b
+    and v3.8b, v3.8b, v20.8b
+    and v4.8b, v4.8b, v20.8b
+    and v5.8b, v5.8b, v20.8b
+    and v6.8b, v6.8b, v20.8b
+    and v7.8b, v7.8b, v20.8b
+    addp v0.8b, v0.8b, v1.8b
+    addp v2.8b, v2.8b, v3.8b
+    addp v4.8b, v4.8b, v5.8b
+    addp v6.8b, v6.8b, v7.8b
+    addp v0.8b, v0.8b, v2.8b
+    addp v4.8b, v4.8b, v6.8b
+    addp v0.8b, v0.8b, v4.8b
+    umov x9, v0.d[0]
+    mvn x9, x9
+    add x4, x5, #0x400          /* x4 = actbl->ehufsi */
+    lsr x9, x9, #0x1            /* clear DC coeff bit */
+    ldr w13, [x5, #0x3c0]       /* x13 = actbl->ehufco[0xf0] */
+    rbit x9, x9                 /* x9 = index0 */
+    ldrb w14, [x4, #0xf0]       /* x14 = actbl->ehufsi[0xf0] */
+    cmp x9, #0x0
+    beq 6f
+1:
+    clz x2, x9
+    add x15, x15, x2, lsl #1
+    lsl x9, x9, x2
+    ldrh w11, [x15, #-126]
+2:
+    cmp x2, #0x10
+    blt 3f
+    sub x2, x2, #0x10
+    checkbuf47
+    put_bits x13, x14
+    b 2b
+3:
+    add x2, x11, x2, lsl #4
+    ldrh w3, [x15, #2]!
+    lsl x9, x9, #0x1
+    ldr w12, [x5, x2, lsl #2]
+    ldrb w10, [x4, x2]
+    checkbuf31
+    put_bits x12, x10
+    put_bits x3, x11
+    cbnz x9, 1b
+6:
+    add x13, sp, #0xfe
+    cmp x15, x13
+    bhs 1f
+    ldr w12, [x5]
+    ldrb w14, [x4]
+    checkbuf47
+    put_bits x12, x14
+1:
+    sub sp, sp, 16
+    str x6, [x0, #0x10]
+    str w7, [x0, #0x18]
+    ldp x19, x20, [sp], 16
+    add x0, x1, #0x1
+    add sp, sp, 256
+    br x30
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index c83e1c744..242cbffad 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -7,6 +7,7 @@
  * Copyright (C) 2014 Siarhei Siamashka. All Rights Reserved.
  * Copyright (C) 2014 Linaro Limited. All Rights Reserved.
  * Copyright (C) 2015 D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016 Matthieu Darbois. All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty. In no event will the authors be held liable for any damages
@@ -2438,3 +2439,437 @@ asm_function jsimd_h2v1_fancy_upsample_neon
 .purgem upsample16
 .purgem upsample32
 .purgem upsample_row
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer,
+ *                              JCOEFPTR block, int last_dc_val,
+ *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+    sub \PUT_BITS, \PUT_BITS, #0x8
+    lsr \TMP, \PUT_BUFFER, \PUT_BITS
+    uxtb \TMP, \TMP
+    strb \TMP, [\BUFFER, #1]!
+    cmp \TMP, #0xff
+    /*it eq*/
+    streqb \ZERO, [\BUFFER, #1]!
+.endm
+.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+    /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
+    add \PUT_BITS, \SIZE
+    /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
+    orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+.endm
+.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+    cmp \PUT_BITS, #0x10
+    blt 15f
+    eor \ZERO, \ZERO, \ZERO
+    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+15:
+.endm
+
+.balign 16
+jsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01
+    .byte 0x02
+    .byte 0x04
+    .byte 0x08
+    .byte 0x10
+    .byte 0x20
+    .byte 0x40
+    .byte 0x80
+
+asm_function jsimd_huff_encode_one_block_neon
+    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+    add r7, sp, #0x1c
+    sub r4, sp, #0x40
+    bfc r4, #0, #5
+    mov sp, r4                  /* align sp on 32 bytes */
+    vst1.64 {d8, d9, d10, d11}, [r4, :128]!
+    vst1.64 {d12, d13, d14, d15}, [r4, :128]
+    sub sp, #0x140              /* reserve 320 bytes */
+    str r0, [sp, #0x18]         /* working state => sp + 0x18 */
+    add r4, sp, #0x20           /* r4 = t1 */
+    ldr lr, [r7, #0x8]          /* lr = dctbl */
+    sub r10, r1, #0x1           /* r10 = buffer-- */
+    ldrsh r1, [r2]
+    mov r9, #0x10
+    mov r8, #0x1
+    adr r5, jsimd_huff_encode_one_block_neon_consts
+    /* prepare data */
+    vld1.8 {d26}, [r5, :64]
+    veor q8, q8, q8
+    veor q9, q9, q9
+    vdup.16 q14, r9
+    vdup.16 q15, r8
+    veor q10, q10, q10
+    veor q11, q11, q11
+    sub r1, r1, r3
+    add r9, r2, #0x22
+    add r8, r2, #0x18
+    add r3, r2, #0x36
+    vmov.16 d0[0], r1
+    vld1.16 {d2[0]}, [r9, :16]
+    vld1.16 {d4[0]}, [r8, :16]
+    vld1.16 {d6[0]}, [r3, :16]
+    add r1, r2, #0x2
+    add r9, r2, #0x30
+    add r8, r2, #0x26
+    add r3, r2, #0x28
+    vld1.16 {d0[1]}, [r1, :16]
+    vld1.16 {d2[1]}, [r9, :16]
+    vld1.16 {d4[1]}, [r8, :16]
+    vld1.16 {d6[1]}, [r3, :16]
+    add r1, r2, #0x10
+    add r9, r2, #0x40
+    add r8, r2, #0x34
+    add r3, r2, #0x1a
+    vld1.16 {d0[2]}, [r1, :16]
+    vld1.16 {d2[2]}, [r9, :16]
+    vld1.16 {d4[2]}, [r8, :16]
+    vld1.16 {d6[2]}, [r3, :16]
+    add r1, r2, #0x20
+    add r9, r2, #0x32
+    add r8, r2, #0x42
+    add r3, r2, #0xc
+    vld1.16 {d0[3]}, [r1, :16]
+    vld1.16 {d2[3]}, [r9, :16]
+    vld1.16 {d4[3]}, [r8, :16]
+    vld1.16 {d6[3]}, [r3, :16]
+    add r1, r2, #0x12
+    add r9, r2, #0x24
+    add r8, r2, #0x50
+    add r3, r2, #0xe
+    vld1.16 {d1[0]}, [r1, :16]
+    vld1.16 {d3[0]}, [r9, :16]
+    vld1.16 {d5[0]}, [r8, :16]
+    vld1.16 {d7[0]}, [r3, :16]
+    add r1, r2, #0x4
+    add r9, r2, #0x16
+    add r8, r2, #0x60
+    add r3, r2, #0x1c
+    vld1.16 {d1[1]}, [r1, :16]
+    vld1.16 {d3[1]}, [r9, :16]
+    vld1.16 {d5[1]}, [r8, :16]
+    vld1.16 {d7[1]}, [r3, :16]
+    add r1, r2, #0x6
+    add r9, r2, #0x8
+    add r8, r2, #0x52
+    add r3, r2, #0x2a
+    vld1.16 {d1[2]}, [r1, :16]
+    vld1.16 {d3[2]}, [r9, :16]
+    vld1.16 {d5[2]}, [r8, :16]
+    vld1.16 {d7[2]}, [r3, :16]
+    add r1, r2, #0x14
+    add r9, r2, #0xa
+    add r8, r2, #0x44
+    add r3, r2, #0x38
+    vld1.16 {d1[3]}, [r1, :16]
+    vld1.16 {d3[3]}, [r9, :16]
+    vld1.16 {d5[3]}, [r8, :16]
+    vld1.16 {d7[3]}, [r3, :16]
+    vcgt.s16 q8, q8, q0
+    vcgt.s16 q9, q9, q1
+    vcgt.s16 q10, q10, q2
+    vcgt.s16 q11, q11, q3
+    vabs.s16 q0, q0
+    vabs.s16 q1, q1
+    vabs.s16 q2, q2
+    vabs.s16 q3, q3
+    veor q8, q8, q0
+    veor q9, q9, q1
+    veor q10, q10, q2
+    veor q11, q11, q3
+    add r9, r4, #0x20
+    add r8, r4, #0x80
+    add r3, r4, #0xa0
+    vclz.i16 q0, q0
+    vclz.i16 q1, q1
+    vclz.i16 q2, q2
+    vclz.i16 q3, q3
+    vsub.i16 q0, q14, q0
+    vsub.i16 q1, q14, q1
+    vsub.i16 q2, q14, q2
+    vsub.i16 q3, q14, q3
+    vst1.16 {d0, d1, d2, d3}, [r4, :256]
+    vst1.16 {d4, d5, d6, d7}, [r9, :256]
+    vshl.s16 q0, q15, q0
+    vshl.s16 q1, q15, q1
+    vshl.s16 q2, q15, q2
+    vshl.s16 q3, q15, q3
+    vsub.i16 q0, q0, q15
+    vsub.i16 q1, q1, q15
+    vsub.i16 q2, q2, q15
+    vsub.i16 q3, q3, q15
+    vand q8, q8, q0
+    vand q9, q9, q1
+    vand q10, q10, q2
+    vand q11, q11, q3
+    vst1.16 {d16, d17, d18, d19}, [r8, :256]
+    vst1.16 {d20, d21, d22, d23}, [r3, :256]
+    add r1, r2, #0x46
+    add r9, r2, #0x3a
+    add r8, r2, #0x74
+    add r3, r2, #0x6a
+    vld1.16 {d8[0]}, [r1, :16]
+    vld1.16 {d10[0]}, [r9, :16]
+    vld1.16 {d12[0]}, [r8, :16]
+    vld1.16 {d14[0]}, [r3, :16]
+    veor q8, q8, q8
+    veor q9, q9, q9
+    veor q10, q10, q10
+    veor q11, q11, q11
+    add r1, r2, #0x54
+    add r9, r2, #0x2c
+    add r8, r2, #0x76
+    add r3, r2, #0x78
+    vld1.16 {d8[1]}, [r1, :16]
+    vld1.16 {d10[1]}, [r9, :16]
+    vld1.16 {d12[1]}, [r8, :16]
+    vld1.16 {d14[1]}, [r3, :16]
+    add r1, r2, #0x62
+    add r9, r2, #0x1e
+    add r8, r2, #0x68
+    add r3, r2, #0x7a
+    vld1.16 {d8[2]}, [r1, :16]
+    vld1.16 {d10[2]}, [r9, :16]
+    vld1.16 {d12[2]}, [r8, :16]
+    vld1.16 {d14[2]}, [r3, :16]
+    add r1, r2, #0x70
+    add r9, r2, #0x2e
+    add r8, r2, #0x5a
+    add r3, r2, #0x6c
+    vld1.16 {d8[3]}, [r1, :16]
+    vld1.16 {d10[3]}, [r9, :16]
+    vld1.16 {d12[3]}, [r8, :16]
+    vld1.16 {d14[3]}, [r3, :16]
+    add r1, r2, #0x72
+    add r9, r2, #0x3c
+    add r8, r2, #0x4c
+    add r3, r2, #0x5e
+    vld1.16 {d9[0]}, [r1, :16]
+    vld1.16 {d11[0]}, [r9, :16]
+    vld1.16 {d13[0]}, [r8, :16]
+    vld1.16 {d15[0]}, [r3, :16]
+    add r1, r2, #0x64
+    add r9, r2, #0x4a
+    add r8, r2, #0x3e
+    add r3, r2, #0x6e
+    vld1.16 {d9[1]}, [r1, :16]
+    vld1.16 {d11[1]}, [r9, :16]
+    vld1.16 {d13[1]}, [r8, :16]
+    vld1.16 {d15[1]}, [r3, :16]
+    add r1, r2, #0x56
+    add r9, r2, #0x58
+    add r8, r2, #0x4e
+    add r3, r2, #0x7c
+    vld1.16 {d9[2]}, [r1, :16]
+    vld1.16 {d11[2]}, [r9, :16]
+    vld1.16 {d13[2]}, [r8, :16]
+    vld1.16 {d15[2]}, [r3, :16]
+    add r1, r2, #0x48
+    add r9, r2, #0x66
+    add r8, r2, #0x5c
+    add r3, r2, #0x7e
+    vld1.16 {d9[3]}, [r1, :16]
+    vld1.16 {d11[3]}, [r9, :16]
+    vld1.16 {d13[3]}, [r8, :16]
+    vld1.16 {d15[3]}, [r3, :16]
+    vcgt.s16 q8, q8, q4
+    vcgt.s16 q9, q9, q5
+    vcgt.s16 q10, q10, q6
+    vcgt.s16 q11, q11, q7
+    vabs.s16 q4, q4
+    vabs.s16 q5, q5
+    vabs.s16 q6, q6
+    vabs.s16 q7, q7
+    veor q8, q8, q4
+    veor q9, q9, q5
+    veor q10, q10, q6
+    veor q11, q11, q7
+    add r1, r4, #0x40
+    add r9, r4, #0x60
+    add r8, r4, #0xc0
+    add r3, r4, #0xe0
+    vclz.i16 q4, q4
+    vclz.i16 q5, q5
+    vclz.i16 q6, q6
+    vclz.i16 q7, q7
+    vsub.i16 q4, q14, q4
+    vsub.i16 q5, q14, q5
+    vsub.i16 q6, q14, q6
+    vsub.i16 q7, q14, q7
+    vst1.16 {d8, d9, d10, d11}, [r1, :256]
+    vst1.16 {d12, d13, d14, d15}, [r9, :256]
+    vshl.s16 q4, q15, q4
+    vshl.s16 q5, q15, q5
+    vshl.s16 q6, q15, q6
+    vshl.s16 q7, q15, q7
+    vsub.i16 q4, q4, q15
+    vsub.i16 q5, q5, q15
+    vsub.i16 q6, q6, q15
+    vsub.i16 q7, q7, q15
+    vand q8, q8, q4
+    vand q9, q9, q5
+    vand q10, q10, q6
+    vand q11, q11, q7
+    vst1.16 {d16, d17, d18, d19}, [r8, :256]
+    vst1.16 {d20, d21, d22, d23}, [r3, :256]
+    ldr r12, [r7, #0xc]         /* r12 = actbl */
+    add r1, lr, #0x400          /* r1 = dctbl->ehufsi */
+    mov r9, r12                 /* r9 = actbl */
+    add r6, r4, #0x80           /* r6 = t2 */
+    ldr r11, [r0, #0x8]         /* r11 = put_buffer */
+    ldr r4, [r0, #0xc]          /* r4 = put_bits */
+    ldrh r2, [r6, #-128]        /* r2 = nbits */
+    ldrh r3, [r6]               /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1 */
+    ldr r0, [lr, r2, lsl #2]
+    ldrb r5, [r1, r2]
+    put_bits r11, r4, r0, r5
+    checkbuf15 r10, r11, r4, r0, r5
+    put_bits r11, r4, r3, r2
+    checkbuf15 r10, r11, r4, r0, r5
+    add r5, r9, #0x400          /* r5 = actbl->ehufsi */
+    ldrsb r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
+    veor q8, q8, q8
+    vceq.i16 q0, q0, q8
+    vceq.i16 q1, q1, q8
+    vceq.i16 q2, q2, q8
+    vceq.i16 q3, q3, q8
+    vceq.i16 q4, q4, q8
+    vceq.i16 q5, q5, q8
+    vceq.i16 q6, q6, q8
+    vceq.i16 q7, q7, q8
+    vmovn.i16 d0, q0
+    vmovn.i16 d2, q1
+    vmovn.i16 d4, q2
+    vmovn.i16 d6, q3
+    vmovn.i16 d8, q4
+    vmovn.i16 d10, q5
+    vmovn.i16 d12, q6
+    vmovn.i16 d14, q7
+    vand d0, d0, d26
+    vand d2, d2, d26
+    vand d4, d4, d26
+    vand d6, d6, d26
+    vand d8, d8, d26
+    vand d10, d10, d26
+    vand d12, d12, d26
+    vand d14, d14, d26
+    vpadd.i8 d0, d0, d2
+    vpadd.i8 d4, d4, d6
+    vpadd.i8 d8, d8, d10
+    vpadd.i8 d12, d12, d14
+    vpadd.i8 d0, d0, d4
+    vpadd.i8 d8, d8, d12
+    vpadd.i8 d0, d0, d8
+    vmov.32 r1, d0[1]
+    vmov.32 r8, d0[0]
+    mvn r1, r1
+    mvn r8, r8
+    lsrs r1, r1, #0x1
+    rrx r8, r8                  /* shift in last r1 bit while shifting out DC bit */
+    rbit r1, r1                 /* r1 = index1 */
+    rbit r8, r8                 /* r8 = index0 */
+    ldr r0, [r9, #0x3c0]        /* r0 = actbl->ehufco[0xf0] */
+    str r1, [sp, #0x14]         /* index1 => sp + 0x14 */
+    cmp r8, #0x0
+    beq 6f
+1:
+    clz r2, r8
+    add lr, lr, r2, lsl #1
+    lsl r8, r8, r2
+    ldrh r1, [lr, #-126]
+2:
+    cmp r2, #0x10
+    blt 3f
+    sub r2, r2, #0x10
+    put_bits r11, r4, r0, r6
+    cmp r4, #0x10
+    blt 2b
+    eor r3, r3, r3
+    emit_byte r10, r11, r4, r3, r12
+    emit_byte r10, r11, r4, r3, r12
+    b 2b
+3:
+    add r2, r1, r2, lsl #4
+    ldrh r3, [lr, #2]!
+    ldr r12, [r9, r2, lsl #2]
+    ldrb r2, [r5, r2]
+    put_bits r11, r4, r12, r2
+    checkbuf15 r10, r11, r4, r2, r12
+    put_bits r11, r4, r3, r1
+    checkbuf15 r10, r11, r4, r2, r12
+    lsls r8, r8, #0x1
+    bne 1b
+6:
+    add r12, sp, #0x20          /* r12 = t1 */
+    ldr r8, [sp, #0x14]         /* r8 = index1 */
+    adds r12, #0xc0             /* r12 = t2 + (DCTSIZE2/2) */
+    cmp r8, #0x0
+    beq 6f
+    clz r2, r8
+    sub r12, r12, lr
+    lsl r8, r8, r2
+    add r2, r2, r12, lsr #1
+    add lr, lr, r2, lsl #1
+    b 7f
+1:
+    clz r2, r8
+    add lr, lr, r2, lsl #1
+    lsl r8, r8, r2
+7:
+    ldrh r1, [lr, #-126]
+2:
+    cmp r2, #0x10
+    blt 3f
+    sub r2, r2, #0x10
+    put_bits r11, r4, r0, r6
+    cmp r4, #0x10
+    blt 2b
+    eor r3, r3, r3
+    emit_byte r10, r11, r4, r3, r12
+    emit_byte r10, r11, r4, r3, r12
+    b 2b
+3:
+    add r2, r1, r2, lsl #4
+    ldrh r3, [lr, #2]!
+    ldr r12, [r9, r2, lsl #2]
+    ldrb r2, [r5, r2]
+    put_bits r11, r4, r12, r2
+    checkbuf15 r10, r11, r4, r2, r12
+    put_bits r11, r4, r3, r1
+    checkbuf15 r10, r11, r4, r2, r12
+    lsls r8, r8, #0x1
+    bne 1b
+6:
+    add r0, sp, #0x20
+    add r0, #0xfe
+    cmp lr, r0
+    bhs 1f
+    ldr r1, [r9]
+    ldrb r0, [r5]
+    put_bits r11, r4, r1, r0
+    checkbuf15 r10, r11, r4, r0, r1
+1:
+    ldr r12, [sp, #0x18]
+    str r11, [r12, #0x8]
+    str r4, [r12, #0xc]
+    add r0, r10, #0x1
+    add r4, sp, #0x140
+    vld1.64 {d8, d9, d10, d11}, [r4, :128]!
+    vld1.64 {d12, d13, d14, d15}, [r4, :128]
+    sub r4, r7, #0x1c
+    mov sp, r4
+    pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf15
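
Note: the put_bits/emit_byte/checkbuf* macros above mirror the bit-buffer
logic of the scalar encoder in jchuff.c. The following C model only
illustrates those semantics; the type and helper names (bit_writer,
put_code, flush_bytes) are invented for this note and appear nowhere in the
patch. Codes are accumulated MSB-first in a register-wide buffer, and whole
bytes are flushed with the 0x00 stuffing that the JPEG bitstream requires
after every 0xFF byte.

  #include <stdint.h>

  typedef struct {
    uint64_t put_buffer;  /* bit accumulator (32-bit on arm, 64-bit on arm64) */
    int put_bits;         /* number of valid bits in put_buffer */
    uint8_t *out;         /* next output byte */
  } bit_writer;

  /* put_bits CODE, SIZE */
  static void put_code(bit_writer *w, uint32_t code, int size)
  {
    w->put_buffer = (w->put_buffer << size) | code;
    w->put_bits += size;
  }

  /* checkbuf15/31/47 + emit_byte: the assembly flushes a fixed number of
   * bytes once a threshold is crossed; flushing every whole pending byte,
   * as done here, produces the same output stream. */
  static void flush_bytes(bit_writer *w, int threshold)
  {
    if (w->put_bits < threshold)
      return;
    while (w->put_bits >= 8) {
      w->put_bits -= 8;
      uint8_t b = (uint8_t)(w->put_buffer >> w->put_bits);
      *w->out++ = b;
      if (b == 0xFF)
        *w->out++ = 0;    /* JPEG byte stuffing */
    }
  }

Whatever is left in the accumulator when the function returns is written
back to the working state (the str x6/w7 and str r11/r4 stores above), so
the next block continues where this one stopped.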
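The zero/non-zero bitmap is built entirely in NEON: eight compares against
zero are narrowed to bytes, masked with the {0x01, 0x02, ..., 0x80} constant
(v20/d26 above), and folded by three rounds of pairwise adds into one bit
per coefficient. A rough intrinsics rendering of the arm64 sequence follows;
the function name is hypothetical, and the real code runs this on the
zigzag-reordered, sign-adjusted block rather than on raw coefficients.

  #include <arm_neon.h>
  #include <stdint.h>

  static uint64_t nonzero_bitmap(const int16_t coef[64])
  {
    /* lane j of bitmask holds 1 << j, matching the .byte table above */
    const uint8x8_t bitmask = vcreate_u8(0x8040201008040201ULL);
    uint8x8_t rows[8];
    for (int i = 0; i < 8; i++) {
      int16x8_t v = vld1q_s16(coef + 8 * i);
      /* 0xFF per lane where the coefficient is zero, narrowed to bytes */
      uint8x8_t zero = vmovn_u16(vceqq_s16(v, vdupq_n_s16(0)));
      rows[i] = vand_u8(zero, bitmask);
    }
    /* three addp rounds pack 64 bytes into 8 mask bytes */
    uint8x8_t p0 = vpadd_u8(rows[0], rows[1]);
    uint8x8_t p1 = vpadd_u8(rows[2], rows[3]);
    uint8x8_t p2 = vpadd_u8(rows[4], rows[5]);
    uint8x8_t p3 = vpadd_u8(rows[6], rows[7]);
    uint8x8_t q0 = vpadd_u8(p0, p1);
    uint8x8_t q1 = vpadd_u8(p2, p3);
    uint64_t zeros =
      vget_lane_u64(vreinterpret_u64_u8(vpadd_u8(q0, q1)), 0);
    return ~zeros;    /* set bit = non-zero coefficient (the mvn above) */
  }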
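With that bitmap in a general-purpose register, the AC loop never tests the
63 coefficients one by one: clz jumps straight to the next non-zero
coefficient, and runs of 16 or more zeros become ZRL (0xF0) symbols. A
sketch of the arm64 loop under the same caveats (the emit_* helpers are
placeholders for the put_bits sequences; nbits/bits stand for the t1/t2
scratch arrays on the stack):

  #include <stdint.h>

  extern void emit_zrl(void);                    /* actbl code for 0xF0 */
  extern void emit_coef(int run, int nbits, uint16_t value_bits);
  extern void emit_eob(void);                    /* actbl code for 0x00 */

  static void encode_ac(uint64_t index, const uint16_t *nbits,
                        const uint16_t *bits)
  {
    int pos = 0;                         /* AC position, 0..62 */
    while (index != 0) {
      int run = __builtin_clzll(index);  /* length of the zero run */
      index <<= run;
      pos += run;
      while (run >= 16) {                /* one ZRL per 16 skipped zeros */
        emit_zrl();
        run -= 16;
      }
      emit_coef(run, nbits[pos], bits[pos]);
      index <<= 1;                       /* consume the coefficient */
      pos++;
    }
    if (pos < 63)                        /* trailing zeros => end of block */
      emit_eob();
  }

The arm implementation does the same with two 32-bit halves (index0/index1),
using the lsrs/rrx pair to shift the DC bit out of the register pair before
the rbit.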