Skip to content

Commit

Permalink
builtins: Add ARM Thumb1 implementation for uidiv and uidivmod
Browse files Browse the repository at this point in the history
Summary:
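The current uidiv already supports architectures without CLZ, but its assembly is written for Thumb-2/ARM only; this change adds a Thumb-1 implementation.
For uidivmod, the existing code calls the C version of udivmodsi4, which in turn calls uidiv. The extra push/pop/bl makes it less efficient, so uidivmod is now implemented directly in assembly for Thumb-1.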

Reviewers: jmolloy, jroelofs, joerg, compnerd, rengolin

Subscribers: llvm-commits, aemerson

Differential Revision: https://reviews.llvm.org/D27309

llvm-svn: 288710
  • Loading branch information
Weiming Zhao committed Dec 5, 2016
1 parent 6ad7b9f commit adf4258
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 23 deletions.
15 changes: 15 additions & 0 deletions compiler-rt/lib/builtins/arm/aeabi_uidivmod.S
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@
.syntax unified
.p2align 2
DEFINE_COMPILERRT_FUNCTION(__aeabi_uidivmod)
#if __ARM_ARCH_ISA_THUMB == 1
// Thumb-1 implementation.
//   In:  r0 = numerator, r1 = denominator (both unsigned)
//   Out: r0 = quotient,  r1 = remainder
// The quotient comes from __aeabi_uidiv; the remainder is then rebuilt as
// numerator - quotient * denominator.
cmp r0, r1
bcc LOCAL_LABEL(case_denom_larger) // numerator < denominator: no division needed
// Save numerator, denominator and the return address across the call.
// NOTE(review): this pushes 12 bytes, so sp is not 8-byte aligned during the
// call below -- confirm that is acceptable for every callee path.
push {r0, r1, lr}
bl SYMBOL_NAME(__aeabi_uidiv) // r0 = quotient
pop {r1, r2, r3} // r1 = numerator, r2 = denominator, r3 = return address
// Thumb-1 only encodes "MULS Rdm, Rn, Rdm": the last operand must be the
// destination register, hence r2 appears first and last.
muls r2, r0, r2 // r2 = quot * denom
subs r1, r1, r2 // r1 = numerator - quot * denom = remainder
JMP (r3)
LOCAL_LABEL(case_denom_larger):
// numerator < denominator: quotient is 0 and the remainder is the numerator.
movs r1, r0
movs r0, #0
JMP (lr)
#else
push { lr }
sub sp, sp, #4
mov r2, sp
Expand All @@ -35,6 +49,7 @@ DEFINE_COMPILERRT_FUNCTION(__aeabi_uidivmod)
ldr r1, [sp]
add sp, sp, #4
pop { pc }
#endif
END_COMPILERRT_FUNCTION(__aeabi_uidivmod)

NO_EXEC_STACK_DIRECTIVE
Expand Down
129 changes: 107 additions & 22 deletions compiler-rt/lib/builtins/arm/udivsi3.S
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,26 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
#else
/*
 * Early exits before the long-division code.
 * r0 = numerator, r1 = denominator; the result is returned in r0.
 */
/* Unsigned r1 < 1 can only mean r1 == 0: division by zero. */
cmp r1, #1
bcc LOCAL_LABEL(divby0)
#if __ARM_ARCH_ISA_THUMB == 1
/*
 * Flags still come from "cmp r1, #1". Despite its name, num_neq_denom is
 * reached when the denominator is not 1; otherwise r0 / 1 == r0 is already
 * in place and we return right away.
 */
bne LOCAL_LABEL(num_neq_denom)
JMP(lr)
LOCAL_LABEL(num_neq_denom):
#else
/* Denominator == 1: r0 already holds the quotient, return conditionally. */
IT(eq)
JMPc(lr, eq)
#endif
/* Numerator < denominator (unsigned): the quotient is 0. */
cmp r0, r1
#if __ARM_ARCH_ISA_THUMB == 1
/* Thumb-1 path: branch around the "return 0" sequence. */
bhs LOCAL_LABEL(num_ge_denom)
movs r0, #0
JMP(lr)
LOCAL_LABEL(num_ge_denom):
#else
/* Conditionally executed "return 0". */
ITT(cc)
movcc r0, #0
JMPc(lr, cc)
#endif

/*
* Implement division using binary long division algorithm.
*
Expand All @@ -62,7 +76,7 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
* that (r0 << shift) < 2 * r1. The quotient is stored in r3.
*/

# ifdef __ARM_FEATURE_CLZ
# if defined(__ARM_FEATURE_CLZ)
clz ip, r0
clz r3, r1
/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
Expand All @@ -77,49 +91,128 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
sub ip, ip, r3, lsl #3
mov r3, #0
bx ip
# else
# else /* No CLZ Feature */
# if __ARM_ARCH_ISA_THUMB == 2
# error THUMB mode requires CLZ or UDIV
# endif
/*
 * BLOCK_SIZE is the size in bytes of one expansion of the block(shift) macro.
 * The code below subtracts multiples of it from the address of div0block to
 * pick the entry point into the chain of blocks, so it must match the macro
 * exactly.
 *   Thumb-1: 10 bytes per block (31 blocks = 310 bytes).
 *   Otherwise: 12 bytes per block -- presumably three 4-byte ARM instructions
 *   (cmp/addhs/subhs) with ITT() emitting nothing; confirm against assembly.h.
 */
# if __ARM_ARCH_ISA_THUMB == 1
# define BLOCK_SIZE 10
# else
# define BLOCK_SIZE 12
# endif

mov r2, r0
# if __ARM_ARCH_ISA_THUMB == 1
mov ip, r0
adr r0, LOCAL_LABEL(div0block)
adds r0, #1
# else
adr ip, LOCAL_LABEL(div0block)

lsr r3, r2, #16
# endif
lsrs r3, r2, #16
cmp r3, r1
# if __ARM_ARCH_ISA_THUMB == 1
blo LOCAL_LABEL(skip_16)
movs r2, r3
subs r0, r0, #(16 * BLOCK_SIZE)
LOCAL_LABEL(skip_16):
# else
movhs r2, r3
subhs ip, ip, #(16 * 12)
subhs ip, ip, #(16 * BLOCK_SIZE)
# endif

lsr r3, r2, #8
lsrs r3, r2, #8
cmp r3, r1
# if __ARM_ARCH_ISA_THUMB == 1
blo LOCAL_LABEL(skip_8)
movs r2, r3
subs r0, r0, #(8 * BLOCK_SIZE)
LOCAL_LABEL(skip_8):
# else
movhs r2, r3
subhs ip, ip, #(8 * 12)
subhs ip, ip, #(8 * BLOCK_SIZE)
# endif

lsr r3, r2, #4
lsrs r3, r2, #4
cmp r3, r1
# if __ARM_ARCH_ISA_THUMB == 1
blo LOCAL_LABEL(skip_4)
movs r2, r3
subs r0, r0, #(4 * BLOCK_SIZE)
LOCAL_LABEL(skip_4):
# else
movhs r2, r3
subhs ip, #(4 * 12)
subhs ip, #(4 * BLOCK_SIZE)
# endif

lsr r3, r2, #2
lsrs r3, r2, #2
cmp r3, r1
# if __ARM_ARCH_ISA_THUMB == 1
blo LOCAL_LABEL(skip_2)
movs r2, r3
subs r0, r0, #(2 * BLOCK_SIZE)
LOCAL_LABEL(skip_2):
# else
movhs r2, r3
subhs ip, ip, #(2 * 12)
subhs ip, ip, #(2 * BLOCK_SIZE)
# endif

/* Last block, no need to update r2 or r3. */
# if __ARM_ARCH_ISA_THUMB == 1
lsrs r3, r2, #1
cmp r3, r1
blo LOCAL_LABEL(skip_1)
subs r0, r0, #(1 * BLOCK_SIZE)
LOCAL_LABEL(skip_1):
movs r2, r0
mov r0, ip
movs r3, #0
JMP (r2)

# else
cmp r1, r2, lsr #1
subls ip, ip, #(1 * 12)
subls ip, ip, #(1 * BLOCK_SIZE)

mov r3, #0
movs r3, #0

JMP(ip)
# endif
# endif
# endif /* __ARM_FEATURE_CLZ */


#define IMM #
/*
 * Division-by-zero handler: sets the quotient to 0 and, on EABI targets,
 * calls __aeabi_idiv0 before returning to the caller of the division routine.
 *
 * Due to the limited range of conditional branches in Thumb-1, this handler
 * is placed here, close to the branch that targets it.
 */
LOCAL_LABEL(divby0):
movs r0, #0
# if defined(__ARM_EABI__)
/*
 * bl overwrites lr, so the caller's return address must be saved first;
 * returning with JMP(lr) after the call would branch back to itself forever.
 * r7 is pushed only to keep the stack adjustment a multiple of 8 bytes.
 */
push {r7, lr}
bl __aeabi_idiv0 // due to relocation limit, can't use b.
pop {r7, pc}
# else
JMP(lr)
# endif


/*
 * block(shift): one step of the binary long division.
 *   r0 = running remainder, r1 = denominator, r3 = quotient being built.
 * The code that selects the entry point relies on every expansion having
 * exactly the same size, so do not change the number of instructions here.
 */
#if __ARM_ARCH_ISA_THUMB == 1
/*
 * Thumb-1 variant: r2 = r1 << shift; if r0 >= r2 then r0 -= r2; the carry
 * flag is then shifted into the bottom of r3. r2 is used as scratch.
 */
#define block(shift) \
lsls r2, r1, IMM shift; \
cmp r0, r2; \
blo LOCAL_LABEL(block_skip_##shift); \
subs r0, r0, r2; \
LOCAL_LABEL(block_skip_##shift) :; \
adcs r3, r3 /* same as ((r3 << 1) | Carry). Carry is set if r0 >= r2. */

/* TODO: if the current location counter is not word-aligned, we don't
need the .p2align and nop */
/* Label div0block must be word-aligned. Since 31 blocks take 310 bytes
(2 mod 4), block 31 has to start 2 bytes past a word boundary. */
.p2align 2
nop /* Padding to align div0block as 31 blocks = 310 bytes */

#else
/*
 * Non-Thumb-1 variant: if r0 >= (r1 << shift), set bit `shift` of the
 * quotient in r3 and subtract (r1 << shift) from r0.
 */
#define block(shift) \
cmp r0, r1, lsl IMM shift; \
ITT(hs); \
WIDE(addhs) r3, r3, IMM (1 << shift); \
WIDE(subhs) r0, r0, r1, lsl IMM shift
#endif

block(31)
block(30)
Expand Down Expand Up @@ -159,14 +252,6 @@ LOCAL_LABEL(div0block):
JMP(lr)
#endif /* __ARM_ARCH_EXT_IDIV__ */

LOCAL_LABEL(divby0):
mov r0, #0
#ifdef __ARM_EABI__
b __aeabi_idiv0
#else
JMP(lr)
#endif

END_COMPILERRT_FUNCTION(__udivsi3)

NO_EXEC_STACK_DIRECTIVE
Expand Down
3 changes: 2 additions & 1 deletion compiler-rt/lib/builtins/assembly.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@
#define ARM_HAS_BX
#endif
#if !defined(__ARM_FEATURE_CLZ) && \
(__ARM_ARCH >= 6 || (__ARM_ARCH == 5 && !defined(__ARM_ARCH_5__)))
((__ARM_ARCH >= 6 && __ARM_ARCH_PROFILE != 'M') || \
(__ARM_ARCH == 5 && !defined(__ARM_ARCH_5__)))
#define __ARM_FEATURE_CLZ
#endif

Expand Down

0 comments on commit adf4258

Please sign in to comment.