diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 12d2d678ff63a..14b585fb866fa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1064,7 +1064,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
 
   // Only ARMv6 has BSWAP.
   if (!Subtarget->hasV6Ops())
-    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i32, Custom);
 
   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                         : Subtarget->hasDivideInARMMode();
@@ -9508,6 +9508,42 @@ static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
   return false;
 }
 
+/// Custom lowering of i32 ISD::BSWAP for subtargets without REV (pre-ARMv6).
+///
+/// Emits generic nodes mirroring the four-instruction byte-reverse idiom:
+///   eor r1, r0, r0, ror #16
+///   bic r1, r1, #0xff0000
+///   mov r1, r1, lsr #8
+///   eor r0, r1, r0, ror #8
+///
+/// NOTE(review): the regenerated load-combine CHECK lines show mov+orr+and
+/// instead of a single bic, so the mask apparently does not stay in a
+/// BIC-encodable form after lowering -- worth confirming and following up.
+static SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) {
+  const SDLoc dl(Op);
+  const MVT VT = MVT::i32;
+  SDValue In = Op.getOperand(0);
+
+  // Writing In as bytes [b3 b2 b1 b0]:
+  //   Mix = In ^ ror(In, 16) = [b3^b1 b2^b0 b3^b1 b2^b0]
+  SDValue Sixteen = DAG.getConstant(16, dl, VT);
+  SDValue Swapped = DAG.getNode(ISD::ROTR, dl, VT, In, Sixteen);
+  SDValue Mix = DAG.getNode(ISD::XOR, dl, VT, In, Swapped);
+
+  // "bic #0x00ff0000" is expressed as an AND with the complement mask:
+  //   Cleared = [b3^b1 0 b3^b1 b2^b0]
+  SDValue KeepMask = DAG.getConstant(0xFF00FFFFu, dl, VT);
+  SDValue Cleared = DAG.getNode(ISD::AND, dl, VT, Mix, KeepMask);
+
+  // Fix = Cleared >> 8 = [0 b3^b1 0 b3^b1]
+  SDValue Eight = DAG.getConstant(8, dl, VT);
+  SDValue Fix = DAG.getNode(ISD::SRL, dl, VT, Cleared, Eight);
+
+  // ror(In, 8) = [b0 b3 b2 b1]; XOR with Fix gives [b0 b1 b2 b3].
+  SDValue Rot8 = DAG.getNode(ISD::ROTR, dl, VT, In, Eight);
+  return DAG.getNode(ISD::XOR, dl, VT, Fix, Rot8);
+}
+
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -10708,6 +10744,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UCMP: case ISD::SCMP: return LowerCMP(Op, DAG); + case ISD::BSWAP: + return LowerBSWAP(Op, DAG); } } diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt index def67cfae7277..15d4447f8c649 100644 --- a/llvm/lib/Target/ARM/README.txt +++ b/llvm/lib/Target/ARM/README.txt @@ -606,32 +606,6 @@ constant which was already loaded). Not sure what's necessary to do that. //===---------------------------------------------------------------------===// -The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal: - -int a(int x) { return __builtin_bswap32(x); } - -a: - mov r1, #255, 24 - mov r2, #255, 16 - and r1, r1, r0, lsr #8 - and r2, r2, r0, lsl #8 - orr r1, r1, r0, lsr #24 - orr r0, r2, r0, lsl #24 - orr r0, r0, r1 - bx lr - -Something like the following would be better (fewer instructions/registers): - eor r1, r0, r0, ror #16 - bic r1, r1, #0xff0000 - mov r1, r1, lsr #8 - eor r0, r1, r0, ror #8 - bx lr - -A custom Thumb version would also be a slight improvement over the generic -version. 
- -//===---------------------------------------------------------------------===// - Consider the following simple C code: void foo(unsigned char *a, unsigned char *b, int *c) { diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 4b6d14efd0ecb..9f0cdf96d91da 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -54,13 +54,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -224,20 +223,17 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 +; CHECK-NEXT: mov r3, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r3, r3, #-16777216 +; CHECK-NEXT: eor r2, r0, r0, ror #16 +; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r0, r2, r0, ror #8 +; CHECK-NEXT: eor r2, r1, r1, ror #16 +; CHECK-NEXT: 
and r2, r2, r3 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -378,13 +374,12 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: @@ -435,13 +430,12 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: @@ -588,13 +582,12 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; 
CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: @@ -667,14 +660,13 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 ; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: @@ -733,14 +725,13 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r1, r0 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 ; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 0f6ec8aa47386..9688bf7e99f4f 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -114,17 +114,15 @@ define i32 @load_i32_by_i8_aligned(ptr %arg) { ; ptr p; // p 
is 4 byte aligned ; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i8_bswap(ptr %arg) { -; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -238,20 +236,17 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 +; CHECK-NEXT: mov r3, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r3, r3, #-16777216 +; CHECK-NEXT: eor r2, r0, r0, ror #16 +; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r0, r2, r0, ror #8 +; CHECK-NEXT: eor r2, r1, r1, ror #16 +; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -414,13 +409,12 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, 
[r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: @@ -470,13 +464,12 @@ define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: @@ -528,13 +521,12 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov r2, #65280 +; CHECK-NEXT: orr r2, r2, #-16777216 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: and r1, r1, r2 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: