Skip to content

Commit 834b13a

Browse files
committed
[SelectionDAG] Optimize BSWAP expansion once more
1 parent fc95ffb commit 834b13a

File tree

4 files changed

+79
-104
lines changed

4 files changed

+79
-104
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9899,17 +9899,30 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
98999899
// Use a rotate by 8. This can be further expanded if necessary.
99009900
return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
99019901
case MVT::i32:
9902-
// This is meant for ARM speficially, which has ROTR but no ROTL.
9902+
// This is meant for ARM specifically, which has ROTR but no ROTL.
99039903
if (isOperationLegalOrCustom(ISD::ROTR, VT)) {
9904-
SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT);
9905-
// (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF
9906-
SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask);
9907-
SDValue Rotr =
9908-
DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT));
9909-
SDValue Rotl =
9910-
DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
9911-
SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask);
9912-
return DAG.getNode(ISD::OR, dl, VT, Rotr, And2);
9904+
// ror rtmp, r0, #16
9905+
SDValue Ror16 =
9906+
DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(16, dl, SHVT));
9907+
// eor r1, r0, rtmp ; r1 = r0 ^ (r0 ror 16)
9908+
SDValue Xor1 = DAG.getNode(ISD::XOR, dl, VT, Op, Ror16);
9909+
9910+
// bic r1, r1, #0xff0000 (clear bits 16-23)
9911+
// BIC r1, r1, #0xff0000 becomes AND r1, r1, ~0x00ff0000
9912+
// So we need the negated value: ~0x00FF0000 = 0xFF00FFFF
9913+
SDValue Mask = DAG.getConstant(0xFF00FFFFu, dl, VT);
9914+
SDValue BicResult = DAG.getNode(ISD::AND, dl, VT, Xor1, Mask);
9915+
9916+
// mov r1, r1, lsr #8
9917+
SDValue Lsr8 = DAG.getNode(ISD::SRL, dl, VT, BicResult,
9918+
DAG.getConstant(8, dl, SHVT));
9919+
9920+
// ror r0, r0, #8
9921+
SDValue Ror8 =
9922+
DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
9923+
9924+
// eor r0, r1, r0 ; r0 = Lsr8 ^ Ror8 (the byte-swapped result)
9925+
return DAG.getNode(ISD::XOR, dl, VT, Lsr8, Ror8);
99139926
}
99149927
Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
99159928
Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op,

llvm/lib/Target/ARM/README.txt

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -606,32 +606,6 @@ constant which was already loaded). Not sure what's necessary to do that.
606606

607607
//===---------------------------------------------------------------------===//
608608

609-
The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
610-
611-
int a(int x) { return __builtin_bswap32(x); }
612-
613-
a:
614-
mov r1, #255, 24
615-
mov r2, #255, 16
616-
and r1, r1, r0, lsr #8
617-
and r2, r2, r0, lsl #8
618-
orr r1, r1, r0, lsr #24
619-
orr r0, r2, r0, lsl #24
620-
orr r0, r0, r1
621-
bx lr
622-
623-
Something like the following would be better (fewer instructions/registers):
624-
eor r1, r0, r0, ror #16
625-
bic r1, r1, #0xff0000
626-
mov r1, r1, lsr #8
627-
eor r0, r1, r0, ror #8
628-
bx lr
629-
630-
A custom Thumb version would also be a slight improvement over the generic
631-
version.
632-
633-
//===---------------------------------------------------------------------===//
634-
635609
Consider the following simple C code:
636610

637611
void foo(unsigned char *a, unsigned char *b, int *c) {

llvm/test/CodeGen/ARM/load-combine-big-endian.ll

Lines changed: 32 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,13 @@ define i32 @load_i32_by_i8_big_endian(ptr %arg) {
5050
; ptr p; // p is 4 byte aligned
5151
; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
5252
define i32 @load_i32_by_i8_bswap(ptr %arg) {
53-
; BSWAP is not supported by 32 bit target
5453
; CHECK-LABEL: load_i32_by_i8_bswap:
5554
; CHECK: @ %bb.0:
56-
; CHECK-NEXT: mov r1, #255
5755
; CHECK-NEXT: ldr r0, [r0]
58-
; CHECK-NEXT: orr r1, r1, #16711680
59-
; CHECK-NEXT: and r2, r0, r1
60-
; CHECK-NEXT: and r0, r1, r0, ror #24
61-
; CHECK-NEXT: orr r0, r0, r2, ror #8
56+
; CHECK-NEXT: eor r1, r0, r0, ror #16
57+
; CHECK-NEXT: bic r1, r1, #16711680
58+
; CHECK-NEXT: lsr r1, r1, #8
59+
; CHECK-NEXT: eor r0, r1, r0, ror #8
6260
; CHECK-NEXT: mov pc, lr
6361
;
6462
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -221,16 +219,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) {
221219
define i64 @load_i64_by_i8_bswap(ptr %arg) {
222220
; CHECK-LABEL: load_i64_by_i8_bswap:
223221
; CHECK: @ %bb.0:
224-
; CHECK-NEXT: mov r2, #255
225222
; CHECK-NEXT: ldr r1, [r0]
226223
; CHECK-NEXT: ldr r0, [r0, #4]
227-
; CHECK-NEXT: orr r2, r2, #16711680
228-
; CHECK-NEXT: and r3, r0, r2
229-
; CHECK-NEXT: and r0, r2, r0, ror #24
230-
; CHECK-NEXT: orr r0, r0, r3, ror #8
231-
; CHECK-NEXT: and r3, r1, r2
232-
; CHECK-NEXT: and r1, r2, r1, ror #24
233-
; CHECK-NEXT: orr r1, r1, r3, ror #8
224+
; CHECK-NEXT: eor r2, r0, r0, ror #16
225+
; CHECK-NEXT: bic r2, r2, #16711680
226+
; CHECK-NEXT: lsr r2, r2, #8
227+
; CHECK-NEXT: eor r0, r2, r0, ror #8
228+
; CHECK-NEXT: eor r2, r1, r1, ror #16
229+
; CHECK-NEXT: bic r2, r2, #16711680
230+
; CHECK-NEXT: lsr r2, r2, #8
231+
; CHECK-NEXT: eor r1, r2, r1, ror #8
234232
; CHECK-NEXT: mov pc, lr
235233
;
236234
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -370,12 +368,11 @@ define i64 @load_i64_by_i8(ptr %arg) {
370368
define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
371369
; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
372370
; CHECK: @ %bb.0:
373-
; CHECK-NEXT: mov r1, #255
374371
; CHECK-NEXT: ldr r0, [r0, #1]
375-
; CHECK-NEXT: orr r1, r1, #16711680
376-
; CHECK-NEXT: and r2, r0, r1
377-
; CHECK-NEXT: and r0, r1, r0, ror #24
378-
; CHECK-NEXT: orr r0, r0, r2, ror #8
372+
; CHECK-NEXT: eor r1, r0, r0, ror #16
373+
; CHECK-NEXT: bic r1, r1, #16711680
374+
; CHECK-NEXT: lsr r1, r1, #8
375+
; CHECK-NEXT: eor r0, r1, r0, ror #8
379376
; CHECK-NEXT: mov pc, lr
380377
;
381378
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
@@ -425,12 +422,11 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
425422
define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
426423
; CHECK-LABEL: load_i32_by_i8_neg_offset:
427424
; CHECK: @ %bb.0:
428-
; CHECK-NEXT: mov r1, #255
429425
; CHECK-NEXT: ldr r0, [r0, #-4]
430-
; CHECK-NEXT: orr r1, r1, #16711680
431-
; CHECK-NEXT: and r2, r0, r1
432-
; CHECK-NEXT: and r0, r1, r0, ror #24
433-
; CHECK-NEXT: orr r0, r0, r2, ror #8
426+
; CHECK-NEXT: eor r1, r0, r0, ror #16
427+
; CHECK-NEXT: bic r1, r1, #16711680
428+
; CHECK-NEXT: lsr r1, r1, #8
429+
; CHECK-NEXT: eor r0, r1, r0, ror #8
434430
; CHECK-NEXT: mov pc, lr
435431
;
436432
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
@@ -576,12 +572,11 @@ declare i16 @llvm.bswap.i16(i16)
576572
define i32 @load_i32_by_bswap_i16(ptr %arg) {
577573
; CHECK-LABEL: load_i32_by_bswap_i16:
578574
; CHECK: @ %bb.0:
579-
; CHECK-NEXT: mov r1, #255
580575
; CHECK-NEXT: ldr r0, [r0]
581-
; CHECK-NEXT: orr r1, r1, #16711680
582-
; CHECK-NEXT: and r2, r0, r1
583-
; CHECK-NEXT: and r0, r1, r0, ror #24
584-
; CHECK-NEXT: orr r0, r0, r2, ror #8
576+
; CHECK-NEXT: eor r1, r0, r0, ror #16
577+
; CHECK-NEXT: bic r1, r1, #16711680
578+
; CHECK-NEXT: lsr r1, r1, #8
579+
; CHECK-NEXT: eor r0, r1, r0, ror #8
585580
; CHECK-NEXT: mov pc, lr
586581
;
587582
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
@@ -654,12 +649,11 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) {
654649
; CHECK-LABEL: load_i32_by_i8_base_offset_index:
655650
; CHECK: @ %bb.0:
656651
; CHECK-NEXT: add r0, r0, r1
657-
; CHECK-NEXT: mov r1, #255
658-
; CHECK-NEXT: orr r1, r1, #16711680
659652
; CHECK-NEXT: ldr r0, [r0, #12]
660-
; CHECK-NEXT: and r2, r0, r1
661-
; CHECK-NEXT: and r0, r1, r0, ror #24
662-
; CHECK-NEXT: orr r0, r0, r2, ror #8
653+
; CHECK-NEXT: eor r1, r0, r0, ror #16
654+
; CHECK-NEXT: bic r1, r1, #16711680
655+
; CHECK-NEXT: lsr r1, r1, #8
656+
; CHECK-NEXT: eor r0, r1, r0, ror #8
663657
; CHECK-NEXT: mov pc, lr
664658
;
665659
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index:
@@ -718,12 +712,11 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) {
718712
; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
719713
; CHECK: @ %bb.0:
720714
; CHECK-NEXT: add r0, r1, r0
721-
; CHECK-NEXT: mov r1, #255
722-
; CHECK-NEXT: orr r1, r1, #16711680
723715
; CHECK-NEXT: ldr r0, [r0, #13]
724-
; CHECK-NEXT: and r2, r0, r1
725-
; CHECK-NEXT: and r0, r1, r0, ror #24
726-
; CHECK-NEXT: orr r0, r0, r2, ror #8
716+
; CHECK-NEXT: eor r1, r0, r0, ror #16
717+
; CHECK-NEXT: bic r1, r1, #16711680
718+
; CHECK-NEXT: lsr r1, r1, #8
719+
; CHECK-NEXT: eor r0, r1, r0, ror #8
727720
; CHECK-NEXT: mov pc, lr
728721
;
729722
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:

llvm/test/CodeGen/ARM/load-combine.ll

Lines changed: 24 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -114,15 +114,13 @@ define i32 @load_i32_by_i8_aligned(ptr %arg) {
114114
; ptr p; // p is 4 byte aligned
115115
; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
116116
define i32 @load_i32_by_i8_bswap(ptr %arg) {
117-
; BSWAP is not supported by 32 bit target
118117
; CHECK-LABEL: load_i32_by_i8_bswap:
119118
; CHECK: @ %bb.0:
120-
; CHECK-NEXT: mov r1, #255
121119
; CHECK-NEXT: ldr r0, [r0]
122-
; CHECK-NEXT: orr r1, r1, #16711680
123-
; CHECK-NEXT: and r2, r0, r1
124-
; CHECK-NEXT: and r0, r1, r0, ror #24
125-
; CHECK-NEXT: orr r0, r0, r2, ror #8
120+
; CHECK-NEXT: eor r1, r0, r0, ror #16
121+
; CHECK-NEXT: bic r1, r1, #16711680
122+
; CHECK-NEXT: lsr r1, r1, #8
123+
; CHECK-NEXT: eor r0, r1, r0, ror #8
126124
; CHECK-NEXT: mov pc, lr
127125
;
128126
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -235,16 +233,16 @@ define i64 @load_i64_by_i8(ptr %arg) {
235233
define i64 @load_i64_by_i8_bswap(ptr %arg) {
236234
; CHECK-LABEL: load_i64_by_i8_bswap:
237235
; CHECK: @ %bb.0:
238-
; CHECK-NEXT: mov r2, #255
239236
; CHECK-NEXT: ldr r1, [r0]
240237
; CHECK-NEXT: ldr r0, [r0, #4]
241-
; CHECK-NEXT: orr r2, r2, #16711680
242-
; CHECK-NEXT: and r3, r0, r2
243-
; CHECK-NEXT: and r0, r2, r0, ror #24
244-
; CHECK-NEXT: orr r0, r0, r3, ror #8
245-
; CHECK-NEXT: and r3, r1, r2
246-
; CHECK-NEXT: and r1, r2, r1, ror #24
247-
; CHECK-NEXT: orr r1, r1, r3, ror #8
238+
; CHECK-NEXT: eor r2, r0, r0, ror #16
239+
; CHECK-NEXT: bic r2, r2, #16711680
240+
; CHECK-NEXT: lsr r2, r2, #8
241+
; CHECK-NEXT: eor r0, r2, r0, ror #8
242+
; CHECK-NEXT: eor r2, r1, r1, ror #16
243+
; CHECK-NEXT: bic r2, r2, #16711680
244+
; CHECK-NEXT: lsr r2, r2, #8
245+
; CHECK-NEXT: eor r1, r2, r1, ror #8
248246
; CHECK-NEXT: mov pc, lr
249247
;
250248
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -406,12 +404,11 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
406404
define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
407405
; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
408406
; CHECK: @ %bb.0:
409-
; CHECK-NEXT: mov r1, #255
410407
; CHECK-NEXT: ldr r0, [r0, #1]
411-
; CHECK-NEXT: orr r1, r1, #16711680
412-
; CHECK-NEXT: and r2, r0, r1
413-
; CHECK-NEXT: and r0, r1, r0, ror #24
414-
; CHECK-NEXT: orr r0, r0, r2, ror #8
408+
; CHECK-NEXT: eor r1, r0, r0, ror #16
409+
; CHECK-NEXT: bic r1, r1, #16711680
410+
; CHECK-NEXT: lsr r1, r1, #8
411+
; CHECK-NEXT: eor r0, r1, r0, ror #8
415412
; CHECK-NEXT: mov pc, lr
416413
;
417414
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
@@ -460,12 +457,11 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
460457
define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) {
461458
; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
462459
; CHECK: @ %bb.0:
463-
; CHECK-NEXT: mov r1, #255
464460
; CHECK-NEXT: ldr r0, [r0, #-4]
465-
; CHECK-NEXT: orr r1, r1, #16711680
466-
; CHECK-NEXT: and r2, r0, r1
467-
; CHECK-NEXT: and r0, r1, r0, ror #24
468-
; CHECK-NEXT: orr r0, r0, r2, ror #8
461+
; CHECK-NEXT: eor r1, r0, r0, ror #16
462+
; CHECK-NEXT: bic r1, r1, #16711680
463+
; CHECK-NEXT: lsr r1, r1, #8
464+
; CHECK-NEXT: eor r0, r1, r0, ror #8
469465
; CHECK-NEXT: mov pc, lr
470466
;
471467
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
@@ -516,12 +512,11 @@ declare i16 @llvm.bswap.i16(i16)
516512
define i32 @load_i32_by_bswap_i16(ptr %arg) {
517513
; CHECK-LABEL: load_i32_by_bswap_i16:
518514
; CHECK: @ %bb.0:
519-
; CHECK-NEXT: mov r1, #255
520515
; CHECK-NEXT: ldr r0, [r0]
521-
; CHECK-NEXT: orr r1, r1, #16711680
522-
; CHECK-NEXT: and r2, r0, r1
523-
; CHECK-NEXT: and r0, r1, r0, ror #24
524-
; CHECK-NEXT: orr r0, r0, r2, ror #8
516+
; CHECK-NEXT: eor r1, r0, r0, ror #16
517+
; CHECK-NEXT: bic r1, r1, #16711680
518+
; CHECK-NEXT: lsr r1, r1, #8
519+
; CHECK-NEXT: eor r0, r1, r0, ror #8
525520
; CHECK-NEXT: mov pc, lr
526521
;
527522
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:

0 commit comments

Comments
 (0)