From 97ee721b22544879f84c3467ffcaff861f84b467 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Thu, 23 Oct 2025 12:54:48 -0400 Subject: [PATCH 1/2] [DAGCombine] Improve bswap lowering for machines that support bit rotates Source: Hacker's delight. --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 12 +++ .../CodeGen/ARM/load-combine-big-endian.ll | 93 ++++++++----------- llvm/test/CodeGen/ARM/load-combine.ll | 69 ++++++-------- 3 files changed, 78 insertions(+), 96 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 920dff935daed..748261709c47f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9899,6 +9899,18 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { // Use a rotate by 8. This can be further expanded if necessary. return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); case MVT::i32: + // This is meant for ARM speficially, which has ROTR but no ROTL. + if (isOperationLegal(ISD::ROTR, VT)) { + SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT); + // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF + SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask); + SDValue Rotr = + DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT)); + SDValue Rotl = + DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); + SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask); + return DAG.getNode(ISD::OR, dl, VT, Rotr, And2); + } Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op, DAG.getConstant(0xFF00, dl, VT)); diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 4b6d14efd0ecb..1d5c8589429a4 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -53,14 +53,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -223,21 +221,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r2, r2, #16711680 +; CHECK-NEXT: and r3, r0, r2 +; CHECK-NEXT: and r0, r2, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r3, ror #8 +; CHECK-NEXT: and r3, r1, r2 +; CHECK-NEXT: and r1, r2, r1, ror #24 +; CHECK-NEXT: orr r1, r1, r3, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -377,14 +370,12 @@ define i64 @load_i64_by_i8(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: @@ -434,14 +425,12 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: @@ -587,14 +576,12 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: @@ -667,14 +654,12 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: @@ -733,14 +718,12 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r1, r0 -; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 0f6ec8aa47386..70873672d6523 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -117,14 +117,12 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -237,21 +235,16 @@ define i64 @load_i64_by_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: mov r12, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r3, r12, r0, lsr #8 -; CHECK-NEXT: orr r3, r3, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: and r2, r12, r1, lsr #8 -; CHECK-NEXT: orr r0, r0, r3 -; CHECK-NEXT: and r3, r1, #65280 -; CHECK-NEXT: orr r2, r2, r1, lsr #24 -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r1, r1, r3, lsl #8 -; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: orr r2, r2, #16711680 +; CHECK-NEXT: and r3, r0, r2 +; CHECK-NEXT: and r0, r2, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r3, ror #8 +; CHECK-NEXT: and r3, r1, r2 +; CHECK-NEXT: and r1, r2, r1, ror #24 +; CHECK-NEXT: orr r1, r1, r3, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -413,14 +406,12 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: @@ -469,14 +460,12 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: @@ -527,14 +516,12 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: +; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: and r2, r0, #65280 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r2, lsl #8 -; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: orr r1, r1, #16711680 +; CHECK-NEXT: and r2, r0, r1 +; CHECK-NEXT: and r0, r1, r0, ror #24 +; CHECK-NEXT: orr r0, r0, r2, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: From f754e72f83b7d732b36efdbcaed4a8513ef71c2c Mon Sep 17 00:00:00 2001 From: AZero13 Date: Sat, 25 Oct 2025 11:59:46 -0400 Subject: [PATCH 2/2] LegalOrCustom --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 748261709c47f..da4e40953b39a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9900,7 +9900,7 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); case MVT::i32: // This is meant for ARM speficially, which has ROTR but no ROTL. - if (isOperationLegal(ISD::ROTR, VT)) { + if (isOperationLegalOrCustom(ISD::ROTR, VT)) { SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT); // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask);