[GlobalISel] Add srem-by-pow2 combine to intrem_combines #194673

AlexMaclean wants to merge 3 commits into llvm:main
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: Alex MacLean (AlexMaclean)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/194673.diff

5 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 97f29015c6911..9d8d44f5b5dde 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -752,6 +752,10 @@ class CombinerHelper {
/// return expressions that implements it by shifting.
void applyUDivByPow2(MachineInstr &MI) const;
+ /// Combine G_SREM x, (+/-2^k) to a bias-and-mask sequence.
+ bool matchAbsPow2(Register Reg) const;
+ void applySimplifySRemByPow2(MachineInstr &MI) const;
+
// G_UMULH x, (1 << c)) -> x >> (bitwidth - c)
bool matchUMulHToLShr(MachineInstr &MI) const;
void applyUMulHToLShr(MachineInstr &MI) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index d4ef5722f44c5..3393769f215c9 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1338,7 +1338,14 @@ def srem_by_const : GICombineRule<
[{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
(apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
-def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>;
+def srem_pow2_to_mask : GICombineRule<
+ (defs root:$root),
+ (match (G_SREM $dst, $x, $y):$root,
+ [{ return Helper.matchAbsPow2(${y}.getReg()); }]),
+ (apply [{ Helper.applySimplifySRemByPow2(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[srem_pow2_to_mask, urem_by_const,
+ srem_by_const]>;
def reassoc_ptradd : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 95b7f864c16fd..f2612191aec47 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6103,6 +6103,58 @@ void CombinerHelper::applyUDivByPow2(MachineInstr &MI) const {
MI.eraseFromParent();
}
+/// \p Reg is known to be +/- a power of 2.
+bool CombinerHelper::matchAbsPow2(Register Reg) const {
+ // Known bits only work for positive powers of 2.
+ if (isKnownToBeAPowerOfTwo(Reg, MRI, VT))
+ return true;
+
+ // Otherwise accept any constant whose absolute value is a power of 2.
+ auto MatchAbsPow2 = [](const Constant *C) {
+ auto *CI = dyn_cast<ConstantInt>(C);
+ return CI && (CI->getValue().isPowerOf2() ||
+ CI->getValue().isNegatedPowerOf2());
+ };
+ return matchUnaryPredicate(MRI, Reg, MatchAbsPow2, /*AllowUndefs=*/false);
+}
+
+void CombinerHelper::applySimplifySRemByPow2(MachineInstr &MI) const {
+ assert(MI.getOpcode() == TargetOpcode::G_SREM && "Expected SREM");
+ auto &SRem = cast<GenericMachineInstr>(MI);
+ Register Dst = SRem.getReg(0);
+ Register LHS = SRem.getReg(1);
+ Register RHS = SRem.getReg(2);
+ LLT Ty = MRI.getType(Dst);
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+
+ // Effectively we want to lower G_SREM %lhs, %rhs, where %rhs is +/- a power
+ // of 2, to the following branch-free bias-and-mask version:
+ //
+ // %abs = G_ABS %rhs
+ // %mask = G_SUB %abs, 1
+ // %sign = G_ASHR %lhs, $(bitwidth - 1)
+ // %bias = G_AND %sign, %mask
+ // %biased = G_ADD %lhs, %bias
+ // %masked = G_AND %biased, %mask
+ // %res = G_SUB %masked, %bias
+ //
+ // The bias adds (|%rhs| - 1) for negative %lhs, correcting rounding towards
+ // zero (instead of towards -inf that a plain mask would give). Constant
+ // divisors collapse %mask to a single G_CONSTANT via the CSEMIRBuilder folds
+ // for G_ABS and G_SUB.
+
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+ auto AbsRHS = Builder.buildAbs(Ty, RHS);
+ auto Mask = Builder.buildSub(Ty, AbsRHS, Builder.buildConstant(Ty, 1));
+ auto BWMinusOne = Builder.buildConstant(ShiftAmtTy, BitWidth - 1);
+ auto Sign = Builder.buildAShr(Ty, LHS, BWMinusOne);
+ auto Bias = Builder.buildAnd(Ty, Sign, Mask);
+ auto Biased = Builder.buildAdd(Ty, LHS, Bias);
+ auto Masked = Builder.buildAnd(Ty, Biased, Mask);
+ Builder.buildSub(Dst, Masked, Bias);
+ MI.eraseFromParent();
+}
+
bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) const {
assert(MI.getOpcode() == TargetOpcode::G_UMULH);
Register RHS = MI.getOperand(2).getReg();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-srem-by-pow2.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-srem-by-pow2.mir
new file mode 100644
index 0000000000000..c22a421ada922
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-srem-by-pow2.mir
@@ -0,0 +1,281 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: srem_pos_pow2_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_pos_pow2_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C1]](s32)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[AND]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[AND]]
+ ; CHECK-NEXT: $w0 = COPY [[SUB]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 4
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: srem_neg_pow2_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_neg_pow2_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C1]](s32)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[AND]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[AND]]
+ ; CHECK-NEXT: $w0 = COPY [[SUB]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 -4
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: srem_one_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_one_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 1
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: srem_minus_one_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_minus_one_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 -1
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: srem_intmin_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_intmin_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C1]](s32)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[AND]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[AND]]
+ ; CHECK-NEXT: $w0 = COPY [[SUB]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 -2147483648
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: srem_pow2_s64
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: srem_pow2_s64
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C1]](s64)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ASHR]], [[C]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[AND]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND1]], [[AND]]
+ ; CHECK-NEXT: $x0 = COPY [[SUB]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = G_CONSTANT i64 8
+ %2:_(s64) = G_SREM %0, %1
+ $x0 = COPY %2(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: srem_pow2_v4s32_splat
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: srem_pow2_v4s32_splat
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = G_ASHR [[COPY]], [[BUILD_VECTOR1]](<4 x s32>)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[ASHR]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[COPY]], [[AND]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s32>) = G_AND [[ADD]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[AND1]], [[AND]]
+ ; CHECK-NEXT: $q0 = COPY [[SUB]](<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:_(<4 x s32>) = COPY $q0
+ %1:_(s32) = G_CONSTANT i32 4
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32)
+ %3:_(<4 x s32>) = G_SREM %0, %2
+ $q0 = COPY %3(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: srem_pow2_v4s32_mixed_sign
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: srem_pow2_v4s32_mixed_sign
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C4]](s32), [[C4]](s32), [[C4]](s32), [[C4]](s32)
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = G_ASHR [[COPY]], [[BUILD_VECTOR1]](<4 x s32>)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[ASHR]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[COPY]], [[AND]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s32>) = G_AND [[ADD]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[AND1]], [[AND]]
+ ; CHECK-NEXT: $q0 = COPY [[SUB]](<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:_(<4 x s32>) = COPY $q0
+ %c0:_(s32) = G_CONSTANT i32 4
+ %c1:_(s32) = G_CONSTANT i32 -8
+ %c2:_(s32) = G_CONSTANT i32 16
+ %c3:_(s32) = G_CONSTANT i32 -1
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %c0, %c1, %c2, %c3
+ %2:_(<4 x s32>) = G_SREM %0, %1
+ $q0 = COPY %2(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+# Negative test: non-pow2 divisor should not match.
+name: srem_nonpow2_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_nonpow2_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655766
+ ; CHECK-NEXT: [[SMULH:%[0-9]+]]:_(s32) = G_SMULH [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SMULH]], [[C2]](s32)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SMULH]], [[LSHR]]
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[ADD]], [[C]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL]]
+ ; CHECK-NEXT: $w0 = COPY [[SUB]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 3
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+# Negative test: non-constant divisor should not match.
+name: srem_nonconst_s32
+body: |
+ bb.1:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: srem_nonconst_s32
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $w0 = COPY [[SREM]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+# Negative test: zero divisor should not match.
+name: srem_zero_s32
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: srem_zero_s32
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY [[SREM]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(s32) = G_SREM %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/srem-vec-crash.ll b/llvm/test/CodeGen/AArch64/srem-vec-crash.ll
index 0b1e430e21105..f447f0ea8903f 100644
--- a/llvm/test/CodeGen/AArch64/srem-vec-crash.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vec-crash.ll
@@ -1,23 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s
define i32 @pr84830(i1 %arg) {
-; CHECK-SD-LABEL: pr84830:
-; CHECK-SD: // %bb.0: // %bb
-; CHECK-SD-NEXT: mov w0, #1 // =0x1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: pr84830:
-; CHECK-GI: // %bb.0: // %bb
-; CHECK-GI-NEXT: mov w8, #1 // =0x1
-; CHECK-GI-NEXT: sbfx w9, w0, #0, #1
-; CHECK-GI-NEXT: sbfx w8, w8, #0, #1
-; CHECK-GI-NEXT: sdiv w10, w9, w8
-; CHECK-GI-NEXT: msub w8, w10, w8, w9
-; CHECK-GI-NEXT: eor w8, w8, #0x1
-; CHECK-GI-NEXT: and w0, w8, #0x1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: pr84830:
+; CHECK: // %bb.0: // %bb
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
bb:
%new0 = srem i1 %arg, true
%last = zext i1 %new0 to i32
✅ With the latest revision this PR passed the C/C++ code formatter.

🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed.

🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed.
Force-pushed dbf5bbd to 177e9bd.
+  if (isKnownToBeAPowerOfTwo(Reg, MRI, VT))
+    return true;
+
+  // Otherwise accept any constant whose absolute value is a power of 2.
I don't understand why you need this special case handling of constants. Surely isKnownToBeAPowerOfTwo should handle constants?
This special case is for negative powers of 2, which won't be handled by isKnownToBeAPowerOfTwo since that only looks for a single set bit.
This should probably be an option in isKnownToBeAPowerOfTwo, rather than adding a second level of analysis in one specific use.
Yep, that is much cleaner, I've switched to this approach.
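For context on the fix: a known-bits power-of-two check tests for exactly one set bit, so a negative divisor such as -8 (0xFFFFFFF8 in 32 bits) fails it even though its absolute value is 2^3. Below is a minimal standalone sketch of the per-lane predicate the patch's constant fallback relies on, built from APInt's existing isPowerOf2/isNegatedPowerOf2 (isAbsPow2 is just an illustrative name, not part of the patch):

```cpp
#include "llvm/ADT/APInt.h"

using llvm::APInt;

// -8 as a 32-bit APInt is 0xFFFFFFF8: 29 bits set, so isPowerOf2()
// (exactly one set bit) is false even though |-8| == 2^3.
// isNegatedPowerOf2() covers that case, including INT_MIN, whose
// two's-complement negation is itself.
static bool isAbsPow2(const APInt &V) {
  return V.isPowerOf2() || V.isNegatedPowerOf2();
}
```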
Adds combine-srem-by-pow2.mir with the existing GISel codegen for SREM by a power of 2 (the magic-multiply expansion via srem_by_const). A follow-up commit adds the srem_pow2_to_mask combine and regenerates these CHECK lines, so the diff there shows what the new combine does.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Adds srem_pow2_to_mask, lowering G_SREM by +/- a power of 2 to a branch-free bias-and-mask sequence:

  %abs    = G_ABS %rhs
  %mask   = G_SUB %abs, 1
  %sign   = G_ASHR %lhs, $(bitwidth - 1)
  %bias   = G_AND %sign, %mask
  %biased = G_ADD %lhs, %bias
  %masked = G_AND %biased, %mask
  %res    = G_SUB %masked, %bias

For constant divisors %mask collapses to a single G_CONSTANT via the CSEMIRBuilder folds for G_ABS and G_SUB. The matcher uses known bits plus a per-lane constant fallback so it catches scalar +/-2^k, vector splats, mixed-sign vector lanes (e.g. <i32 4, i32 -8, i32 16, i32 -1>), and non-constant pow2 patterns like (1 << x). Placed before srem_by_const in intrem_combines so the bias-and-mask shape wins over the magic-multiply expansion.

The CHECK delta in combine-srem-by-pow2.mir shows the codegen change. srem-vec-crash.ll's pr84830 (srem i1 by true) now collapses to the SDAG-equivalent `mov w0, 1; ret` rather than the prior 7-instruction sdiv+msub chain, so the CHECK-SD/CHECK-GI prefixes merge.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
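As a sanity check on the sequence, here is a minimal standalone C++ model of the lowered computation for 32-bit scalars. sremByPow2 is an illustrative name, not anything from the patch, and the model assumes an arithmetic right shift for signed values (true of mainstream compilers, and guaranteed from C++20):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the G_ABS / G_SUB / G_ASHR / G_AND / G_ADD / G_AND / G_SUB
// chain for a 32-bit G_SREM. Pow2 must be +/- a power of two; the
// unsigned negation keeps |INT32_MIN| well defined.
int32_t sremByPow2(int32_t X, int32_t Pow2) {
  uint32_t AbsD = Pow2 < 0 ? 0u - (uint32_t)Pow2 : (uint32_t)Pow2;
  uint32_t Mask = AbsD - 1;            // %mask = G_SUB (G_ABS %rhs), 1
  int32_t Sign = X >> 31;              // %sign: 0 if X >= 0, -1 if X < 0
  int32_t Bias = Sign & (int32_t)Mask; // %bias: |d|-1 only for negative X
  int32_t Biased = X + Bias;           // %biased: never overflows, since a
                                       // nonzero bias moves X towards zero
  int32_t Masked = (int32_t)((uint32_t)Biased & Mask); // %masked
  return Masked - Bias;                // %res
}

int main() {
  // G_SREM, like C's %, rounds towards zero.
  assert(sremByPow2(-5, 4) == -5 % 4);   // -1, where a plain mask gives 3
  assert(sremByPow2(5, 4) == 5 % 4);     //  1
  assert(sremByPow2(-5, -4) == -5 % -4); // -1 (divisor sign is ignored)
  assert(sremByPow2(INT32_MIN, INT32_MIN) == 0);
}
```

The -5 % 4 case is the one the bias exists for: without it, (-5) & 3 would yield 3, i.e. rounding towards -inf instead of towards zero.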
Force-pushed 177e9bd to 588c2c6.