From bc8272a13a4c9c3a4c646e6ffcf9e0f5f93daea0 Mon Sep 17 00:00:00 2001 From: Yu Li Date: Wed, 27 Aug 2025 09:19:23 +0000 Subject: [PATCH 1/8] [GISel] Combine shift + trunc + shift pattern --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 13 ++ .../include/llvm/Target/GlobalISel/Combine.td | 11 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 54 ++++++ llvm/lib/Target/AArch64/AArch64Combine.td | 4 +- llvm/test/CodeGen/AArch64/combine-sdiv.ll | 26 +-- llvm/test/CodeGen/AArch64/rem-by-const.ll | 141 ++++++-------- llvm/test/CodeGen/AArch64/urem-lkk.ll | 29 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 176 +++++++++--------- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 8 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 8 +- .../AMDGPU/GlobalISel/store-local.128.ll | 105 ++++++----- .../AMDGPU/GlobalISel/store-local.96.ll | 96 +++++----- llvm/test/CodeGen/AMDGPU/ds-alignment.ll | 42 ++--- 13 files changed, 369 insertions(+), 344 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 6dba689e8af71..40f612cc98bcc 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -81,6 +81,13 @@ struct ShiftOfShiftedLogic { uint64_t ValSum; }; +struct ShiftOfTruncOfShift { + Register Src; + uint64_t ShiftAmt; + LLT ShiftAmtTy; + LLT InnerShiftTy; +}; + using BuildFnTy = std::function; using OperandBuildSteps = @@ -338,6 +345,12 @@ class CombinerHelper { bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) const; + /// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2)) + bool matchShiftOfTruncOfShift(MachineInstr &MI, + ShiftOfTruncOfShift &MatchInfo) const; + void applyShiftOfTruncOfShift(MachineInstr &MI, + ShiftOfTruncOfShift &MatchInfo) const; + /// Transform a multiply by a power-of-2 value to a left shift. 
bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const; void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 1b7c1235c9805..95bcbbcdaaa70 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -396,6 +396,14 @@ def commute_shift : GICombineRule< [{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>; +// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2)) +def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">; +def shift_of_trunc_of_shift : GICombineRule< + (defs root:$root, shift_of_trunc_of_shift_matchdata:$matchinfo), + (match (wip_match_opcode G_LSHR, G_ASHR):$root, + [{ return Helper.matchShiftOfTruncOfShift(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyShiftOfTruncOfShift(*${root}, ${matchinfo}); }])>; + def narrow_binop_feeding_and : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), (match (wip_match_opcode G_AND):$root, @@ -2133,7 +2141,8 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, - combine_use_vector_truncate, merge_combines, overflow_combines, truncsat_combines]>; + combine_use_vector_truncate, merge_combines, overflow_combines, + truncsat_combines, shift_of_trunc_of_shift]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 0674f5fd1ae06..d3f0731955353 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2094,6 +2094,60 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, return true; } +bool CombinerHelper::matchShiftOfTruncOfShift( + MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const { + unsigned ShiftOpcode = MI.getOpcode(); + assert(ShiftOpcode == TargetOpcode::G_LSHR || + ShiftOpcode == TargetOpcode::G_ASHR); + + Register N0 = MI.getOperand(1).getReg(); + Register N1 = MI.getOperand(2).getReg(); + unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits(); + + APInt N1C; + Register InnerShift; + if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)) || + !mi_match(N0, MRI, m_GTrunc(m_Reg(InnerShift)))) + return false; + + auto *InnerMI = MRI.getVRegDef(InnerShift); + if (InnerMI->getOpcode() != ShiftOpcode) + return false; + + APInt N001C; + auto N001 = InnerMI->getOperand(2).getReg(); + if (!mi_match(N001, MRI, m_ICstOrSplat(N001C))) + return false; + + uint64_t c1 = N001C.getZExtValue(); + uint64_t c2 = N1C.getZExtValue(); + LLT InnerShiftTy = MRI.getType(InnerShift); + uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits(); + if (!(c1 + OpSizeInBits == InnerShiftSize) || !(c1 + c2 < InnerShiftSize)) + return false; + + MatchInfo.Src = InnerMI->getOperand(1).getReg(); + MatchInfo.ShiftAmt = c1 + c2; + MatchInfo.ShiftAmtTy = MRI.getType(N001); + MatchInfo.InnerShiftTy = InnerShiftTy; + return true; +} + +void CombinerHelper::applyShiftOfTruncOfShift( + MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const { + unsigned ShiftOpcode = 
MI.getOpcode(); + assert(ShiftOpcode == TargetOpcode::G_LSHR || + ShiftOpcode == TargetOpcode::G_ASHR); + + Register Dst = MI.getOperand(0).getReg(); + auto ShiftAmt = + Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt); + auto Shift = Builder.buildInstr(ShiftOpcode, {MatchInfo.InnerShiftTy}, + {MatchInfo.Src, ShiftAmt}); + Builder.buildTrunc(Dst, Shift); + MI.eraseFromParent(); +} + bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 5f499e5e9700a..e44819ad5a4ae 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -349,6 +349,8 @@ def AArch64PostLegalizerLowering } // Post-legalization combines which are primarily optimizations. + + def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", [copy_prop, cast_of_cast_combines, @@ -369,5 +371,5 @@ def AArch64PostLegalizerCombiner commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines]> { + extmultomull, truncsat_combines, shift_of_trunc_of_shift]> { } diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll index 9d0ade2480428..014eaee5ebb2f 100644 --- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -1684,24 +1684,14 @@ define i32 @combine_i32_sdiv_const7(i32 %x) { } define i32 @combine_i32_sdiv_const100(i32 %x) { -; CHECK-SD-LABEL: combine_i32_sdiv_const100: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: smull x8, w0, w8 -; CHECK-SD-NEXT: asr x8, x8, #37 -; CHECK-SD-NEXT: add w0, w8, w8, lsr #31 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: combine_i32_sdiv_const100: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #34079 // =0x851f -; CHECK-GI-NEXT: movk w8, #20971, lsl #16 -; CHECK-GI-NEXT: smull x8, w0, w8 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: asr w8, w8, #5 -; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: combine_i32_sdiv_const100: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: asr x8, x8, #37 +; CHECK-NEXT: add w0, w8, w8, lsr #31 +; CHECK-NEXT: ret %1 = sdiv i32 %x, 100 ret i32 %1 } diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index c57383ad9b1e7..f36a87794be35 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -276,28 +276,16 @@ entry: } define i32 @si32_100(i32 %a, i32 %b) { -; CHECK-SD-LABEL: si32_100: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: mov w9, #100 // =0x64 -; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: smull x8, w0, w8 -; CHECK-SD-NEXT: asr x8, x8, #37 -; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 -; CHECK-SD-NEXT: msub w0, w8, w9, w0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: si32_100: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #34079 // =0x851f -; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: movk w8, #20971, lsl #16 -; CHECK-GI-NEXT: smull x8, w0, w8 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: 
asr w8, w8, #5 -; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 -; CHECK-GI-NEXT: msub w0, w8, w9, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: si32_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: asr x8, x8, #37 +; CHECK-NEXT: add w8, w8, w8, lsr #31 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret entry: %s = srem i32 %a, 100 ret i32 %s @@ -336,26 +324,15 @@ entry: } define i32 @ui32_100(i32 %a, i32 %b) { -; CHECK-SD-LABEL: ui32_100: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: mov w9, #100 // =0x64 -; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: umull x8, w0, w8 -; CHECK-SD-NEXT: lsr x8, x8, #37 -; CHECK-SD-NEXT: msub w0, w8, w9, w0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ui32_100: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #34079 // =0x851f -; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: movk w8, #20971, lsl #16 -; CHECK-GI-NEXT: umull x8, w0, w8 -; CHECK-GI-NEXT: lsr x8, x8, #32 -; CHECK-GI-NEXT: lsr w8, w8, #5 -; CHECK-GI-NEXT: msub w0, w8, w9, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ui32_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #37 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret entry: %s = urem i32 %a, 100 ret i32 %s @@ -1118,13 +1095,12 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-LABEL: sv8i8_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.8b, #41 -; CHECK-GI-NEXT: movi v3.8b, #100 +; CHECK-GI-NEXT: movi v2.8b, #100 ; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b -; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 -; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4 -; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 -; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4 -; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b +; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #12 +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: usra v1.8b, v1.8b, #7 +; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1619,15 +1595,25 @@ entry: } define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) { -; CHECK-LABEL: uv8i8_100: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.8b, #41 -; CHECK-NEXT: movi v2.8b, #100 -; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b -; CHECK-NEXT: shrn v1.8b, v1.8h, #8 -; CHECK-NEXT: ushr v1.8b, v1.8b, #4 -; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv8i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.8b, #41 +; CHECK-SD-NEXT: movi v2.8b, #100 +; CHECK-SD-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-SD-NEXT: ushr v1.8b, v1.8b, #4 +; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v1.8b, #41 +; CHECK-GI-NEXT: movi v2.8b, #100 +; CHECK-GI-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #12 +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ret entry: %s = urem <8 x i8> %d, ret <8 x i8> %s @@ -1904,14 +1890,13 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-LABEL: sv4i16_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: adrp x8, .LCPI44_0 -; CHECK-GI-NEXT: movi v3.4h, #7 +; CHECK-GI-NEXT: 
movi v2.4h, #7 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0] ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1 -; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 -; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1 -; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #17 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -1934,14 +1919,13 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-LABEL: sv4i16_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: adrp x8, .LCPI45_0 -; CHECK-GI-NEXT: movi v3.4h, #100 +; CHECK-GI-NEXT: movi v2.4h, #100 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0] ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3 -; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 -; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3 -; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #19 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -2301,8 +2285,8 @@ define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI53_0] ; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: movi v2.4h, #100 -; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #1 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #17 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s ; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -2424,14 +2408,13 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-LABEL: sv2i32_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: adrp x8, .LCPI57_0 -; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: movi v2.2s, #100 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0] ; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5 -; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 -; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s +; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #37 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: usra v1.2s, v1.2s, #31 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2656,8 +2639,8 @@ define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-NEXT: movi v2.2s, #100 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI63_0] ; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #5 +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #37 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d ; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll index 0dd6685555826..40016c7e4ce0f 100644 --- a/llvm/test/CodeGen/AArch64/urem-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll @@ -20,26 +20,15 @@ define i32 @fold_urem_positive_odd(i32 %x) { } define i32 @fold_urem_positive_even(i32 %x) { -; CHECK-SD-LABEL: fold_urem_positive_even: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #16323 // =0x3fc3 -; CHECK-SD-NEXT: mov w9, #1060 // =0x424 -; CHECK-SD-NEXT: movk w8, #63310, lsl #16 -; CHECK-SD-NEXT: umull x8, w0, w8 -; CHECK-SD-NEXT: lsr x8, x8, #42 -; CHECK-SD-NEXT: msub w0, w8, w9, w0 -; CHECK-SD-NEXT: ret 
-; -; CHECK-GI-LABEL: fold_urem_positive_even: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #16323 // =0x3fc3 -; CHECK-GI-NEXT: mov w9, #1060 // =0x424 -; CHECK-GI-NEXT: movk w8, #63310, lsl #16 -; CHECK-GI-NEXT: umull x8, w0, w8 -; CHECK-GI-NEXT: lsr x8, x8, #32 -; CHECK-GI-NEXT: lsr w8, w8, #10 -; CHECK-GI-NEXT: msub w0, w8, w9, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16323 // =0x3fc3 +; CHECK-NEXT: mov w9, #1060 // =0x424 +; CHECK-NEXT: movk w8, #63310, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #42 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fd329e230e78b..09bb9c94dc81e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3679,22 +3679,21 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; ; GFX8-LABEL: s_fshl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_and_b32 s6, s2, 15 -; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s5, s2, 15 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 15 -; GFX8-NEXT: s_andn2_b32 s2, 15, s5 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s3, s4, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s4, s2, 16 +; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_lshr_b32 s5, s5, 1 +; GFX8-NEXT: s_lshr_b32 s2, s5, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s4, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 17 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s4 +; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -3806,13 +3805,12 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 15 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3879,14 +3877,14 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 7, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3957,11 +3955,10 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, -1 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s1, 17 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -4051,11 +4048,10 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 17, v0 ; GFX8-NEXT: s_lshl_b32 s0, s2, s0 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -4135,21 +4131,20 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; ; GFX8-LABEL: v_fshl_v2i16_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s1, 15 +; GFX8-NEXT: s_and_b32 s3, s1, 15 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s3, v0 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_and_b32 s0, s3, 15 -; GFX8-NEXT: s_andn2_b32 s1, 15, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 17 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -4249,23 +4244,22 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> 
inreg %rhs, < ; ; GFX8-LABEL: s_fshl_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshr_b32 s8, s4, 16 -; GFX8-NEXT: s_and_b32 s9, s4, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s8, s4, 15 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s8, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s8 -; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s7, 1 -; GFX8-NEXT: s_lshr_b32 s4, s6, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshr_b32 s8, s8, 1 +; GFX8-NEXT: s_lshr_b32 s4, s8, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s7, 15 +; GFX8-NEXT: s_andn2_b32 s7, 15, s7 +; GFX8-NEXT: s_lshr_b32 s2, s2, 17 +; GFX8-NEXT: s_lshl_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s2, s2, s7 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 @@ -4462,13 +4456,12 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, 15 -; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v7, 1 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 17, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 @@ -4586,39 +4579,37 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX8-LABEL: s_fshl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_and_b32 s12, s4, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s4 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s10, s4, 15 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s10, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s10 -; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s8, 1 -; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_lshr_b32 s4, s6, s4 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s10 +; GFX8-NEXT: s_and_b32 s10, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshr_b32 s10, s10, 1 +; GFX8-NEXT: s_lshr_b32 s4, s10, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; 
GFX8-NEXT: s_and_b32 s4, s8, 15 +; GFX8-NEXT: s_andn2_b32 s8, 15, s8 +; GFX8-NEXT: s_lshr_b32 s2, s2, 17 +; GFX8-NEXT: s_lshl_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s2, s2, s8 +; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s4, s5, 15 -; GFX8-NEXT: s_andn2_b32 s5, 15, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s9, s5, 16 +; GFX8-NEXT: s_andn2_b32 s5, 15, s5 +; GFX8-NEXT: s_lshr_b32 s4, s4, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, s5 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, s9, 15 +; GFX8-NEXT: s_andn2_b32 s5, 15, s9 +; GFX8-NEXT: s_lshr_b32 s3, s3, 17 +; GFX8-NEXT: s_lshl_b32 s4, s7, s4 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_and_b32 s3, s11, 15 -; GFX8-NEXT: s_andn2_b32 s4, 15, s11 -; GFX8-NEXT: s_lshr_b32 s5, s9, 1 -; GFX8-NEXT: s_lshl_b32 s3, s7, s3 -; GFX8-NEXT: s_lshr_b32 s4, s5, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s2 @@ -4803,26 +4794,25 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, 15 -; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 17, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 17, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index b74c6eeabb239..18f6be7fd3e40 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3623,14 +3623,14 @@ define <2 x i16> 
@v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 7, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..8533e34ff13f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -807,10 +807,10 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { ; GFX8-LABEL: v_lshr_v2i16_15: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 15 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 31 +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 15, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_v2i16_15: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 38ef707fa65a2..3685eed5043a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -71,14 +71,14 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s5, 0xffff, s0 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: s_lshr_b32 s5, s5, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -90,7 +90,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, 
s2, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -114,7 +114,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s3, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -181,37 +181,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshr_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 -; GFX10-NEXT: s_lshr_b32 s4, s6, 8 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 +; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 -; GFX10-NEXT: v_mov_b32_e32 v7, s5 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 -; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:3 +; GFX10-NEXT: ds_write_b8 v1, v6 offset:6 +; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 @@ -221,7 +221,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 +; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 @@ -240,38 +240,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s0 ; GFX11-NEXT: s_lshr_b32 s5, s0, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 -; 
GFX11-NEXT: s_and_b32 s4, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s6, s6, 8 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: s_lshr_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s7, 0xffff, s2 -; GFX11-NEXT: s_lshr_b32 s2, s6, 8 -; GFX11-NEXT: s_lshr_b32 s6, s5, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 ; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s4, s4, 8 -; GFX11-NEXT: s_lshr_b32 s5, s0, 8 ; GFX11-NEXT: s_lshr_b32 s0, s7, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: s_lshr_b32 s0, s2, 24 +; GFX11-NEXT: s_lshr_b32 s1, s9, 8 ; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:3 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:7 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: s_lshr_b32 s0, s1, 8 -; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:7 ; GFX11-NEXT: v_mov_b32_e32 v4, s0 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX11-NEXT: s_lshr_b32 s1, s3, 16 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_lshr_b32 s0, s0, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 -; GFX11-NEXT: s_lshr_b32 s0, s1, 8 +; GFX11-NEXT: s_lshr_b32 s1, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_lshr_b32 s0, s3, 24 ; GFX11-NEXT: v_mov_b32_e32 v8, s0 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 ; GFX11-NEXT: ds_store_b8 v1, v0 offset:9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 1d2d330eeb61a..cce6bd9301cbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -72,15 +72,15 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX9-NEXT: s_and_b32 s5, 0xffff, s0 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: s_lshr_b32 s3, s5, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -92,7 +92,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -104,7 +104,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_lshr_b32 s1, s2, 24 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 @@ -163,37 +163,37 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshr_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 -; GFX10-NEXT: s_lshr_b32 s3, s3, 8 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v7, s5 -; GFX10-NEXT: v_mov_b32_e32 v8, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s8, 8 +; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s5, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v9, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:3 +; GFX10-NEXT: ds_write_b8 v1, v6 offset:6 +; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 @@ -206,37 +206,37 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX11-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-NEXT: s_lshr_b32 s5, s5, 8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s3 -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: s_lshr_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX11-NEXT: s_lshr_b32 s2, s5, 8 -; GFX11-NEXT: s_lshr_b32 s5, s4, 8 -; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v9, s5 ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s3, s3, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshr_b32 s0, s6, 8 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3 -; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v12, s6 +; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: s_lshr_b32 s1, s8, 8 +; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s1 ; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:3 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v10 offset:7 +; GFX11-NEXT: ds_store_b8 v1, v11 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:7 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 -; GFX11-NEXT: ds_store_b8 v1, v11 offset:9 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:10 -; GFX11-NEXT: ds_store_b8 v1, v12 offset:11 +; GFX11-NEXT: ds_store_b8 v1, v12 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v10 offset:11 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index 93422e259b827..4b52d6efb8e98 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -105,14 +105,13 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) ; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) @@ -121,11 +120,11 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:1 +; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:2 +; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:3 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds4align1: @@ -262,14 +261,13 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v4 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 ; ALIGNED-GISEL-NEXT: s_endpgm @@ -448,26 +446,25 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v5 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 ; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 ; ALIGNED-GISEL-NEXT: s_endpgm @@ -765,26 +762,25 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; 
ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v6 offset:3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:3 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 ; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14 ; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15 ; ALIGNED-GISEL-NEXT: s_endpgm From 41879496bf8e9d34b9788896947c6214bfc88b7e Mon Sep 17 00:00:00 2001 From: Yu Li Date: Wed, 27 Aug 2025 09:41:36 +0000 Subject: [PATCH 2/8] whitespace --- llvm/lib/Target/AArch64/AArch64Combine.td | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index e44819ad5a4ae..a64f767ff320d 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -349,8 +349,6 @@ def AArch64PostLegalizerLowering } // Post-legalization combines which are primarily optimizations. 
- - def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", [copy_prop, cast_of_cast_combines, From 74bcf6cd44bc4ccfbeeef89fd8d8a7cc0ab80efb Mon Sep 17 00:00:00 2001 From: Yu Li Date: Wed, 27 Aug 2025 12:14:40 +0000 Subject: [PATCH 3/8] remove wip_match_opcode from GICombineRule --- llvm/include/llvm/Target/GlobalISel/Combine.td | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 95bcbbcdaaa70..2ccad1fe1046e 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -397,10 +397,15 @@ def commute_shift : GICombineRule< (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>; // Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2)) +def shift_right_op : GICombinePatFrag< + (outs root:$dst), (ins), + !foreach(op, + [G_LSHR, G_ASHR], + (pattern (op $dst, $shifted, $amt)))>; def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">; def shift_of_trunc_of_shift : GICombineRule< - (defs root:$root, shift_of_trunc_of_shift_matchdata:$matchinfo), - (match (wip_match_opcode G_LSHR, G_ASHR):$root, + (defs root:$dst, shift_of_trunc_of_shift_matchdata:$matchinfo), + (match (shift_right_op $dst):$root, [{ return Helper.matchShiftOfTruncOfShift(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyShiftOfTruncOfShift(*${root}, ${matchinfo}); }])>; From 716eee077c752fde9d1a607119b950b5361eff49 Mon Sep 17 00:00:00 2001 From: Yu Li Date: Thu, 4 Sep 2025 15:20:15 +0000 Subject: [PATCH 4/8] resolved comments --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 15 +- .../include/llvm/Target/GlobalISel/Combine.td | 29 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 68 +-- llvm/lib/Target/AArch64/AArch64Combine.td | 2 +- llvm/test/CodeGen/AArch64/combine-sdiv.ll | 26 +- llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll | 125 +++++ llvm/test/CodeGen/AArch64/rem-by-const.ll | 76 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 478 +++++++++--------- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 6 +- 9 files changed, 488 insertions(+), 337 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 40f612cc98bcc..46e099e092f54 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -81,9 +81,11 @@ struct ShiftOfShiftedLogic { uint64_t ValSum; }; -struct ShiftOfTruncOfShift { +struct LshrOfTruncOfLshr { + bool Mask = false; + APInt MaskVal; Register Src; - uint64_t ShiftAmt; + APInt ShiftAmt; LLT ShiftAmtTy; LLT InnerShiftTy; }; @@ -345,11 +347,10 @@ class CombinerHelper { bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) const; - /// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2)) - bool matchShiftOfTruncOfShift(MachineInstr &MI, - ShiftOfTruncOfShift &MatchInfo) const; - void applyShiftOfTruncOfShift(MachineInstr &MI, - ShiftOfTruncOfShift &MatchInfo) const; + /// Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (shift x, (C1 + C2)) + bool matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI, MachineInstr &TruncMI) const; + void applyLshrOfTruncOfLshr(MachineInstr &MI, + LshrOfTruncOfLshr &MatchInfo) const; /// Transform a multiply by a power-of-2 value to a left shift.
bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 2ccad1fe1046e..b0c8315d19096 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -396,18 +396,21 @@ def commute_shift : GICombineRule< [{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>; -// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2)) -def shift_right_op : GICombinePatFrag< - (outs root:$dst), (ins), - !foreach(op, - [G_LSHR, G_ASHR], - (pattern (op $dst, $shifted, $amt)))>; -def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">; -def shift_of_trunc_of_shift : GICombineRule< - (defs root:$dst, shift_of_trunc_of_shift_matchdata:$matchinfo), - (match (shift_right_op $dst):$root, - [{ return Helper.matchShiftOfTruncOfShift(*${root}, ${matchinfo}); }]), - (apply [{ Helper.applyShiftOfTruncOfShift(*${root}, ${matchinfo}); }])>; +// Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (lshr x, (C1 + C2)) +def lshr_of_trunc_of_lshr_matchdata : GIDefMatchData<"LshrOfTruncOfLshr">; +//def lshr_of_trunc_of_lshr : GICombineRule< +// (defs root:$root, lshr_of_trunc_of_lshr_matchdata:$matchinfo), +// (match (G_LSHR $dst, $x, $y):$root, +// [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}); }]), +// (apply [{ Helper.applyLshrOfTruncOfLshr(*${root}, ${matchinfo}); }])>; + +def lshr_of_trunc_of_lshr : GICombineRule< + (defs root:$root, lshr_of_trunc_of_lshr_matchdata:$matchinfo), + (match (G_LSHR $d1, $x, $y):$Shift, + (G_TRUNC $d2, $d1):$Trunc, + (G_LSHR $dst, $d2, $z):$root, + [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}, *${Shift}, *${Trunc}); }]), + (apply [{ Helper.applyLshrOfTruncOfLshr(*${root}, ${matchinfo}); }])>; def narrow_binop_feeding_and : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), @@ -2147,7 +2150,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, combine_use_vector_truncate, merge_combines, overflow_combines, - truncsat_combines, shift_of_trunc_of_shift]>; + truncsat_combines, lshr_of_trunc_of_lshr]>; // A combine group used to for prelegalizer combiners at -O0. 
The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index d3f0731955353..3527b92bcb9f4 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2094,57 +2094,63 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, return true; } -bool CombinerHelper::matchShiftOfTruncOfShift( - MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const { +bool CombinerHelper::matchLshrOfTruncOfLshr( + MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI, MachineInstr &TruncMI) const { unsigned ShiftOpcode = MI.getOpcode(); - assert(ShiftOpcode == TargetOpcode::G_LSHR || - ShiftOpcode == TargetOpcode::G_ASHR); + assert(ShiftOpcode == TargetOpcode::G_LSHR); Register N0 = MI.getOperand(1).getReg(); Register N1 = MI.getOperand(2).getReg(); unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits(); - APInt N1C; - Register InnerShift; - if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)) || - !mi_match(N0, MRI, m_GTrunc(m_Reg(InnerShift)))) + APInt N1C, N001C; + if (!mi_match(N1, MRI, m_ICstOrSplat(N1C))) return false; - - auto *InnerMI = MRI.getVRegDef(InnerShift); - if (InnerMI->getOpcode() != ShiftOpcode) - return false; - - APInt N001C; - auto N001 = InnerMI->getOperand(2).getReg(); + auto N001 = ShiftMI.getOperand(2).getReg(); if (!mi_match(N001, MRI, m_ICstOrSplat(N001C))) return false; - uint64_t c1 = N001C.getZExtValue(); - uint64_t c2 = N1C.getZExtValue(); + if (N001C.getBitWidth() > N1C.getBitWidth()) + N1C = N1C.zext(N001C.getBitWidth()); + else + N001C = N001C.zext(N1C.getBitWidth()); + + Register InnerShift = ShiftMI.getOperand(0).getReg(); LLT InnerShiftTy = MRI.getType(InnerShift); uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits(); - if (!(c1 + OpSizeInBits == InnerShiftSize) || !(c1 + c2 < InnerShiftSize)) - return false; + if ((N1C + N001C).ult(InnerShiftSize)) { + MatchInfo.Src = ShiftMI.getOperand(1).getReg(); + MatchInfo.ShiftAmt = N1C + N001C; + MatchInfo.ShiftAmtTy = MRI.getType(N001); + MatchInfo.InnerShiftTy = InnerShiftTy; - MatchInfo.Src = InnerMI->getOperand(1).getReg(); - MatchInfo.ShiftAmt = c1 + c2; - MatchInfo.ShiftAmtTy = MRI.getType(N001); - MatchInfo.InnerShiftTy = InnerShiftTy; - return true; + if ((N001C + OpSizeInBits) == InnerShiftSize) + return true; + if (MRI.hasOneUse(N0) && MRI.hasOneUse(InnerShift)) { + MatchInfo.Mask = true; + MatchInfo.MaskVal = APInt(N1C.getBitWidth(), OpSizeInBits) - N1C; + return true; + } + } + return false; } -void CombinerHelper::applyShiftOfTruncOfShift( - MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const { +void CombinerHelper::applyLshrOfTruncOfLshr( + MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const { unsigned ShiftOpcode = MI.getOpcode(); - assert(ShiftOpcode == TargetOpcode::G_LSHR || - ShiftOpcode == TargetOpcode::G_ASHR); + assert(ShiftOpcode == TargetOpcode::G_LSHR); Register Dst = MI.getOperand(0).getReg(); auto ShiftAmt = Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt); - auto Shift = Builder.buildInstr(ShiftOpcode, {MatchInfo.InnerShiftTy}, - {MatchInfo.Src, ShiftAmt}); - Builder.buildTrunc(Dst, Shift); + auto Shift = Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt); + if (MatchInfo.Mask == true) { + APInt MaskVal = APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), MatchInfo.MaskVal.getZExtValue()); + auto Mask = 
Builder.buildConstant(MatchInfo.ShiftAmtTy, MaskVal); + auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask); + Builder.buildTrunc(Dst, And); + } else + Builder.buildTrunc(Dst, Shift); MI.eraseFromParent(); } diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index a64f767ff320d..076a6235eef0a 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -369,5 +369,5 @@ def AArch64PostLegalizerCombiner commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines, shift_of_trunc_of_shift]> { + extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> { } diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll index 014eaee5ebb2f..9d0ade2480428 100644 --- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -1684,14 +1684,24 @@ define i32 @combine_i32_sdiv_const7(i32 %x) { } define i32 @combine_i32_sdiv_const100(i32 %x) { -; CHECK-LABEL: combine_i32_sdiv_const100: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 // =0x851f -; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: asr x8, x8, #37 -; CHECK-NEXT: add w0, w8, w8, lsr #31 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: combine_i32_sdiv_const100: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: smull x8, w0, w8 +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: add w0, w8, w8, lsr #31 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i32_sdiv_const100: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #34079 // =0x851f +; CHECK-GI-NEXT: movk w8, #20971, lsl #16 +; CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: asr w8, w8, #5 +; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: ret %1 = sdiv i32 %x, 100 ret i32 %1 } diff --git a/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll b/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll new file mode 100644 index 0000000000000..8a576fc346bdc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define i32 @s32_test1(i64 %a) { +; CHECK-LABEL: s32_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x0, x0, #48 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %r = lshr i64 %a, 32 + %ret = trunc i64 %r to i32 + %x = lshr i32 %ret, 16 + ret i32 %x +} + +define i32 @s32_test2(i64 %a) { +; CHECK-LABEL: s32_test2: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx x0, x0, #32, #16 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %r = lshr i64 %a, 16 + %ret = trunc i64 %r to i32 + %x = lshr i32 %ret, 16 + ret i32 %x +} + +define <8 x i8> @v8s8_test1(<8 x i16> %a) { +; CHECK-LABEL: v8s8_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.8h, v0.8h, #12 +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret + %r = lshr <8 x i16> %a, + %ret = trunc <8 x i16> %r to <8 x i8> + %x = lshr <8 x i8> %ret, + ret <8 x i8> %x +} + +define <8 x i8> 
@v8s8_test2(<8 x i16> %a) { +; CHECK-SD-LABEL: v8s8_test2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-SD-NEXT: bic v0.8h, #240 +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8s8_test2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.8h, #15 +; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: ret + %r = lshr <8 x i16> %a, + %ret = trunc <8 x i16> %r to <8 x i8> + %x = lshr <8 x i8> %ret, + ret <8 x i8> %x +} + +define <4 x i16> @v4s16_test1(<4 x i32> %a) { +; CHECK-LABEL: v4s16_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4s, v0.4s, #24 +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %r = lshr <4 x i32> %a, + %ret = trunc <4 x i32> %r to <4 x i16> + %x = lshr <4 x i16> %ret, + ret <4 x i16> %x +} + +define <4 x i16> @v4s16_test2(<4 x i32> %a) { +; CHECK-SD-LABEL: v4s16_test2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4s16_test2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #16 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret + %r = lshr <4 x i32> %a, + %ret = trunc <4 x i32> %r to <4 x i16> + %x = lshr <4 x i16> %ret, + ret <4 x i16> %x +} + +define <2 x i32> @v2s32_test1(<2 x i64> %a) { +; CHECK-LABEL: v2s32_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.2d, v0.2d, #48 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %r = lshr <2 x i64> %a, + %ret = trunc <2 x i64> %r to <2 x i32> + %x = lshr <2 x i32> %ret, + ret <2 x i32> %x +} + +define <2 x i32> @v2s32_test2(<2 x i64> %a) { +; CHECK-SD-LABEL: v2s32_test2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-SD-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2s32_test2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0x0000000000ffff +; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #32 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: xtn v0.2s, v0.2d +; CHECK-GI-NEXT: ret + %r = lshr <2 x i64> %a, + %ret = trunc <2 x i64> %r to <2 x i32> + %x = lshr <2 x i32> %ret, + ret <2 x i32> %x +} diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index f36a87794be35..599fa510d4aea 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -276,16 +276,28 @@ entry: } define i32 @si32_100(i32 %a, i32 %b) { -; CHECK-LABEL: si32_100: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #34079 // =0x851f -; CHECK-NEXT: mov w9, #100 // =0x64 -; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: asr x8, x8, #37 -; CHECK-NEXT: add w8, w8, w8, lsr #31 -; CHECK-NEXT: msub w0, w8, w9, w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: si32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: smull x8, w0, w8 +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #34079 // =0x851f +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk w8, #20971, lsl #16 +; 
CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: asr w8, w8, #5 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 +; CHECK-GI-NEXT: ret entry: %s = srem i32 %a, 100 ret i32 %s @@ -1095,12 +1107,13 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) { ; CHECK-GI-LABEL: sv8i8_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: movi v1.8b, #41 -; CHECK-GI-NEXT: movi v2.8b, #100 +; CHECK-GI-NEXT: movi v3.8b, #100 ; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b -; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #12 -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: usra v1.8b, v1.8b, #7 -; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1890,13 +1903,14 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-LABEL: sv4i16_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: adrp x8, .LCPI44_0 -; CHECK-GI-NEXT: movi v2.4h, #7 +; CHECK-GI-NEXT: movi v3.4h, #7 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0] ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #17 -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15 -; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -1919,13 +1933,14 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; CHECK-GI-LABEL: sv4i16_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: adrp x8, .LCPI45_0 -; CHECK-GI-NEXT: movi v2.4h, #100 +; CHECK-GI-NEXT: movi v3.4h, #100 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0] ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #19 -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15 -; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -2408,13 +2423,14 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-LABEL: sv2i32_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: adrp x8, .LCPI57_0 -; CHECK-GI-NEXT: movi v2.2s, #100 +; CHECK-GI-NEXT: movi v3.2s, #100 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0] ; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #37 -; CHECK-GI-NEXT: xtn v1.2s, v1.2d -; CHECK-GI-NEXT: usra v1.2s, v1.2s, #31 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 09bb9c94dc81e..bd5303213a690 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -971,100 +971,98 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; 
GFX8-LABEL: s_fshl_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s6, s1, 8 -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshr_b32 s9, s2, 8 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s11, s2, 7 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 -; GFX8-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_andn2_b32 s3, 7, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s10, 7 -; GFX8-NEXT: s_and_b32 s3, s7, 0xff -; GFX8-NEXT: s_lshl_b32 s2, s4, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s11, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s8, s2, 8 +; GFX8-NEXT: s_lshr_b32 s9, s2, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 24 +; GFX8-NEXT: s_lshr_b32 s11, s11, 1 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshr_b32 s2, s11, s2 +; GFX8-NEXT: s_lshr_b32 s6, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s8, 7 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_andn2_b32 s4, 7, s10 -; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_andn2_b32 s6, 7, s8 +; GFX8-NEXT: s_lshr_b32 s3, s3, s6 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s4, s8, 1 -; GFX8-NEXT: s_andn2_b32 s5, 7, s11 -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s4, s5 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_and_b32 s3, s9, 7 +; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s4, 1 +; GFX8-NEXT: s_andn2_b32 s6, 7, s9 +; GFX8-NEXT: s_lshr_b32 s4, s4, s6 ; GFX8-NEXT: s_or_b32 s3, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, 0xff +; GFX8-NEXT: s_and_b32 s4, s10, 7 +; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 25 +; GFX8-NEXT: s_andn2_b32 s5, 7, s10 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshr_b32 s1, s1, s5 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, 0xff +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s6, s1, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s2, 8 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s11, s2, 7 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 
; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s12 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 -; GFX9-NEXT: s_and_b32 s2, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_andn2_b32 s3, 7, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s2, s10, 7 -; GFX9-NEXT: s_and_b32 s3, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s2, s4, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s11 +; GFX9-NEXT: s_and_b32 s11, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s8, s2, 8 +; GFX9-NEXT: s_lshr_b32 s9, s2, 16 +; GFX9-NEXT: s_lshr_b32 s10, s2, 24 +; GFX9-NEXT: s_lshr_b32 s11, s11, 1 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshr_b32 s2, s11, s2 +; GFX9-NEXT: s_lshr_b32 s6, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s8, 7 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_andn2_b32 s4, 7, s10 -; GFX9-NEXT: s_lshr_b32 s3, s3, s4 +; GFX9-NEXT: s_andn2_b32 s6, 7, s8 +; GFX9-NEXT: s_lshr_b32 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_or_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s4, s8, 1 -; GFX9-NEXT: s_andn2_b32 s5, 7, s11 -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, 0xff +; GFX9-NEXT: s_and_b32 s3, s9, 7 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s4, 1 +; GFX9-NEXT: s_andn2_b32 s6, 7, s9 +; GFX9-NEXT: s_lshr_b32 s4, s4, s6 ; GFX9-NEXT: s_or_b32 s3, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 0xff +; GFX9-NEXT: s_and_b32 s4, s10, 7 +; GFX9-NEXT: s_lshl_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 25 +; GFX9-NEXT: s_andn2_b32 s5, 7, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshr_b32 s1, s1, s5 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -1072,100 +1070,98 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s8, s2, 8 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s11, s11, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s2, s11, s2 +; 
GFX10-NEXT: s_and_b32 s11, s8, 7 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_andn2_b32 s8, 7, s8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_lshr_b32 s6, s8, 1 -; GFX10-NEXT: s_andn2_b32 s7, 7, s11 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s6, s7 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, s11 +; GFX10-NEXT: s_lshr_b32 s6, s6, s8 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s6 +; GFX10-NEXT: s_and_b32 s3, s7, 0xff +; GFX10-NEXT: s_and_b32 s6, s9, 7 +; GFX10-NEXT: s_lshr_b32 s3, s3, 1 +; GFX10-NEXT: s_andn2_b32 s7, 7, s9 +; GFX10-NEXT: s_lshl_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_lshr_b32 s1, s1, 25 +; GFX10-NEXT: s_andn2_b32 s7, 7, s10 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s7 +; GFX10-NEXT: s_or_b32 s3, s4, s3 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s5, s1 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s1, 16 -; GFX11-NEXT: s_lshr_b32 s8, s1, 24 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 +; GFX11-NEXT: s_and_b32 s11, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-NEXT: s_lshr_b32 s10, s2, 24 ; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s11, s11, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s2, s11, s2 +; GFX11-NEXT: s_and_b32 s11, s8, 7 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_not1_b32 s8, 7, s8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: 
s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_lshr_b32 s6, s8, 1 -; GFX11-NEXT: s_and_not1_b32 s7, 7, s11 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s6, s7 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, s11 +; GFX11-NEXT: s_lshr_b32 s6, s6, s8 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s6 +; GFX11-NEXT: s_and_b32 s3, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s9, 7 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s9 +; GFX11-NEXT: s_lshl_b32 s4, s4, s6 +; GFX11-NEXT: s_lshr_b32 s3, s3, s7 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_lshr_b32 s1, s1, 25 +; GFX11-NEXT: s_and_not1_b32 s7, 7, s10 +; GFX11-NEXT: s_lshl_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s7 +; GFX11-NEXT: s_or_b32 s3, s4, s3 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 24 +; GFX11-NEXT: s_or_b32 s1, s5, s1 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -1248,18 +1244,18 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-NEXT: v_mov_b32_e32 v7, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 7 -; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_mov_b32_e32 v8, -1 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v9, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -1298,21 +1294,21 @@ define i32 
@v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-NEXT: v_mov_b32_e32 v7, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 7 -; GFX9-NEXT: v_mov_b32_e32 v10, -1 +; GFX9-NEXT: v_mov_b32_e32 v9, -1 ; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9 +; GFX9-NEXT: v_lshrrev_b16_e32 v8, v10, v8 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1327,111 +1323,109 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 +; GFX10-NEXT: v_lshrrev_b16 v7, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v9 +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v11 ; GFX10-NEXT: v_mov_b32_e32 v10, 0xff ; GFX10-NEXT: v_mov_b32_e32 v11, -1 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX10-NEXT: v_mov_b32_e32 v13, 7 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, 
v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX10-NEXT: v_mov_b32_e32 v12, 7 +; GFX10-NEXT: v_lshrrev_b16 v9, 1, v9 +; GFX10-NEXT: v_and_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v13, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v9 ; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12 -; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b16 v9, v13, v10 +; GFX10-NEXT: v_lshlrev_b16 v2, v2, v6 +; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshrrev_b16 v6, v8, v7 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, 
v2 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX11-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshrrev_b16 v6, 1, v6 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 +; GFX11-NEXT: v_lshlrev_b16 v3, v8, v3 +; GFX11-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v1 ; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 +; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_lshrrev_b16 v12, 1, v12 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 -; GFX11-NEXT: v_lshlrev_b16 v5, v11, v5 -; GFX11-NEXT: v_lshrrev_b16 v7, v13, v8 -; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v8, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, v10, v5 +; GFX11-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v11, v0 +; GFX11-NEXT: v_lshrrev_b16 v2, v2, v12 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -3877,11 +3871,9 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 18f6be7fd3e40..ea6b3a3ad7866 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3623,11 +3623,9 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 7, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 From e65e5b6a0208ba4ef14c28931e9956c69c94f6c0 Mon Sep 17 00:00:00 2001 From: Yu Li Date: Tue, 9 Sep 2025 10:09:48 +0000 Subject: [PATCH 5/8] formatting --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 ++++-- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 15 ++++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 46e099e092f54..029375b17e5e6 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -348,9 +348,11 @@ class CombinerHelper { bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) const; /// Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (shift x, (C1 + C2)) - bool matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI, MachineInstr &TruncMI) const; + bool matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, + MachineInstr &ShiftMI, + MachineInstr &TruncMI) const; void applyLshrOfTruncOfLshr(MachineInstr &MI, - LshrOfTruncOfLshr &MatchInfo) const; + LshrOfTruncOfLshr &MatchInfo) const; /// Transform a multiply by a power-of-2 value to a left shift. 
bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3527b92bcb9f4..6cc0c86bbff4c 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2094,8 +2094,10 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, return true; } -bool CombinerHelper::matchLshrOfTruncOfLshr( - MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI, MachineInstr &TruncMI) const { +bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, + LshrOfTruncOfLshr &MatchInfo, + MachineInstr &ShiftMI, + MachineInstr &TruncMI) const { unsigned ShiftOpcode = MI.getOpcode(); assert(ShiftOpcode == TargetOpcode::G_LSHR); @@ -2122,7 +2124,7 @@ bool CombinerHelper::matchLshrOfTruncOfLshr( MatchInfo.Src = ShiftMI.getOperand(1).getReg(); MatchInfo.ShiftAmt = N1C + N001C; MatchInfo.ShiftAmtTy = MRI.getType(N001); - MatchInfo.InnerShiftTy = InnerShiftTy; + MatchInfo.InnerShiftTy = InnerShiftTy; if ((N001C + OpSizeInBits) == InnerShiftSize) return true; @@ -2143,9 +2145,12 @@ void CombinerHelper::applyLshrOfTruncOfLshr( Register Dst = MI.getOperand(0).getReg(); auto ShiftAmt = Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt); - auto Shift = Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt); + auto Shift = + Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt); if (MatchInfo.Mask == true) { - APInt MaskVal = APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), MatchInfo.MaskVal.getZExtValue()); + APInt MaskVal = + APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), + MatchInfo.MaskVal.getZExtValue()); auto Mask = Builder.buildConstant(MatchInfo.ShiftAmtTy, MaskVal); auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask); Builder.buildTrunc(Dst, And); From 2deba8306749db4448db06f3c4f26c26a0e94428 Mon Sep 17 00:00:00 2001 From: Yu Li Date: Tue, 9 Sep 2025 10:14:07 +0000 Subject: [PATCH 6/8] cleaned up comments --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +-- llvm/include/llvm/Target/GlobalISel/Combine.td | 10 ++-------- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 3 +-- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 029375b17e5e6..5a1ff3d128bfb 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -349,8 +349,7 @@ class CombinerHelper { /// Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (shift x, (C1 + C2)) bool matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, - MachineInstr &ShiftMI, - MachineInstr &TruncMI) const; + MachineInstr &ShiftMI) const; void applyLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index b0c8315d19096..204e1f6887fa2 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -398,18 +398,12 @@ def commute_shift : GICombineRule< // Fold (lshr (trunc (lshr x, C1)), C2) -> trunc (lshr x, (C1 + C2)) def lshr_of_trunc_of_lshr_matchdata : GIDefMatchData<"LshrOfTruncOfLshr">; -//def lshr_of_trunc_of_lshr : GICombineRule< -// (defs root:$root, lshr_of_trunc_of_lshr_matchdata:$matchinfo), -// 
(match (G_LSHR $dst, $x, $y):$root, -// [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}); }]), -// (apply [{ Helper.applyLshrOfTruncOfLshr(*${root}, ${matchinfo}); }])>; - def lshr_of_trunc_of_lshr : GICombineRule< (defs root:$root, lshr_of_trunc_of_lshr_matchdata:$matchinfo), (match (G_LSHR $d1, $x, $y):$Shift, - (G_TRUNC $d2, $d1):$Trunc, + (G_TRUNC $d2, $d1), (G_LSHR $dst, $d2, $z):$root, - [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}, *${Shift}, *${Trunc}); }]), + [{ return Helper.matchLshrOfTruncOfLshr(*${root}, ${matchinfo}, *${Shift}); }]), (apply [{ Helper.applyLshrOfTruncOfLshr(*${root}, ${matchinfo}); }])>; def narrow_binop_feeding_and : GICombineRule< diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 6cc0c86bbff4c..6593892f2d5c1 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2096,8 +2096,7 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, - MachineInstr &ShiftMI, - MachineInstr &TruncMI) const { + MachineInstr &ShiftMI) const { unsigned ShiftOpcode = MI.getOpcode(); assert(ShiftOpcode == TargetOpcode::G_LSHR); From 5059b2fc67014353b5902895ea01dd9a52240095 Mon Sep 17 00:00:00 2001 From: Yu Li Date: Wed, 10 Sep 2025 10:20:27 +0000 Subject: [PATCH 7/8] changes based on comments --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 6593892f2d5c1..09f9c23f980ba 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2097,8 +2097,8 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI) const { - unsigned ShiftOpcode = MI.getOpcode(); - assert(ShiftOpcode == TargetOpcode::G_LSHR); + //unsigned ShiftOpcode = MI.getOpcode(); + assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); Register N0 = MI.getOperand(1).getReg(); Register N1 = MI.getOperand(2).getReg(); @@ -2138,8 +2138,8 @@ bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, void CombinerHelper::applyLshrOfTruncOfLshr( MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const { - unsigned ShiftOpcode = MI.getOpcode(); - assert(ShiftOpcode == TargetOpcode::G_LSHR); + //unsigned ShiftOpcode = MI.getOpcode(); + assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); Register Dst = MI.getOperand(0).getReg(); auto ShiftAmt = @@ -2150,7 +2150,7 @@ void CombinerHelper::applyLshrOfTruncOfLshr( APInt MaskVal = APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), MatchInfo.MaskVal.getZExtValue()); - auto Mask = Builder.buildConstant(MatchInfo.ShiftAmtTy, MaskVal); + auto Mask = Builder.buildConstant(MatchInfo.InnerShiftTy, MaskVal); auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask); Builder.buildTrunc(Dst, And); } else From 7824c6e67164a2ff96b01aeef8a2a10c94f58e9c Mon Sep 17 00:00:00 2001 From: Yu Li Date: Wed, 10 Sep 2025 10:21:18 +0000 Subject: [PATCH 8/8] formatting --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 
b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 09f9c23f980ba..0ebee2cfd8688 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2097,7 +2097,6 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo, MachineInstr &ShiftMI) const { - //unsigned ShiftOpcode = MI.getOpcode(); assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); Register N0 = MI.getOperand(1).getReg(); @@ -2138,7 +2137,6 @@ bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, void CombinerHelper::applyLshrOfTruncOfLshr( MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const { - //unsigned ShiftOpcode = MI.getOpcode(); assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); Register Dst = MI.getOperand(0).getReg();