[AMDGPU] Make rotr illegal #166558
Conversation
fshr is already legal and is strictly more powerful than rotr, so we should only need selection patterns for fshr.
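To make the "strictly more powerful" claim concrete: rotr(x, s) is just fshr(x, x, s), so once ROTR is marked Expand the generic legalizer can route any remaining rotates through the already-legal FSHR node. A standalone sketch of the equivalence (reference semantics only, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

// Reference semantics of a 32-bit funnel shift right: conceptually
// concatenate Hi:Lo into 64 bits, shift right by Amt % 32, keep the low word.
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt &= 31;
  return Amt ? (Lo >> Amt) | (Hi << (32 - Amt)) : Lo;
}

// A rotate right is the special case where both funnel inputs are the same
// value, so every rotr selection pattern is subsumed by the fshr one.
static uint32_t rotr32(uint32_t X, uint32_t Amt) { return fshr32(X, X, Amt); }

int main() {
  for (uint32_t Amt = 0; Amt < 64; ++Amt)
    assert(rotr32(0x12345678u, Amt) ==
           ((0x12345678u >> (Amt & 31)) | (0x12345678u << ((32 - Amt) & 31))));
  return 0;
}
```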
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
Changes: fshr is already legal and is strictly more powerful than rotr, so we should only need selection patterns for fshr.
Full diff: https://github.com/llvm/llvm-project/pull/166558.diff
7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a628be08..7f942fdd95211 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -502,9 +502,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// The hardware supports 32-bit FSHR, but not FSHL.
setOperationAction(ISD::FSHR, MVT::i32, Legal);
- // The hardware supports 32-bit ROTR, but not ROTL.
- setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index bd443b5b6f1e6..ddcb431f39a87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -806,12 +806,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt rc:$addr)
>;
-// rotr pattern
-class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
- (rotr i32:$src0, i32:$src1),
- (BIT_ALIGN $src0, $src0, $src1)
->;
-
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index dadc7dcd7054a..a2e3ecef1c206 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -505,7 +505,6 @@ def : AMDGPUPat <
(fshr i32:$src0, i32:$src1, i32:$src2),
(BIT_ALIGN_INT_eg $src0, $src1, $src2)
>;
-def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
def FMA_eg : FMA_Common<0x7>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6f1feb1dc2996..a432d297da595 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2663,8 +2663,6 @@ def : AMDGPUPat <
let True16Predicate = NotHasTrue16BitInsts in {
let SubtargetPredicate = isNotGFX9Plus in {
-def : ROTRPattern <V_ALIGNBIT_B32_e64>;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -2675,14 +2673,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
} // isNotGFX9Plus
let SubtargetPredicate = isGFX9GFX10 in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
def : GCNPat<pat,
@@ -2704,15 +2694,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- (EXTRACT_SUBREG $src1, lo16),
- /* clamp */ 0, /* op_sel */ 0)
->;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2731,14 +2712,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = UseRealTrue16Insts
let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index aab035f811434..0b8acee74eacb 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
+; XFAIL: *
; CHECK: {{^}}test:
; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 0741cb256cc24..75263683371be 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -353,7 +353,7 @@ define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040706
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -361,8 +361,9 @@ define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x5040706
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 28330bfc9bb69..acf999e586a68 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1470,21 +1470,20 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
;
; EG-LABEL: s_shl_inline_imm_1_i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y,
-; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44)
-; EG-NEXT: ASHR T1.W, PS, literal.x,
-; EG-NEXT: LSHL * T0.W, 1, PV.W,
+; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Y, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, 0.0, PS,
+; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x,
+; EG-NEXT: LSHL * T0.W, 1, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 1, %a
 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040706
These are just more instances of a problem introduced by #70240 and already noted here:
; FIXME: produce v_alignbit_b32 v2, v2, s0, 24 instead of v_perm
Do you have a plan for these? It feels like you're going to end up adding fshr peepholes to replace the rotr ones?
Do you have a plan for these?
My plan is to hope @jrbyrnes picks it up :) At the moment I don't understand why his #70240 removed this check of yours:
// Check that we haven't just recreated the same FSHR node.
It feels like you're going to end up adding fshr peepholes to replace the rotr ones?
I don't think there are any peepholes. It is just that PerformDAGCombine's FSHR case calls matchPERM but the ROTR case does not.
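To illustrate what matchPERM picks up in the FSHR case (a standalone sketch of the arithmetic only, not the actual LLVM combine): a byte-aligned rotate like the one in permute_i8.ll is just a byte permutation, which is why v_alignbit_b32 v0, v0, v0, 16 and v_perm_b32 v0, v0, v0, 0x5040706 compute the same value. The helper below only models plain byte-index selectors; since both sources are the same register here, it doesn't matter which operand supplies the low dword.

```cpp
#include <cassert>
#include <cstdint>

// Plain 32-bit rotate right by 16, i.e. what v_alignbit_b32 v0, v0, v0, 16 did.
static uint32_t rotr16(uint32_t X) { return (X >> 16) | (X << 16); }

// v_perm_b32-style byte select with both sources equal to X. Each selector
// byte indexes into the 8 bytes of {src0, src1}; only byte indices 0-7 are
// modelled here, not the special selector values.
static uint32_t permSelf(uint32_t X, uint32_t Sel) {
  uint8_t Bytes[8];
  for (int I = 0; I < 4; ++I)
    Bytes[I] = Bytes[I + 4] = (X >> (8 * I)) & 0xff; // both sources are X
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I)
    Result |= uint32_t(Bytes[(Sel >> (8 * I)) & 0x7]) << (8 * I);
  return Result;
}

int main() {
  uint32_t X = 0x11223344;
  // Selector 0x05040706 picks bytes [2,3,0,1] (low to high): a rotate by 16.
  assert(rotr16(X) == 0x33441122);
  assert(permSelf(X, 0x05040706) == 0x33441122);
  return 0;
}
```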
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
 ; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
+; XFAIL: *
I don't know what happened in this test since I don't speak R600. The full diff (for redwood) is:
.type test,@function
test: ; @test
; %bb.0: ; %entry
- ALU 12, @4, KC0[CB0:0-32], KC1[]
+ ALU 9, @4, KC0[CB0:0-32], KC1[]
MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
CF_END
PAD
ALU clause starting at 4:
- ADD_INT T0.Y, KC0[3].X, 1,
- ADD_INT T0.Z, KC0[3].Y, 1,
- ADD_INT T0.W, KC0[2].Z, 1,
- ADD_INT * T1.W, KC0[2].W, 1,
- BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z,
- BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z,
- BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z,
- BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z,
- OR_INT T0.W, PV.W, PV.Z,
- OR_INT * T1.W, PV.Y, PV.X,
- OR_INT T0.X, PS, PV.W,
+ ADD_INT T0.Y, KC0[2].W, 1,
+ ADD_INT T0.Z, KC0[2].Z, 1,
+ ADD_INT T0.W, KC0[3].Y, 1,
+ ADD_INT * T1.W, KC0[3].X, 1,
+ OR_INT T0.W, PS, PV.W,
+ OR_INT * T1.W, PV.Z, PV.Y,
+ OR_INT * T0.W, PS, PV.W,
+ BIT_ALIGN_INT T0.X, PV.W, PV.W, KC0[3].Z,
LSHR * T1.X, KC0[2].Y, literal.x,
2(2.802597e-45), 0(0.000000e+00)
.Lfunc_end0:
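For what it's worth, one plausible reading of the redwood diff above (an outside guess, not verified against the R600 combines): rotating each operand by the same amount and then ORing is the same as ORing first and rotating once, so expressing the rotates as fshr apparently lets the combiner reassociate the four BIT_ALIGN_INTs into the single trailing one. The identity itself is easy to check:

```cpp
#include <cassert>
#include <cstdint>

// rotr distributes over OR for a common shift amount:
// rotr(a, s) | rotr(b, s) == rotr(a | b, s), because a rotate is just a
// fixed permutation of bits.
static uint32_t rotr32(uint32_t X, uint32_t Amt) {
  Amt &= 31;
  return Amt ? (X >> Amt) | (X << (32 - Amt)) : X;
}

int main() {
  uint32_t A = 0xdeadbeef, B = 0x01234567, C = 0x89abcdef, D = 0x0f0f0f0f;
  for (uint32_t S = 0; S < 32; ++S)
    assert((rotr32(A, S) | rotr32(B, S) | rotr32(C, S) | rotr32(D, S)) ==
           rotr32(A | B | C | D, S));
  return 0;
}
```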
Just regenerate?
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=EG
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefixes=CM
Yeah, I guess I will convert it to autogenerated checks.