-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Generate s_lshl?_add_u32 #167032
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: None (LU-JOHN) Changes: Generate s_lshl?_add_u32 through SDAG. Patch is 84.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167032.diff 16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c74c654d8e35..9701079bb2761 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7635,6 +7635,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
// Handle some special cases
switch (Opcode) {
default:
@@ -7872,7 +7874,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest0 = Inst.getOperand(0);
MachineOperand &Dest1 = Inst.getOperand(1);
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7897,7 +7898,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
}
return;
+ case AMDGPU::S_LSHL1_ADD_U32:
+ case AMDGPU::S_LSHL2_ADD_U32:
+ case AMDGPU::S_LSHL3_ADD_U32:
+ case AMDGPU::S_LSHL4_ADD_U32: {
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
+ : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+ : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+ : 4);
+
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+ .add(Src0)
+ .addImm(ShiftAmt)
+ .add(Src1);
+ legalizeOperands(*NewInstr, MDT);
+ MRI.replaceRegWith(Dest.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+ Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
case AMDGPU::S_CSELECT_B32:
case AMDGPU::S_CSELECT_B64:
lowerSelect(Worklist, Inst, MDT);
@@ -7994,7 +8022,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
case AMDGPU::S_CVT_HI_F32_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.useRealTrue16Insts()) {
@@ -8024,7 +8051,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
@@ -8042,7 +8068,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8066,7 +8091,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..93cfd5ab3750c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2012,6 +2012,14 @@ let AddedComplexity = 20 in {
>;
}
+let SubtargetPredicate = isGFX9Plus in
+foreach I = 1-4 in {
+def : GCNPat <
+ (i32 (UniformBinFrag<add> (shl_oneuse i32:$src0, (i32 I)), i32:$src1)),
+ (!cast<SOP2_Pseudo>("S_LSHL"#I#"_ADD_U32") $src0, $src1)
+>;
+}
+
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index 06150e4277e9a..7669ae21f6635 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -51,10 +51,8 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
@@ -69,8 +67,7 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
; GFX942-LABEL: test_alloca_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_mov_b32 s32, 16
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: v_mov_b32_e32 v0, 0
@@ -211,15 +208,13 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
-; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: scratch_store_b32 off, v0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -232,8 +227,7 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX942-LABEL: test_alloca_and_call_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
; GFX942-NEXT: s_getpc_b64 s[0:1]
@@ -396,14 +390,12 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: s_mov_b32 s32, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
-; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: v_mov_b32_e32 v40, 0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -417,8 +409,7 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX942-LABEL: test_call_and_alloca_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
; GFX942-NEXT: s_getpc_b64 s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 9a4040a25419a..49977a4c64784 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s2, s2, s3
-; GFX9-NEXT: s_lshl_b32 s2, s2, 2
-; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
@@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index c5db7a33f70e0..9f2001d452fe3 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -13,8 +13,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -53,12 +52,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -88,13 +86,12 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i
; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10
@@ -137,12 +134,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -178,8 +174,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -218,12 +213,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0
-; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
-; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15
; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
@@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
@@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0
-; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff
-; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
@@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2
; GFX11-SDAG-NEXT: s_endpgm
; GFX11-SDAG-NEXT: .LBB7_6:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index b0e6752386285..e01cb79382c05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GF...
[truncated]
|
| ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s | ||
|
|
||
| define amdgpu_kernel void @lshl1_add(ptr addrspace(5) %alloca) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not an alloca (and it's likely this may be made a verifier error some day), so can you use a different pointer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed variable name and addrspace.
| foreach I = 1-4 in { | ||
| def : GCNPat < | ||
| (i32 (UniformBinFrag<add> (shl_oneuse i32:$src0, (i32 I)), i32:$src1)), | ||
| (!cast<SOP2_Pseudo>("S_LSHL"#I#"_ADD_U32") $src0, $src1) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is already the same pattern on the instruction definition, so why isn't that one working? Shouldn't need to repeat this here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That pattern was explicitly disabled for SDAG: https://reviews.llvm.org/D74942
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So can just remove that then
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed the new pattern and enabled the old pattern for SelectionDAG. The same results are generated for shlN_add.ll with the old and new patterns. With the old pattern enabled, the check-libc-amdgcn-amd-amdhsa target passes, with this distribution of generated s_lshl?_add instructions:
200 S_LSHL1_ADD_U32
96 S_LSHL2_ADD_U32
322 S_LSHL3_ADD_U32
24 S_LSHL4_ADD_U32
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
2f1bd6a to
59e6e42
Compare
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/20832 Here is the relevant piece of the build log for reference: |
Generate s_lshl?_add_u32 through SDAG.