[AMDGPU] Add intrinsic exposing s_alloc_vgpr #163951
Conversation
Make it possible to use `s_alloc_vgpr` at the IR level. This is a huge footgun, and using it for anything other than compiler-internal purposes is heavily discouraged. The calling code must make sure it does not allocate fewer VGPRs than it actually needs - the intrinsic is NOT a request for the backend to limit the number of VGPRs it uses. (In essence this is not so different from what we already do with the dynamic VGPR flags of the `amdgcn.cs.chain` intrinsic; it just makes that functionality usable in other scenarios.)
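For illustration only, here is a minimal sketch of what a call site might look like at the IR level. The function name, the requested count, and the branch structure are made up for this example and are not part of the patch:

```llvm
; Ask for at least 64 VGPRs for the current wave. The caller has already made
; sure that nothing below this point needs more VGPRs than it requested.
define amdgpu_cs void @grow_vgprs_example() {
entry:
  ; Returns true if the allocation succeeded, false otherwise (uniform value).
  %ok = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 64)
  br i1 %ok, label %wide, label %fallback

wide:                                             ; take the VGPR-hungry path
  ret void

fallback:                                         ; keep working within the old budget
  ret void
}

declare i1 @llvm.amdgcn.s.alloc.vgpr(i32)
```

In practice the function would also have to run in dynamic-VGPR mode, which the tests in this patch arrange via the dynamic-VGPR block-size attribute.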
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-analysis

Author: Diana Picus (rovka)

Changes: Make it possible to use `s_alloc_vgpr` at the IR level.

Full diff: https://github.com/llvm/llvm-project/pull/163951.diff

7 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ded00b1274670..9bb305823e932 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -391,6 +391,17 @@ def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic;
+// Force the VGPR allocation of the current wave to (at least) the given value.
+// The actual number of allocated VGPRs may be rounded up to match hardware
+// block boundaries.
+// It is the responsibility of the calling code to ensure it does not allocate
+// below the VGPR requirements of the current shader.
+def int_amdgcn_s_alloc_vgpr :
+ Intrinsic<
+ [llvm_i1_ty], // Returns true if the allocation succeeded, false otherwise.
+ [llvm_i32_ty], // The number of VGPRs to allocate.
+ [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
// 1st parameter: Numerator
// 2nd parameter: Denominator
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c7344426..2f9c87cb5f20e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2331,6 +2331,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_alloc_vgpr: {
+ // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
+ // SCC. We then need to COPY it into the result vreg.
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register ResReg = I.getOperand(0).getReg();
+
+ MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
+ .add(I.getOperand(2));
+ MachineInstr *CopyMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
+ .addReg(AMDGPU::SCC);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI) &&
+ RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
+ }
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a475537d..dda73f13f7487 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5359,6 +5359,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
break;
+ case Intrinsic::amdgcn_s_alloc_vgpr:
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2393346839707..b82b2416a57f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -409,6 +409,7 @@ def : AlwaysUniform<int_amdgcn_cluster_workgroup_max_flat_id>;
def : AlwaysUniform<int_amdgcn_workgroup_id_x>;
def : AlwaysUniform<int_amdgcn_workgroup_id_y>;
def : AlwaysUniform<int_amdgcn_workgroup_id_z>;
+def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>;
def : AlwaysUniform<int_amdgcn_s_getpc>;
def : AlwaysUniform<int_amdgcn_s_getreg>;
def : AlwaysUniform<int_amdgcn_s_memrealtime>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..9496087aec20c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in {
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = isGFX12Plus in {
- let hasSideEffects = 1, Defs = [SCC] in {
- def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">;
+ let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in {
+ def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr",
+ [(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))]
+ >;
}
} // End SubtargetPredicate = isGFX12Plus
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 9ff670bee0f89..3f56f12f3cb34 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -183,6 +183,15 @@ define void @cluster_workgroup_max_flat_id(ptr addrspace(1) inreg %out) {
ret void
}
+; CHECK-LABEL: for function 's_alloc_vgpr':
+; CHECK: ALL VALUES UNIFORM
+define void @s_alloc_vgpr(i32 inreg %n, ptr addrspace(1) inreg %out) {
+ %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
+ %sel = select i1 %scc, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
; CHECK-LABEL: for function 's_memtime':
; CHECK: ALL VALUES UNIFORM
define void @s_memtime(ptr addrspace(1) inreg %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll b/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
new file mode 100644
index 0000000000000..74c42b7bffd04
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=DAGISEL
+
+declare i1 @llvm.amdgcn.s.alloc.vgpr(i32)
+
+define amdgpu_cs void @test_alloc_vreg_const(ptr addrspace(1) %out) #0 {
+; GISEL-LABEL: test_alloc_vreg_const:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_alloc_vgpr 45
+; GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_and_b32 s0, s0, 1
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL-LABEL: test_alloc_vreg_const:
+; DAGISEL: ; %bb.0: ; %entry
+; DAGISEL-NEXT: s_alloc_vgpr 45
+; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
+; DAGISEL-NEXT: s_endpgm
+entry:
+ %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 45)
+ %sel = select i1 %scc, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_alloc_vreg_var(i32 inreg %n, ptr addrspace(1) %out) #0 {
+; GISEL-LABEL: test_alloc_vreg_var:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_alloc_vgpr s0
+; GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_and_b32 s0, s0, 1
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL-LABEL: test_alloc_vreg_var:
+; DAGISEL: ; %bb.0: ; %entry
+; DAGISEL-NEXT: s_alloc_vgpr s0
+; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
+; DAGISEL-NEXT: s_endpgm
+entry:
+ %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
+ %sel = select i1 %scc, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
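A note on the rounding behaviour mentioned in the intrinsic's comment: the requested count may be rounded up to the next hardware block boundary. Assuming the block size of 16 used in the test's attribute, the `s_alloc_vgpr 45` in the test would effectively reserve 48 VGPRs. A hypothetical way to compute that effective count, for illustration only:

```llvm
; Round %n up to the next multiple of the 16-VGPR block size: (%n + 15) & -16.
; For %n = 45 this yields 48.
define i32 @effective_vgpr_count(i32 %n) {
  %biased = add i32 %n, 15
  %rounded = and i32 %biased, -16
  ret i32 %rounded
}
```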