-
Notifications
You must be signed in to change notification settings - Fork 15.5k
AMDGPU/GlobalISel: Regbanklegalize rules for G_UNMERGE_VALUES #171653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) ChangesMove G_UNMERGE_VALUES handling to AMDGPURegBankLegalizeRules.cpp. Full diff: https://github.com/llvm/llvm-project/pull/171653.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index f36935d8c0e8f..f684e830e20fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -186,3 +186,19 @@ void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
.addReg(VgprSrc);
});
}
+
+bool AMDGPU::isBRC(LLT Ty) {
+ if (Ty.isPointer())
+ return true;
+
+ unsigned Size = Ty.getSizeInBits();
+ if (Size % 32 != 0)
+ return false;
+
+ // 32, 2x32, 3x32 ... 12x32, 16x32, 32x32
+ unsigned NumB32s = Size / 32;
+ if ((NumB32s >= 1 && NumB32s <= 12) || NumB32s == 16 || NumB32s == 32)
+ return true;
+
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 5e1000ee0ab26..86af5b8bfd2f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -53,6 +53,10 @@ void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+
+// "Reg Class" low level types. LLTs that fit into some register class on both
+// sgpr and vgpr. LLTs with sizes 32, 2x32, 3x32 ... 12x32, 16x32, 32x32.
+bool isBRC(LLT Ty);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 839120da89711..a1fce93eb01f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -442,8 +442,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
- if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
- Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
+ if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_MERGE_VALUES ||
+ Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index cc31d7d5c55ac..b5a8c7b89e64c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -910,6 +910,55 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
return lowerUnpackAExt(MI);
case WidenMMOToS32:
return widenMMOToS32(cast<GAnyLoad>(MI));
+ case VerifyAllSgpr: {
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i)
+ assert(MRI.getRegBankOrNull(MI.getOperand(i).getReg()) == SgprRB);
+ return true;
+ }
+ case ApplyAllVgpr: {
+ B.setInstr(MI);
+ unsigned NumDefs = MI.getNumDefs();
+
+ for (unsigned i = 0; i < NumDefs; ++i)
+ assert(MRI.getRegBankOrNull(MI.getOperand(i).getReg()) == VgprRB);
+
+ for (unsigned i = NumDefs; i < MI.getNumOperands(); ++i) {
+ Register Reg = MI.getOperand(i).getReg();
+ if (MRI.getRegBank(Reg) != VgprRB) {
+ auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
+ MI.getOperand(i).setReg(Copy.getReg(0));
+ }
+ }
+
+ return true;
+ }
+ case UnmergeToShiftTrunc: {
+ GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
+ LLT Ty = MRI.getType(Unmerge->getSourceReg());
+ if (Ty.getSizeInBits() % 32 != 0) {
+ reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
+ "AMDGPU RegBankLegalize: unmerge not multiple of 32",
+ MI);
+ return false;
+ }
+
+ B.setInstr(MI);
+ if (Ty.getSizeInBits() > 32) {
+ auto Unmerge32 = B.buildUnmerge(SgprRB_S32, Unmerge->getSourceReg());
+ for (unsigned i = 0; i < Unmerge32->getNumDefs(); ++i) {
+ auto [Dst0S32, Dst1S32] = unpackAExt(Unmerge32->getOperand(i).getReg());
+ B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
+ B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
+ }
+ } else {
+ auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
+ B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
+ B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
}
if (!WaterfallSgprs.empty()) {
@@ -1035,6 +1084,11 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
Ty == LLT::fixed_vector(8, 64))
return Ty;
return LLT();
+ case SgprBRC:
+ case VgprBRC:
+ if (isBRC(Ty))
+ return Ty;
+ return LLT();
default:
return LLT();
}
@@ -1069,6 +1123,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
@@ -1108,6 +1163,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprB128:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case Vgpr32SExt:
case Vgpr32ZExt:
return VgprRB;
@@ -1167,6 +1223,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128:
@@ -1176,6 +1233,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case VgprB128:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
@@ -1307,6 +1365,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case SgprB128:
case SgprB256:
case SgprB512:
+ case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128: {
@@ -1341,6 +1400,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case VgprB128:
case VgprB256:
case VgprB512:
+ case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index abe01e752960f..789339f5fc5fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPURegBankLegalizeRules.h"
+#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
@@ -132,6 +133,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
case UniB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
+ case UniBRC:
+ return isBRC(MRI.getType(Reg)) && MUI.isUniform(Reg);
case DivS1:
return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
case DivS16:
@@ -172,6 +175,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
case DivB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
+ case DivBRC:
+ return isBRC(MRI.getType(Reg)) && MUI.isDivergent(Reg);
case _:
return true;
default:
@@ -560,6 +565,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
+ addRulesForGOpcs({G_UNMERGE_VALUES})
+ .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
+ .Any({{DivS16}, {{}, {}, ApplyAllVgpr}})
+ .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
+ .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
+
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
.Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 1ac117304b76f..4aeea206ab5ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -113,6 +113,7 @@ enum UniformityLLTOpPredicateID {
UniB128,
UniB256,
UniB512,
+ UniBRC,
DivB32,
DivB64,
@@ -120,6 +121,7 @@ enum UniformityLLTOpPredicateID {
DivB128,
DivB256,
DivB512,
+ DivBRC
};
// How to apply register bank on register operand.
@@ -156,6 +158,7 @@ enum RegBankLLTMappingApplyID {
SgprB128,
SgprB256,
SgprB512,
+ SgprBRC,
// vgpr scalars, pointers, vectors and B-types
Vgpr16,
@@ -178,6 +181,7 @@ enum RegBankLLTMappingApplyID {
VgprB128,
VgprB256,
VgprB512,
+ VgprBRC,
VgprV4S32,
// Dst only modifiers: read-any-lane and truncs
@@ -233,7 +237,10 @@ enum LoweringMethodID {
SplitLoad,
WidenLoad,
WidenMMOToS32,
- UnpackAExt
+ UnpackAExt,
+ VerifyAllSgpr,
+ ApplyAllVgpr,
+ UnmergeToShiftTrunc
};
enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll
new file mode 100644
index 0000000000000..fb013d35d540b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11 %s
+
+define amdgpu_ps void @unmerge_sgprS16_from_V2S16(ptr addrspace(1) inreg %ptr, ptr addrspace(1) inreg %out) {
+; GFX11-LABEL: unmerge_sgprS16_from_V2S16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
+ %load = load <2 x i16>, ptr addrspace(1) %ptr
+ %shuffle = shufflevector <2 x i16> %load, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i16> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @unmerge_sgprS16_from_V4S16(ptr addrspace(1) inreg %ptr, ptr addrspace(1) inreg %out) {
+; GFX11-LABEL: unmerge_sgprS16_from_V4S16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
+ %load = load <4 x i16>, ptr addrspace(1) %ptr
+ %shuffle = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 3>
+ store <2 x i16> %shuffle, ptr addrspace(1) %out
+ ret void
+}
|
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
| .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}}) | ||
| .Any({{DivS16}, {{}, {}, ApplyAllVgpr}}) | ||
| .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}}) | ||
| .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need tests for DivS16 and DivBRC
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are many in existing files, just moved this to AMDGPURegBankLegalizeRules.cpp, the sgpr s16 is changed. Also sgpr s16 appears in some tests and it was fine to just apply sgpr to all operands since it was combined away. I added test for the case when sgpr s16 g_unmerge was not combined away and actually meant to be inst selected but unlike any of the existing tests, tests in unmerge-sgpr-s16.ll are the cases that crash the compiler.
5aee752 to
9953d40
Compare
| ; | ||
| ; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform: | ||
| ; GFX11-TRUE16: ; %bb.0: | ||
| ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was a bug.
There were _lo and _hi subreg copies, but they got lost at some point since input was a copy from physical sgpr. After that subreg copies looked like 32 bit sgpr copy (there were two but looked the same, so the result of the first was copied to result of second v_mov_b32_e32 v1, v0).
Maybe true 16 is missing assert somewhere?
Also for context, s16 unmerge now only appears on targets with true 16, see isRegisterClassType in AMDGPULegalizerInfo.cpp
| @@ -0,0 +1,36 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |||
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11 %s | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For testing purpose of the crash, try -global-isel or
if you wan to try out -global-isel -new-reg-bank-select replace
.Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}}) with
.Any({{UniS16}, {{}, {}, VerifyAllSgpr}})
need a true16 target because of legalizer
|
LGTM |
Move G_UNMERGE_VALUES handling to AMDGPURegBankLegalizeRules.cpp. Fix sgpr S16 unmerge by lowering using shift and using S32. Previously sgpr S16 unmerge was selected using _lo16 and _hi16 subreg indexes which are exclusive to vgpr register classes. For remaing cases we do trivial mapping, assigns same reg bank to all operands, vgpr or sgpr.
9953d40 to
c299742
Compare
|
ping: |

Move G_UNMERGE_VALUES handling to AMDGPURegBankLegalizeRules.cpp.
Fix sgpr S16 unmerge by lowering using shift and using S32.
Previously sgpr S16 unmerge was selected using _lo16 and _hi16 subreg
indexes which are exclusive to vgpr register classes.
For remaing cases we do trivial mapping, assigns same reg bank
to all operands, vgpr or sgpr.