-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU][GlobalISel] Add register bank legalization for G_FADD #163407
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Abhinav Garg (abhigargrepo) ChangesThis patch adds register bank legalization support for G_FADD opcodes in the AMDGPU GlobalISel pipeline. Full diff: https://github.com/llvm/llvm-project/pull/163407.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 540756653dd22..198ee6b73b0b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -837,6 +837,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
+ case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
@@ -960,6 +961,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
@@ -1092,6 +1094,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80c810ef..9cf0c52717318 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -904,9 +904,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
- .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
addRulesForGOpcs({G_FPTOUI})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 93e0efda77fdd..1cf9ae2e226ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
V4S32,
UniV2S16,
+ UniV2S32,
DivV2S16,
+ DivV2S32,
// B types
B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
UniInVcc,
UniInVgprS16,
UniInVgprS32,
+ UniInVgprS64,
UniInVgprV2S16,
+ UniInVgprV2S32,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
new file mode 100644
index 0000000000000..ec221496f450c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+
+define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
+; GFX11-SDAG-FAKE16-LABEL: fadd_s16_uniform:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
+; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-TRUE16-LABEL: fadd_s16_uniform:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
+; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-FAKE16-LABEL: fadd_s16_uniform:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-TRUE16-LABEL: fadd_s16_uniform:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd half %a, %b
+ ret half %fadd
+}
+
+define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
+; GFX11-SDAG-FAKE16-LABEL: fadd_s16_div:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-TRUE16-LABEL: fadd_s16_div:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-FAKE16-LABEL: fadd_s16_div:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-GISEL-TRUE16-LABEL: fadd_s16_div:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: fadd_s16_div:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: fadd_s16_div:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: fadd_s16_div:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: fadd_s16_div:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
+ %fadd = fadd half %a, %b
+ ret half %fadd
+}
+
+define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
+; GFX11-LABEL: fadd_s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd float %a, %b
+ ret float %fadd
+}
+
+define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
+; GCN-LABEL: fadd_s32_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %fadd = fadd float %a, %b
+ ret float %fadd
+}
+
+define amdgpu_ps double @fadd_s64_uniform(double inreg %a, double inreg %b) {
+; GFX11-LABEL: fadd_s64_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s64_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd double %a, %b
+ ret double %fadd
+}
+
+define amdgpu_ps double @fadd_s64_div(double %a, double %b) {
+; GFX11-LABEL: fadd_s64_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s64_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd double %a, %b
+ ret double %fadd
+}
+
+define <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX11-LABEL: fadd_v2s16_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v0, s0, s1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x half> %a, %b
+ ret <2 x half> %fadd
+}
+
+define <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
+; GFX11-LABEL: fadd_v2s16_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s16_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x half> %a, %b
+ ret <2 x half> %fadd
+}
+
+define <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: fadd_v2s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
+; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_f32 s0, s0, s2
+; GFX12-NEXT: s_add_f32 s1, s1, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x float> %a, %b
+ ret <2 x float> %fadd
+}
+
+define <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
+; GFX11-LABEL: fadd_v2s32_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fadd_v2s32_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %fadd = fadd <2 x float> %a, %b
+ ret <2 x float> %fadd
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL: {{.*}}
+; GFX11-SDAG: {{.*}}
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}
|
| ; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can be improved, need combine for copy + trunc + readanylane + anyext src->copy src
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added that combine logic
| .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) | ||
| .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) | ||
| .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) | ||
| .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe do scalarize for hasSALUFloat?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
|
Not sure what happened here but it is not reviewable, need to rebase your commits on top of main |
6313a6b to
f734a0c
Compare
| LLT DstTy = MRI.getType(Dst); | ||
| assert(DstTy == V2S16); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| LLT DstTy = MRI.getType(Dst); | |
| assert(DstTy == V2S16); | |
| assert(MRI.getType(Dst) == V2S16); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
| auto Val1_Hi = B.buildTrunc(SgprRB_S16, Val1_Hi_32); | ||
| auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Val0_Lo, Val1_Lo}, Flags); | ||
| auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Val0_Hi, Val1_Hi}, Flags); | ||
| B.buildMergeLikeInstr(Dst, {Lo.getReg(0), Hi.getReg(0)}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| B.buildMergeLikeInstr(Dst, {Lo.getReg(0), Hi.getReg(0)}); | |
| B.buildMergeLikeInstr(Dst, {Lo, Hi}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
| auto [Val0_Lo_32, Val0_Hi_32] = unpackAExt(MI.getOperand(1).getReg()); | ||
| auto [Val1_Lo_32, Val1_Hi_32] = unpackAExt(MI.getOperand(2).getReg()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Small naming suggestion to keep these similar to surrounding code, use Op1 Op2 instead of Val0 and Val1
for example Val0_Lo_32 -> Op1Lo32
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing it.
…plement combine logic for ReadAnyLane + Trunc + AnyExt.
f734a0c to
cda47e5
Compare
| define amdgpu_ps double @fadd_s64_uniform(double inreg %a, double inreg %b) { | ||
| ; GFX11-LABEL: fadd_s64_uniform: | ||
| ; GFX11: ; %bb.0: | ||
| ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], s[2:3] | ||
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | ||
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 | ||
| ; GFX11-NEXT: ; return to shader part epilog | ||
| ; | ||
| ; GFX12-LABEL: fadd_s64_uniform: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[2:3] | ||
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | ||
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v1 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: ; return to shader part epilog | ||
| %fadd = fadd double %a, %b | ||
| ret double %fadd | ||
| } | ||
|
|
||
| define amdgpu_ps double @fadd_s64_div(double %a, double %b) { | ||
| ; GFX11-LABEL: fadd_s64_div: | ||
| ; GFX11: ; %bb.0: | ||
| ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] | ||
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | ||
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 | ||
| ; GFX11-NEXT: ; return to shader part epilog | ||
| ; | ||
| ; GFX12-LABEL: fadd_s64_div: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] | ||
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | ||
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 | ||
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v1 | ||
| ; GFX12-NEXT: ; return to shader part epilog | ||
| %fadd = fadd double %a, %b | ||
| ret double %fadd | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Small nit, amdgpu_ps returns double in sgprs, so just to get rid of v_readfirstlane_b32 since they are not globalisel related it would make sense to me to to do store of %fadd here instead of return;
So just change two tests with double to something like
define amdgpu_ps void@fadd_s64_div(double %a, double %b, ptr addrspace(1) %out,) {
%fadd = fadd double %a, %b
store double %fadd , ptr addrspace(1) %out
ret void
}
| void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { | ||
| Register Dst = MI.getOperand(0).getReg(); | ||
| assert(MRI.getType(Dst) == V2S16); | ||
| auto [Op0Lo32, Op0Hi32] = unpackAExt(MI.getOperand(1).getReg()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Op1
Op2
…163407) This patch adds register bank legalization support for G_FADD opcodes in the AMDGPU GlobalISel pipeline. Added new reg bank type UniInVgprS64. This patch also adds a combine logic for ReadAnyLane + Trunc + AnyExt. --------- Co-authored-by: Abhinav Garg <abhigarg@amd.com>
This patch adds register bank legalization support for G_FADD opcodes in the AMDGPU GlobalISel pipeline.
Added new reg bank type UniInVgprS64.