-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU/GlobalISel: RegBankLegalize rules for G_FABS and G_FNEG #168411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/petar-avramovic/readanylane-combine
Are you sure you want to change the base?
AMDGPU/GlobalISel: RegBankLegalize rules for G_FABS and G_FNEG #168411
Conversation
|
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Petar Avramovic (petar-avramovic). Changes — Patch is 21.42 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168411.diff — 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 1765d054a3c0d..d719f3d40295d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -629,10 +629,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == V2S16);
- auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
- auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
unsigned Opc = MI.getOpcode();
auto Flags = MI.getFlags();
+
+ if (MI.getNumOperands() == 2) {
+ auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
+ auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
+ auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return;
+ }
+
+ assert(MI.getNumOperands() == 3);
+ auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
+ auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index b81a08de383d9..4051dc8495f6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -951,6 +951,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
+ // FNEG and FABS are either folded as source modifiers or can be selected as
+ // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
+ // targets without SALU float we still select them as VGPR since there would
+ // be no real sgpr use.
+ addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
+ .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
+ .Div(S16, {{Vgpr16}, {Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
+ .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
+ .Div(S32, {{Vgpr32}, {Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
+
addRulesForGOpcs({G_FPTOUI})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll
new file mode 100644
index 0000000000000..093cdf744e3b4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
+
+define amdgpu_ps void @v_fabs_f16(half %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT: global_store_b16 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call half @llvm.fabs.f16(half %in)
+ store half %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f16(half inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f16_e64 v2, |s0|, |s0|
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_add_f16 s0, s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call half @llvm.fabs.f16(half %in)
+ %fadd = fadd half %fabs, %fabs
+ store half %fadd, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ store float %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f32(float inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f32_e64 v2, |s0|, |s0|
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset0_b32 s0, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_add_f32 s0, s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fabs = call float @llvm.fabs.f32(float %in)
+ %fadd = fadd float %fabs, %fabs
+ store float %fadd, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_f64(double %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
+ %fabs = call double @llvm.fabs.f64(double %in)
+ store double %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_f64(double inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[2:3], |s[0:1]|, |s[0:1]|
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fabs_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e64 v[2:3], |s[0:1]|, |s[0:1]|
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fabs = call double @llvm.fabs.f64(double %in)
+ %fadd = fadd double %fabs, %fabs
+ store double %fadd, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_v2f16(<2 x half> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ store <2 x half> %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: s_fabs_v2f16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0x7fff7fff
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_pk_add_f16 v2, s0, s0
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: s_fabs_v2f16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e64 v2, 0x7fff7fff, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_pk_add_f16 v2, v2, v2
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: s_fabs_v2f16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_and_b32 s0, s0, 0x7fff7fff
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_pk_add_f16 v2, s0, s0
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: s_fabs_v2f16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 0x7fff
+; GFX12-GISEL-NEXT: s_add_f16 s0, s0, s0
+; GFX12-GISEL-NEXT: s_add_f16 s1, s1, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_endpgm
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ %fadd = fadd <2 x half> %fabs, %fabs
+ store <2 x half> %fadd, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fabs_v2f32(<2 x float> %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: v_fabs_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_fabs_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_fabs_v2f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_fabs_v2f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-GISEL-NEXT: s_endpgm
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ store <2 x float> %fabs, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: s_fabs_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_add_f32_e64 v3, |s1|, |s1|
+; GFX11-SDAG-NEXT: v_add_f32_e64 v2, |s0|, |s0|
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: s_fabs_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_add_f32_e64 v2, |s0|, |s0|
+; GFX11-GISEL-NEXT: v_add_f32_e64 v3, |s1|, |s1|
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: s_fabs_v2f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_bitset0_b32 s0, 31
+; GFX12-SDAG-NEXT: s_bitset0_b32 s1, 31
+; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s0
+; GFX12-SDAG-NEXT: s_add_f32 s1, s1, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: s_fabs_v2f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT: s_bitset0_b32 s1, 31
+; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s0
+; GFX12-GISEL-NEXT: s_add_f32 s1, s1, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT: s_endpgm
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ %fadd = fadd <2 x float> %fabs, %fabs
+ store <2 x float> %fadd, ptr addrspace(1) %out
+ ret void
+}
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll
new file mode 100644
index 0000000000000..f837c62821951
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
+
+define amdgpu_ps void @v_fneg_f16(half %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: global_store_b16 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg half %in
+ store half %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f16(half inreg %in, half inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f16_e64 v2, -s0, s1
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_mul_f16 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg half %in
+ %fmul = fmul half %fneg, %val
+ store half %fmul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg float %in
+ store float %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f32(float inreg %in, float inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f32_e64 v2, -s0, s1
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg float %in
+ %fmul = fmul float %fneg, %val
+ store float %fmul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_f64(double %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg double %in
+ store double %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_f64(double inreg %in, double inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mul_f64 v[2:3], -s[0:1], s[2:3]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fneg_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mul_f64_e64 v[2:3], -s[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fneg = fneg double %in
+ %fmul = fmul double %fneg, %val
+ store double %fmul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_v2f16(<2 x half> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GCN-NEXT: global_store_b32 v[1:2], v0, off
+; GCN-NEXT: s_endpgm
+ %fneg = fneg <2 x half> %in
+ store <2 x half> %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_v2f16(<2 x half> inreg %in, <2 x half> inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_pk_mul_f16 v2, s0, s1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: s_fneg_v2f16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_mul_f16 v2, s0, s1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: s_fneg_v2f16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX12-GISEL-NEXT: s_xor_b32 s2, s2, 0x8000
+; GFX12-GISEL-NEXT: s_lshr_b32 s3, s1, 16
+; GFX12-GISEL-NEXT: s_mul_f16 s0, s0, s1
+; GFX12-GISEL-NEXT: s_mul_f16 s1, s2, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_endpgm
+ %fneg = fneg <2 x half> %in
+ %fmul = fmul <2 x half> %fneg, %val
+ store <2 x half> %fmul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fneg_v2f32(<2 x float> %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: v_fneg_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: v_fneg_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: v_fneg_v2f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: v_fneg_v2f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX12-GISEL-NEXT: s_endpgm
+ %fneg = fneg <2 x float> %in
+ store <2 x float> %fneg, ptr addrspace(1) %out
+ ret void
+}
+define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, <2 x float> inreg %val, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: s_fneg_v2f32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_mul_f32_e64 v3, -s1, s3
+; GFX11-SDAG-NEXT: v_mul_f32_e64 v2, -s0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: s_fneg_v2f32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_mul_f32_e64 v2, -s0, s2
+; GFX11-GISEL-NEXT: v_mul_f32_e64 v3, -s1, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(N...
[truncated]
|
🐧 Linux x64 Test Results
|
  %fmul = fmul <2 x float> %fneg, %val
  store <2 x float> %fmul, ptr addrspace(1) %out
  ret void
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you also test the fneg + fabs case
  store <2 x float> %fabs, ptr addrspace(1) %out
  ret void
}
define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test is somewhat misleading since it's not a standalone scalar fabs. It doesn't really count if it folds into a source modifier of a VALU instruction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right — switched to being "used by" G_SELECT, since that one is available on SALU for all the types. Added a version that folds ReadAnyLanes, and a salu_use version that requires ReadAnyLane, at least for now.
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();

  if (MI.getNumOperands() == 2) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this here? These don't require splitting to handle?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is for unary opcodes; the old one below was for binary.
3590a6e to
9e70882
Compare
529b6f2 to
73f2bf8
Compare
define amdgpu_ps void @v_fabs_fneg_f32(float %in, ptr addrspace(1) %out) {
; GCN-LABEL: v_fabs_fneg_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
; GCN-NEXT:    global_store_b32 v[1:2], v0, off
; GCN-NEXT:    s_endpgm
  %fabs = call float @llvm.fabs.f32(float %in)
  %fneg = fneg float %fabs
  store float %fneg, ptr addrspace(1) %out
  ret void
}
define amdgpu_ps void @s_fabs_fneg_f32(float inreg %in, ptr addrspace(1) %out) {
; GFX11-LABEL: s_fabs_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_or_b32_e64 v2, 0x80000000, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_fabs_fneg_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_bitset1_b32 s0, 31
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %fabs = call float @llvm.fabs.f32(float %in)
  %fneg = fneg float %fabs
  store float %fneg, ptr addrspace(1) %out
  ret void
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some fneg+fabs tests have been added — what exactly are we trying to test here?

No description provided.