
Conversation

@petar-avramovic (Collaborator)

No description provided.

petar-avramovic (Collaborator, Author) commented Nov 17, 2025

Warning

This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.

This stack of pull requests is managed by Graphite.

llvmbot (Member) commented Nov 17, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes

Patch is 21.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168411.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+15-2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+19)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll (+233)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll (+216)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 1765d054a3c0d..d719f3d40295d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -629,10 +629,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
 void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
   Register Dst = MI.getOperand(0).getReg();
   assert(MRI.getType(Dst) == V2S16);
-  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
-  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
   unsigned Opc = MI.getOpcode();
   auto Flags = MI.getFlags();
+
+  if (MI.getNumOperands() == 2) {
+    auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
+    auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
+    auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
+    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
+    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
+    B.buildMergeLikeInstr(Dst, {Lo, Hi});
+    MI.eraseFromParent();
+    return;
+  }
+
+  assert(MI.getNumOperands() == 3);
+  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
+  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
   auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
   auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
   auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index b81a08de383d9..4051dc8495f6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -951,6 +951,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
       .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
 
+  // FNEG and FABS are either folded as source modifiers or can be selected as
+  // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
+  // targets without SALU float we still select them as VGPR since there would
+  // be no real sgpr use.
+  addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
+      .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
+      .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
+      .Div(S16, {{Vgpr16}, {Vgpr16}})
+      .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
+      .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
+      .Div(S32, {{Vgpr32}, {Vgpr32}})
+      .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
+      .Div(S64, {{Vgpr64}, {Vgpr64}})
+      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
+      .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
+      .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
+
   addRulesForGOpcs({G_FPTOUI})
       .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
       .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll
new file mode 100644
index 0000000000000..093cdf744e3b4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
+
+define amdgpu_ps void @v_fabs_f16(half %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT:    global_store_b16 v[1:2], v0, off
+; GCN-NEXT:    s_endpgm
+  %fabs = call half @llvm.fabs.f16(half %in)
+  store half %fabs, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fabs_f16(half inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e64 v2, |s0|, |s0|
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_fabs_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT:    s_add_f16 s0, s0, s0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_endpgm
+  %fabs = call half @llvm.fabs.f16(half %in)
+  %fadd = fadd half %fabs, %fabs
+  store half %fadd, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fabs_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    global_store_b32 v[1:2], v0, off
+; GCN-NEXT:    s_endpgm
+  %fabs = call float @llvm.fabs.f32(float %in)
+  store float %fabs, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fabs_f32(float inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f32_e64 v2, |s0|, |s0|
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_fabs_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT:    s_add_f32 s0, s0, s0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_endpgm
+  %fabs = call float @llvm.fabs.f32(float %in)
+  %fadd = fadd float %fabs, %fabs
+  store float %fadd, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fabs_f64(double %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT:    s_endpgm
+  %fabs = call double @llvm.fabs.f64(double %in)
+  store double %fabs, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fabs_f64(double inreg %in, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fabs_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f64 v[2:3], |s[0:1]|, |s[0:1]|
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_fabs_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_add_f64_e64 v[2:3], |s[0:1]|, |s[0:1]|
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_endpgm
+  %fabs = call double @llvm.fabs.f64(double %in)
+  %fadd = fadd double %fabs, %fabs
+  store double %fadd, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fabs_v2f16(<2 x half> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fabs_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GCN-NEXT:    global_store_b32 v[1:2], v0, off
+; GCN-NEXT:    s_endpgm
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  store <2 x half> %fabs, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fabs_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: s_fabs_v2f16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0x7fff7fff
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_pk_add_f16 v2, s0, s0
+; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: s_fabs_v2f16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e64 v2, 0x7fff7fff, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: s_fabs_v2f16:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_and_b32 s0, s0, 0x7fff7fff
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    v_pk_add_f16 v2, s0, s0
+; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: s_fabs_v2f16:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s1, 0x7fff
+; GFX12-GISEL-NEXT:    s_add_f16 s0, s0, s0
+; GFX12-GISEL-NEXT:    s_add_f16 s1, s1, s1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT:    s_endpgm
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  %fadd = fadd <2 x half> %fabs, %fabs
+  store <2 x half> %fadd, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fabs_v2f32(<2 x float> %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: v_fabs_v2f32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_fabs_v2f32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: v_fabs_v2f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX12-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: v_fabs_v2f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX12-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX12-GISEL-NEXT:    s_endpgm
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  store <2 x float> %fabs, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: s_fabs_v2f32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_add_f32_e64 v3, |s1|, |s1|
+; GFX11-SDAG-NEXT:    v_add_f32_e64 v2, |s0|, |s0|
+; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: s_fabs_v2f32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_add_f32_e64 v2, |s0|, |s0|
+; GFX11-GISEL-NEXT:    v_add_f32_e64 v3, |s1|, |s1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: s_fabs_v2f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-SDAG-NEXT:    s_bitset0_b32 s1, 31
+; GFX12-SDAG-NEXT:    s_add_f32 s0, s0, s0
+; GFX12-SDAG-NEXT:    s_add_f32 s1, s1, s1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: s_fabs_v2f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT:    s_bitset0_b32 s1, 31
+; GFX12-GISEL-NEXT:    s_add_f32 s0, s0, s0
+; GFX12-GISEL-NEXT:    s_add_f32 s1, s1, s1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT:    s_endpgm
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  %fadd = fadd <2 x float> %fabs, %fabs
+  store <2 x float> %fadd, ptr addrspace(1) %out
+  ret void
+}
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll
new file mode 100644
index 0000000000000..f837c62821951
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
+
+define amdgpu_ps void @v_fneg_f16(half %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    global_store_b16 v[1:2], v0, off
+; GCN-NEXT:    s_endpgm
+  %fneg = fneg half %in
+  store half %fneg, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fneg_f16(half inreg %in, half inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mul_f16_e64 v2, -s0, s1
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_fneg_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT:    s_mul_f16 s0, s0, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_endpgm
+  %fneg = fneg half %in
+  %fmul = fmul half %fneg, %val
+  store half %fmul, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fneg_f32(float %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    global_store_b32 v[1:2], v0, off
+; GCN-NEXT:    s_endpgm
+  %fneg = fneg float %in
+  store float %fneg, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fneg_f32(float inreg %in, float inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mul_f32_e64 v2, -s0, s1
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_fneg_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_xor_b32 s0, s0, 0x80000000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_endpgm
+  %fneg = fneg float %in
+  %fmul = fmul float %fneg, %val
+  store float %fmul, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fneg_f64(double %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GCN-NEXT:    s_endpgm
+  %fneg = fneg double %in
+  store double %fneg, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fneg_f64(double inreg %in, double inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mul_f64 v[2:3], -s[0:1], s[2:3]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_fneg_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mul_f64_e64 v[2:3], -s[0:1], s[2:3]
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_endpgm
+  %fneg = fneg double %in
+  %fmul = fmul double %fneg, %val
+  store double %fmul, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fneg_v2f16(<2 x half> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: v_fneg_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GCN-NEXT:    global_store_b32 v[1:2], v0, off
+; GCN-NEXT:    s_endpgm
+  %fneg = fneg <2 x half> %in
+  store <2 x half> %fneg, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fneg_v2f16(<2 x half> inreg %in, <2 x half> inreg %val, ptr addrspace(1) %out) {
+; GFX11-LABEL: s_fneg_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_mul_f16 v2, s0, s1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: s_fneg_v2f16:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_pk_mul_f16 v2, s0, s1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: s_fneg_v2f16:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX12-GISEL-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX12-GISEL-NEXT:    s_xor_b32 s2, s2, 0x8000
+; GFX12-GISEL-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX12-GISEL-NEXT:    s_mul_f16 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_mul_f16 s1, s2, s3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-GISEL-NEXT:    s_endpgm
+  %fneg = fneg <2 x half> %in
+  %fmul = fmul <2 x half> %fneg, %val
+  store <2 x half> %fmul, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @v_fneg_v2f32(<2 x float> %in, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: v_fneg_v2f32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_fneg_v2f32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: v_fneg_v2f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: v_fneg_v2f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX12-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX12-GISEL-NEXT:    s_endpgm
+  %fneg = fneg <2 x float> %in
+  store <2 x float> %fneg, ptr addrspace(1) %out
+  ret void
+}
+define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, <2 x float> inreg %val, ptr addrspace(1) %out) {
+; GFX11-SDAG-LABEL: s_fneg_v2f32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v3, -s1, s3
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v2, -s0, s2
+; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: s_fneg_v2f32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, -s0, s2
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v3, -s1, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(N...
[truncated]

github-actions bot commented Nov 17, 2025

🐧 Linux x64 Test Results

  • 186295 tests passed
  • 4849 tests skipped

%fmul = fmul <2 x float> %fneg, %val
store <2 x float> %fmul, ptr addrspace(1) %out
ret void
}
Contributor:

Can you also test the fneg + fabs case?

store <2 x float> %fabs, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
Contributor:
This test is somewhat misleading since it's not a standalone scalar fabs. It doesn't really count if it folds into a source modifier of a VALU instruction?

petar-avramovic (Author):
Right, I switched the test so the result is used by G_SELECT, since that one is available on the SALU for all the types. Added a version where the readanylanes fold away and a salu_use version that still requires readanylane, at least for now.
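
For illustration, a uniform test in that spirit could look like the sketch below; the function name, the i32 condition operand, and the constant are assumptions for this note, not the exact tests added to the PR.

define amdgpu_ps void @s_fabs_f32_salu_use(float inreg %in, i32 inreg %c, ptr addrspace(1) %out) {
  ; Uniform fabs whose only use is a select; select is available on the SALU
  ; for all these types, so the fabs result has a genuine SGPR use.
  %fabs = call float @llvm.fabs.f32(float %in)
  %cond = icmp eq i32 %c, 0
  %sel = select i1 %cond, float %fabs, float 2.0
  store float %sel, ptr addrspace(1) %out
  ret void
}

declare float @llvm.fabs.f32(float)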

unsigned Opc = MI.getOpcode();
auto Flags = MI.getFlags();

if (MI.getNumOperands() == 2) {
Contributor:
Why is this here? These don't require splitting to handle?

petar-avramovic (Author):
It is for unary opcodes; the existing code below handles the binary case.

@petar-avramovic force-pushed the users/petar-avramovic/readanylane-combine branch from 3590a6e to 9e70882 on November 18, 2025 at 13:59
@petar-avramovic force-pushed the users/petar-avramovic/fabs-fneg branch from 529b6f2 to 73f2bf8 on November 18, 2025 at 13:59
Comment on lines +305 to +334
define amdgpu_ps void @v_fabs_fneg_f32(float %in, ptr addrspace(1) %out) {
; GCN-LABEL: v_fabs_fneg_f32:
; GCN: ; %bb.0:
; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: global_store_b32 v[1:2], v0, off
; GCN-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
%fneg = fneg float %fabs
store float %fneg, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @s_fabs_fneg_f32(float inreg %in, ptr addrspace(1) %out) {
; GFX11-LABEL: s_fabs_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_or_b32_e64 v2, 0x80000000, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_fabs_fneg_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_bitset1_b32 s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
%fneg = fneg float %fabs
store float %fneg, ptr addrspace(1) %out
ret void
}
petar-avramovic (Author):
Added some fneg + fabs tests; what exactly are we trying to test here?
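
For reference, a variant along the lines sketched below (illustrative only; the function name and the fmul use are assumptions, not part of the patch) would let the fneg + fabs pair fold as a source modifier of an arithmetic use instead of being tested only as a standalone bit operation:

define amdgpu_ps void @s_fabs_fneg_f32_fmul_use(float inreg %in, float inreg %val, ptr addrspace(1) %out) {
  ; fneg(fabs(x)) feeding an fmul; on targets where it folds, this becomes a
  ; -|x| source modifier rather than a separate s_or_b32/v_or_b32.
  %fabs = call float @llvm.fabs.f32(float %in)
  %fneg = fneg float %fabs
  %fmul = fmul float %fneg, %val
  store float %fmul, ptr addrspace(1) %out
  ret void
}

declare float @llvm.fabs.f32(float)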
