From 27698002ede0e9fe1b0dec71eb32b7a824ff4506 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Wed, 12 Nov 2025 07:52:19 -0500 Subject: [PATCH 1/3] [AMDGPU] Lower S_ABSDIFF_I32 to VALU instructions Added support for lowering the scalar S_ABSDIFF_I32 instruction to equivalent VALU operations. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 ++++++++++++++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/test/CodeGen/AMDGPU/absdiff.ll | 38 +++++++++++++++++++ .../CodeGen/AMDGPU/move-to-valu-absdiff.mir | 35 +++++++++++++++++ 4 files changed, 111 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6e39a2de9b805..f7c4fb2351b69 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7775,6 +7775,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Inst.eraseFromParent(); return; + case AMDGPU::S_ABSDIFF_I32: + lowerScalarAbsDiff(Worklist, Inst); + Inst.eraseFromParent(); + return; + case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: { // Clear unused bits of vcc @@ -8424,6 +8429,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src1 = Inst.getOperand(1); + MachineOperand &Src2 = Inst.getOperand(2); + Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned SubOp = + ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; + + BuildMI(MBB, MII, DL, get(SubOp), SubResultReg) + .addReg(Src1.getReg()) + .addReg(Src2.getReg()); + + BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(SubResultReg) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 2ecd94186e1e0..d24dfd657ddcc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -136,6 +136,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const; void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, diff --git a/llvm/test/CodeGen/AMDGPU/absdiff.ll b/llvm/test/CodeGen/AMDGPU/absdiff.ll index 9cb397fb9d1c6..5e4947ea5e0b5 100644 --- a/llvm/test/CodeGen/AMDGPU/absdiff.ll +++ b/llvm/test/CodeGen/AMDGPU/absdiff.ll @@ -1,6 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s + +define amdgpu_gs float @absdiff_valu_input_regression() { +; CHECK-LABEL: absdiff_valu_input_regression: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: .LBB0_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: s_or_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_gt_i32 s1, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %bb11 +; CHECK-NEXT: v_med3_i32 v0, s1, 0, 1 +; CHECK-NEXT: v_sub_u32_e32 v0, 0, v0 +; CHECK-NEXT: v_sub_u32_e32 v1, 0, v0 +; CHECK-NEXT: v_max_i32_e32 v0, v0, v1 +; CHECK-NEXT: ; return to shader part epilog +bb: + br label %bb1 + +bb1: ; preds = %bb1, %bb + %i = phi i32 [ 0, %bb ], [ %i9, %bb1 ] + %i2 = phi i32 [ 0, %bb ], [ %i5, %bb1 ] + %i3 = or i32 %i2, 1 + %i4 = or i32 %i3, 0 + %i5 = call i32 @llvm.smax.i32(i32 %i, i32 0) + %i6 = call i32 @llvm.umin.i32(i32 %i5, i32 1) + %i7 = sub i32 0, %i6 + %i8 = call i32 @llvm.abs.i32(i32 %i7, i1 false) + %i9 = or i32 %i, 1 + %i10 = icmp sgt i32 %i, 0 + br i1 %i10, label %bb1, label %bb11 + +bb11: ; preds = %bb1 + %i12 = bitcast i32 %i8 to float + ret float %i12 +} + define amdgpu_ps i16 @absdiff_i16_false(i16 inreg %arg0, i16 inreg %arg1) { ; CHECK-LABEL: absdiff_i16_false: ; CHECK: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir b/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir new file mode 100644 index 0000000000000..f4c99881170d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX12 %s + +--- +name: absdiff_i32 +body: | + bb.0: + ; GFX8-LABEL: name: absdiff_i32 + ; GFX8: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX8-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX8-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX8-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[DEF]], [[DEF1]], [[DEF2]], implicit $exec + ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10 + ; GFX8-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit-def $vcc, implicit $exec + ; GFX8-NEXT: [[V_SUB_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, [[V_SUB_CO_U32_e32_]], implicit-def $vcc, implicit $exec + ; GFX8-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_CO_U32_e32_]], [[V_SUB_CO_U32_e32_1]], implicit $exec + ; + ; GFX12-LABEL: name: absdiff_i32 + ; GFX12: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[DEF]], [[DEF1]], [[DEF2]], implicit $exec + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10 + ; GFX12-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit $exec + ; GFX12-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 0, [[V_SUB_U32_e32_]], implicit $exec + ; GFX12-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_U32_e32_]], [[V_SUB_U32_e32_1]], implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = V_LSHL_ADD_U32_e64 %0:vgpr_32, %1:vgpr_32, %2:vgpr_32, implicit $exec + %4:sreg_32 = COPY %3:vgpr_32 + %5:sreg_32 = S_MOV_B32 10 + %6:sreg_32 = S_ABSDIFF_I32 killed %5:sreg_32, %4:sreg_32, implicit-def dead $scc +... From 0cbff2f2982825d4ddea9f2e347124d591510332 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Wed, 12 Nov 2025 10:38:58 -0500 Subject: [PATCH 2/3] Use liveins in test --- .../CodeGen/AMDGPU/move-to-valu-absdiff.mir | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir b/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir index f4c99881170d1..7d356d6c6f9f6 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir @@ -6,30 +6,26 @@ name: absdiff_i32 body: | bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX8-LABEL: name: absdiff_i32 - ; GFX8: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX8-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX8-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX8-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[DEF]], [[DEF1]], [[DEF2]], implicit $exec + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10 ; GFX8-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit-def $vcc, implicit $exec ; GFX8-NEXT: [[V_SUB_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, [[V_SUB_CO_U32_e32_]], implicit-def $vcc, implicit $exec ; GFX8-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_CO_U32_e32_]], [[V_SUB_CO_U32_e32_1]], implicit $exec ; ; GFX12-LABEL: name: absdiff_i32 - ; GFX12: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[DEF]], [[DEF1]], [[DEF2]], implicit $exec + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10 ; GFX12-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit $exec ; GFX12-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 0, [[V_SUB_U32_e32_]], implicit $exec ; GFX12-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_U32_e32_]], [[V_SUB_U32_e32_1]], implicit $exec - %0:vgpr_32 = IMPLICIT_DEF - %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = IMPLICIT_DEF - %3:vgpr_32 = V_LSHL_ADD_U32_e64 %0:vgpr_32, %1:vgpr_32, %2:vgpr_32, implicit $exec - %4:sreg_32 = COPY %3:vgpr_32 - %5:sreg_32 = S_MOV_B32 10 - %6:sreg_32 = S_ABSDIFF_I32 killed %5:sreg_32, %4:sreg_32, implicit-def dead $scc + %0:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = S_MOV_B32 10 + %3:sreg_32 = S_ABSDIFF_I32 killed %2:sreg_32, %1:sreg_32, implicit-def dead $scc ... From 8c5c6dc9b8e11ffeae5133c174b865cd4f25e158 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 13 Nov 2025 07:31:19 -0500 Subject: [PATCH 3/3] Changes after review (add const and remove verify-machineinstrs) --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f7c4fb2351b69..5ea837c5316dd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8434,7 +8434,7 @@ void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src1 = Inst.getOperand(1); diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir b/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir index 7d356d6c6f9f6..267543800bf68 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GFX12 %s --- name: absdiff_i32