diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f377b8aaf1333..ca98b80787fb4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1420,6 +1420,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the target has instructions with xf32 format support.
   bool hasXF32Insts() const { return HasXF32Insts; }
 
+  /// \returns true if the target has packed f32 instructions that only read 32
+  /// bits from a scalar operand (SGPR or literal) and replicate the bits to
+  /// both channels.
+  bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const {
+    return getGeneration() == GFX12 && GFX1250Insts;
+  }
+
   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
 
   bool hasPermlane16Swap() const { return HasPermlane16Swap; }
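
Reviewer note (illustration only, not part of the patch): the predicate above describes hardware behavior where a packed f32 instruction reads just the low 32 bits of a 64-bit scalar operand and feeds those bits to both lanes. A minimal C++ sketch of that read semantics, with a hypothetical helper name:

#include <cstdint>
#include <utility>

// Hypothetical model of the scalar-operand read on targets where
// hasPKF32InstsReplicatingLow32BitsOfScalarInput() returns true: only the
// low 32 bits of the 64-bit scalar are read, and they appear in both lanes.
std::pair<uint32_t, uint32_t> readPKF32Scalar(uint64_t Scalar) {
  uint32_t Lo = static_cast<uint32_t>(Scalar); // high half is ignored
  return {Lo, Lo};                             // replicated to both channels
}

int main() {
  // <1.0f, 0.0f> as a single 64-bit pattern: only 0x3F800000 is seen,
  // so BOTH lanes would read 1.0f, not the intended <1.0f, 0.0f>.
  auto [LoLane, HiLane] = readPKF32Scalar(0x000000003F800000ULL);
  return LoLane == HiLane ? 0 : 1;
}

A 64-bit immediate therefore survives folding only when both halves already agree, which is what the SIFoldOperands check below enforces.
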
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 18d90346d1d88..289bf1a563ffc 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -766,6 +766,37 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
       FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
 }
 
+// Returns true if the instruction is a packed f32 instruction that only reads
+// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
+// both channels.
+static bool
+isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST,
+                                              MachineInstr *MI) {
+  if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput())
+    return false;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("unknown instruction");
+}
+
+// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
+// literal) and replicate the bits to both channels. Therefore, if the hi and
+// lo halves of the immediate are not the same, it cannot be folded.
+static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(
+    const FoldableDef &OpToFold) {
+  assert(OpToFold.isImm() && "Expected immediate operand");
+  uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
+  uint32_t Lo = Lo_32(ImmVal);
+  uint32_t Hi = Hi_32(ImmVal);
+  return Lo == Hi;
+}
+
 bool SIFoldOperandsImpl::tryAddToFoldList(
     SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
     const FoldableDef &OpToFold) const {
@@ -919,6 +950,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     return true;
   }
 
+  // Special case for PK_F32 instructions if we are trying to fold an imm to
+  // src0 or src1.
+  if (OpToFold.isImm() &&
+      isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) &&
+      !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
+    return false;
+
   appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
@@ -1134,6 +1172,9 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+    if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) &&
+        !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
+      return false;
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir
new file mode 100644
index 0000000000000..a725fab5e40c1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-fold-operands -o - %s | FileCheck %s
+
+---
+name: pk_add_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_add_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_ADD_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_mul_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_mul_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_MUL_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_MUL_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_fma_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_fma_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 0, [[DEF]], 8, [[DEF1]], 11, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64 1065353216
+    %3:vreg_64_align2 = COPY killed %2
+    %4:vreg_64_align2 = V_PK_FMA_F32 0, %0, 8, %1, 11, %3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
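
Reviewer note (illustration only, not part of the patch): a standalone sketch of the Lo == Hi legality check, applied to the constant used in the tests above. 1065353216 is 0x3F800000, i.e. <1.0f, 0.0f> as one 64-bit pattern, so the halves differ and the fold is rejected; that is why the CHECK lines keep the V_MOV_B64_PSEUDO instead of an inline immediate.

#include <cassert>
#include <cstdint>

// Mirrors checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput: a 64-bit
// immediate may be folded only if its halves are identical, because the
// hardware will replicate the low half into both channels anyway.
bool isFoldablePKF32Imm(uint64_t ImmVal) {
  uint32_t Lo = static_cast<uint32_t>(ImmVal);       // Lo_32(ImmVal)
  uint32_t Hi = static_cast<uint32_t>(ImmVal >> 32); // Hi_32(ImmVal)
  return Lo == Hi;
}

int main() {
  // 1065353216 == 0x3F800000: Hi (0) != Lo (0x3F800000), so no fold.
  assert(!isFoldablePKF32Imm(1065353216ULL));
  // A splat such as <1.0f, 1.0f> would still be foldable.
  assert(isFoldablePKF32Imm(0x3F8000003F800000ULL));
}
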
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 1177474f5b4f5..6b45d31da0e95 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -732,12 +732,13 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x3f800000
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
-; GFX1250-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
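
Reviewer note (illustration only, not part of the patch): the asm change above is the visible effect of the fix. fadd_v2_v_lit_hi0 adds <1.0, 0.0>, and before this patch the folded scalar literal 1.0 would have been replicated into the high lane by the hardware; now the pair is materialized in v[2:3] (v2 = 0x3f800000, v3 = 0). A small numeric sketch of the difference, using made-up lane values:

#include <cstdio>

// Demonstrates the miscompile the patch prevents: for the <1.0f, 0.0f>
// addend, a replicated scalar read would add 1.0f in the high lane too.
int main() {
  float V1 = 3.0f;                // hypothetical loaded high-lane value
  float CorrectHi = V1 + 0.0f;    // intended: high lane adds 0.0
  float ReplicatedHi = V1 + 1.0f; // folded literal: high lane adds 1.0
  std::printf("correct hi = %f, replicated hi = %f\n", CorrectHi,
              ReplicatedHi);
}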