-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Don't fold an i64 immediate value if it can't be replicated from its lower 32-bit #168458
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
67f5f1f
d679d57
58aa511
a3e8aa6
bb03359
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -766,6 +766,37 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, | |
| FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); | ||
| } | ||
|
|
||
| // Returns true if MI is a packed f32 instruction that only reads 32 bits from a | ||
| // scalar operand (SGPR or literal) and replicates those bits to both channels. | ||
| // | ||
| // The answer is always false when the subtarget does not report | ||
| // hasPKF32InstsReplicatingLow32BitsOfScalarInput(). Otherwise it is decided by | ||
| // the opcode alone: V_PK_ADD_F32, V_PK_MUL_F32 and V_PK_FMA_F32 qualify. | ||
| static bool | ||
| isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST, | ||
| MachineInstr *MI) { | ||
| if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput()) | ||
| return false; | ||
| switch (MI->getOpcode()) { | ||
| case AMDGPU::V_PK_ADD_F32: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should be able to get this from the operand type, not the opcode
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But not all
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are only these 3 of them anyway?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you pass an operand here, I believe only these 3 instructions will have OPERAND_REG_IMM_V2FP32 type.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but then we need to get
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And the logic still doesn't add up here. The operand of the instruction is not
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is in the MCInstrDesc regardless of an actual operand. The point is not to forget real instructions, just in case. I do not insist though.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's not remotely difficult?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Say Will
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it does not depend on the MachineInstr. It takes the info from MCInstrDesc. |
||
| case AMDGPU::V_PK_MUL_F32: | ||
| case AMDGPU::V_PK_FMA_F32: | ||
| return true; | ||
| default: | ||
| // Every path through the switch returns, so nothing may follow it. | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| // Packed FP32 instructions read only 32 bits from a scalar operand (SGPR or | ||
| // literal) and replicate those bits to both channels. A 64-bit immediate is | ||
| // therefore foldable only when its high and low 32-bit halves are identical; | ||
| // this returns true in exactly that case. OpToFold must be an immediate. | ||
| static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput( | ||
| const FoldableDef &OpToFold) { | ||
| assert(OpToFold.isImm() && "Expected immediate operand"); | ||
| const uint64_t Imm64 = OpToFold.getEffectiveImmVal().value(); | ||
| return Lo_32(Imm64) == Hi_32(Imm64); | ||
| } | ||
|
|
||
| bool SIFoldOperandsImpl::tryAddToFoldList( | ||
| SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, | ||
| const FoldableDef &OpToFold) const { | ||
|
|
@@ -919,6 +950,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList( | |
| return true; | ||
| } | ||
|
|
||
| // Special case for PK_F32 instructions if we are trying to fold an imm to | ||
| // src0 or src1. | ||
| if (OpToFold.isImm() && | ||
| isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) && | ||
| !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) | ||
| return false; | ||
|
|
||
| appendFoldCandidate(FoldList, MI, OpNo, OpToFold); | ||
| return true; | ||
| } | ||
|
|
@@ -1134,6 +1172,9 @@ bool SIFoldOperandsImpl::tryToFoldACImm( | |
| return false; | ||
|
|
||
| if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { | ||
| if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) && | ||
| !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) | ||
| return false; | ||
| appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); | ||
| return true; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | ||
| # RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1250 -run-pass=si-fold-operands -o - %s | FileCheck %s | ||
|
|
||
| # Negative test: the 64-bit immediate 1065353216 (0x000000003F800000) has | ||
| # different low and high 32-bit halves, so the CHECK lines expect it to stay in | ||
| # V_MOV_B64_PSEUDO instead of being folded into the V_PK_ADD_F32 source operand. | ||
| --- | ||
| name: pk_add_f32_imm_fold | ||
| body: | | ||
| bb.0.entry: | ||
| liveins: $sgpr0_sgpr1 | ||
|
|
||
| ; CHECK-LABEL: name: pk_add_f32_imm_fold | ||
| ; CHECK: liveins: $sgpr0_sgpr1 | ||
| ; CHECK-NEXT: {{ $}} | ||
| ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec | ||
| ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec | ||
| ; CHECK-NEXT: S_ENDPGM 0 | ||
| %0:vreg_64_align2 = IMPLICIT_DEF | ||
| %1:sreg_64 = S_MOV_B64 1065353216 | ||
| %2:vreg_64_align2 = COPY killed %1 | ||
| %3:vreg_64_align2 = V_PK_ADD_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec | ||
| S_ENDPGM 0 | ||
|
|
||
| # Negative test: the 64-bit immediate 1065353216 (0x000000003F800000) has | ||
| # different low and high 32-bit halves, so the CHECK lines expect it to stay in | ||
| # V_MOV_B64_PSEUDO instead of being folded into the V_PK_MUL_F32 source operand. | ||
| --- | ||
| name: pk_mul_f32_imm_fold | ||
| body: | | ||
| bb.0.entry: | ||
| liveins: $sgpr0_sgpr1 | ||
|
|
||
| ; CHECK-LABEL: name: pk_mul_f32_imm_fold | ||
| ; CHECK: liveins: $sgpr0_sgpr1 | ||
| ; CHECK-NEXT: {{ $}} | ||
| ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec | ||
| ; CHECK-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_MUL_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec | ||
| ; CHECK-NEXT: S_ENDPGM 0 | ||
| %0:vreg_64_align2 = IMPLICIT_DEF | ||
| %1:sreg_64 = S_MOV_B64 1065353216 | ||
| %2:vreg_64_align2 = COPY killed %1 | ||
| %3:vreg_64_align2 = V_PK_MUL_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec | ||
| S_ENDPGM 0 | ||
|
|
||
| # Negative test: the 64-bit immediate 1065353216 (0x000000003F800000) has | ||
| # different low and high 32-bit halves, so the CHECK lines expect it to stay in | ||
| # V_MOV_B64_PSEUDO instead of being folded into the last V_PK_FMA_F32 source | ||
| # operand. | ||
| --- | ||
| name: pk_fma_f32_imm_fold | ||
| body: | | ||
| bb.0.entry: | ||
| liveins: $sgpr0_sgpr1 | ||
|
|
||
| ; CHECK-LABEL: name: pk_fma_f32_imm_fold | ||
| ; CHECK: liveins: $sgpr0_sgpr1 | ||
| ; CHECK-NEXT: {{ $}} | ||
| ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec | ||
| ; CHECK-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 0, [[DEF]], 8, [[DEF1]], 11, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec | ||
| ; CHECK-NEXT: S_ENDPGM 0 | ||
| %0:vreg_64_align2 = IMPLICIT_DEF | ||
| %1:vreg_64_align2 = IMPLICIT_DEF | ||
| %2:sreg_64 = S_MOV_B64 1065353216 | ||
| %3:vreg_64_align2 = COPY killed %2 | ||
| %4:vreg_64_align2 = V_PK_FMA_F32 0, %0, 8, %1, 11, %3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec | ||
| S_ENDPGM 0 | ||
| ... |
Uh oh!
There was an error while loading. Please reload this page.