AMDGPU: Simplify foldImmediate with register class based checks #154682
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: Generalize the code over the properties of the mov instruction, rather than maintaining parallel logic to figure out the type of mov to use. This is NFC-ish. It now does a better job with imm pseudos, which practically won't reach here. I added a couple of new tests with 16-bit extract of 64-bit sources.

Full diff: https://github.com/llvm/llvm-project/pull/154682.diff

2 Files Affected: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp, llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index df638bd65bdaa..75b303086163b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3573,54 +3573,93 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
Register DstReg = UseMI.getOperand(0).getReg();
- unsigned OpSize = getOpSize(UseMI, 0);
- bool Is16Bit = OpSize == 2;
- bool Is64Bit = OpSize == 8;
- bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
- unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
- : AMDGPU::V_MOV_B32_e32
- : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
- : AMDGPU::S_MOV_B32;
-
- std::optional<int64_t> SubRegImm =
- extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg());
-
- APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
- /*isSigned=*/true, /*implicitTrunc=*/true);
-
- if (RI.isAGPR(*MRI, DstReg)) {
- if (Is64Bit || !isInlineConstant(Imm))
- return false;
- NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
- }
+ Register UseSubReg = UseMI.getOperand(1).getSubReg();
+
+ const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
+
+ bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
+ RI.getSubRegIdxSize(UseSubReg) == 16;
if (Is16Bit) {
- if (isVGPRCopy)
+ if (RI.hasVGPRs(DstRC))
return false; // Do not clobber vgpr_hi16
- if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+ if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
return false;
-
- UseMI.getOperand(0).setSubReg(0);
- if (DstReg.isPhysical()) {
- DstReg = RI.get32BitRegister(DstReg);
- UseMI.getOperand(0).setReg(DstReg);
- }
- assert(UseMI.getOperand(1).getReg().isVirtual());
}
MachineFunction *MF = UseMI.getMF();
- const MCInstrDesc &NewMCID = get(NewOpc);
- const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF);
- if (DstReg.isPhysical()) {
- if (!NewDefRC->contains(DstReg))
- return false;
- } else if (!MRI->constrainRegClass(DstReg, NewDefRC))
+ unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
+ MCRegister MovDstPhysReg =
+ DstReg.isPhysical() ? MCRegister(DstReg) : MCRegister();
+
+ std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
+
+ // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
+ for (unsigned MovOp :
+ {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+ AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
+ const MCInstrDesc &MovDesc = get(MovOp);
+
+ const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
+ if (Is16Bit) {
+ // We just need to find a correctly sized register class, so the
+ // subregister index compatibility doesn't matter since we're statically
+ // extracting the immediate value.
+ MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
+ if (!MovDstRC)
+ continue;
+
+ if (MovDstPhysReg) {
+ // FIXME: We probably should not do this. If there is a live value in
+ // the high half of the register, it will be corrupted.
+ MovDstPhysReg = RI.getMatchingSuperReg(MCRegister(DstReg),
+ AMDGPU::lo16, MovDstRC);
+ if (!MovDstPhysReg)
+ continue;
+ }
+ }
+
+ // Result class isn't the right size, try the next instruction.
+ if (MovDstPhysReg) {
+ if (!MovDstRC->contains(MovDstPhysReg))
+ return false;
+ } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
+ // TODO: This will be overly conservative in the case of 16-bit virtual
+ // SGPRs. We could hack up the virtual register uses to use a compatible
+ // 32-bit class.
+ continue;
+ }
+
+ const MCOperandInfo &OpInfo = MovDesc.operands()[1];
+
+ // Ensure the interpreted immediate value is a valid operand in the new
+ // mov.
+ //
+ // FIXME: isImmOperandLegal should have form that doesn't require existing
+ // MachineInstr or MachineOperand
+ if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
+ !isInlineConstant(*SubRegImm, OpInfo.OperandType))
+ break;
+
+ NewOpc = MovOp;
+ break;
+ }
+
+ if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
return false;
+ if (Is16Bit) {
+ UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+ if (MovDstPhysReg)
+ UseMI.getOperand(0).setReg(MovDstPhysReg);
+ assert(UseMI.getOperand(1).getReg().isVirtual());
+ }
+
+ const MCInstrDesc &NewMCID = get(NewOpc);
UseMI.setDesc(NewMCID);
- UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+ UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
UseMI.addImplicitDefUseOperands(*MF);
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 770e7c048620d..7b3f924d32e89 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -128,7 +128,7 @@ body: |
; GCN-LABEL: name: fold_sreg_64_sub0_to_vgpr_32
; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
- ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2882399984, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
%0:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
%1:vgpr_32 = COPY killed %0.sub0
@@ -188,8 +188,7 @@ body: |
; GCN-LABEL: name: fold_sreg_64_to_sreg_64
; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
- ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
- ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B64_]]
%0:sreg_64 = S_MOV_B64 1311768467750121200
%1:sreg_64 = COPY killed %0
SI_RETURN_TO_EPILOG %1
@@ -382,7 +381,7 @@ body: |
; GCN-LABEL: name: fold_v_mov_b64_64_sub0_to_vgpr_32
; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
- ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2882399984, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]]
%0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
%1:vgpr_32 = COPY killed %0.sub0
@@ -761,8 +760,8 @@ body: |
bb.0:
; GCN-LABEL: name: fold_av_mov_b32_imm_pseudo_inlineimm_to_av
; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY killed [[AV_MOV_]]
- ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]]
%0:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
%1:av_32 = COPY killed %0
SI_RETURN_TO_EPILOG implicit %1
@@ -800,9 +799,67 @@ body: |
bb.0:
; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY killed [[AV_MOV_]]
- ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
%1:av_64_align2 = COPY killed %0
SI_RETURN_TO_EPILOG implicit %1
...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].lo16
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+ %0:sreg_64 = S_MOV_B64 64
+ %1:sgpr_lo16 = COPY killed %0.lo16
+ SI_RETURN_TO_EPILOG %1
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].hi16
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+ %0:sreg_64 = S_MOV_B64 64
+ %1:sgpr_lo16 = COPY killed %0.hi16
+ SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 64
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+ %0:sreg_64 = S_MOV_B64 64
+ $sgpr0_lo16 = COPY killed %0.lo16
+ SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+ %0:sreg_64 = S_MOV_B64 64
+ $sgpr0_lo16 = COPY killed %0.hi16
+ SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
Force-pushed from 10ca969 to a5d4616.
  ; GCN-LABEL: name: fold_sreg_64_sub0_to_vgpr_32
  ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
- ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2882399984, implicit $exec
I think this is the immediate rendering change, from sign extended to zero extended. Why is it OK for the transformation to choose which one to use?
Yes. The read value is only the low 32 bits, since it's a 32-bit operand.
I see, thanks. There are some implicit transformations later such that these are the same value.
Although I'd consider this a bug, we try to consistently keep immediates encoded as sign extended. The non-APInt forms of isInlineImmediate will miss cases that aren't properly sign extended.
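
As a quick check of the two renderings discussed above (a minimal standalone C++ snippet; the two constants come from the test diff, everything else is illustrative):

#include <cassert>
#include <cstdint>

int main() {
  // Both renderings name the same 32-bit pattern; only the printed
  // interpretation differs (sign extended vs. zero extended).
  int64_t SignExtended = -1412567312; // old rendering
  int64_t ZeroExtended = 2882399984;  // new rendering
  // Only the low 32 bits are read by the 32-bit mov operand,
  // and both values truncate to 0xABCDEEF0.
  assert(static_cast<uint32_t>(SignExtended) ==
         static_cast<uint32_t>(ZeroExtended));
  return 0;
}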
LGTM
Force-pushed from a5d4616 to a32d42e.
Force-pushed from dfd9b00 to c9ad541.
Force-pushed from c9ad541 to 0a06817.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/143/builds/10221
Generalize the code over the properties of the mov instruction,
rather than maintaining parallel logic to figure out the type
of mov to use. I've maintained the behavior with 16-bit physical
SGPRs, though I think the behavior here is broken and corrupting
any value that happens to be live in the high bits. It just happens
there's no way to separately write to those with a real instruction,
but I don't think we should be trying to make assumptions around
that property.
This is NFC-ish. It now does a better job with imm pseudos which
practically won't reach here. This also will make it easier
to support more folds in a future patch.
I added a couple of new tests with 16-bit extract of 64-bit sources.
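
For illustration, here is a minimal standalone sketch of the subregister-immediate extraction those tests exercise. The real code calls SIInstrInfo::extractSubregFromImm; the enum and function below are hypothetical stand-ins covering only the lo16/hi16 cases from the new tests:

#include <cstdint>
#include <optional>

// Hypothetical stand-in for AMDGPU's subregister indexes.
enum class SubRegIdx { None, Lo16, Hi16 };

// Hypothetical stand-in for SIInstrInfo::extractSubregFromImm: statically
// compute the value a subregister read would observe from an immediate.
std::optional<int64_t> extractSubregImm(int64_t Imm, SubRegIdx Idx) {
  switch (Idx) {
  case SubRegIdx::None:
    return Imm;                              // no subregister: whole value
  case SubRegIdx::Lo16:
    return static_cast<uint16_t>(Imm);       // bits [15:0]
  case SubRegIdx::Hi16:
    return static_cast<uint16_t>(Imm >> 16); // bits [31:16]
  }
  return std::nullopt;
}

// For S_MOV_B64 64: Lo16 yields 64 and Hi16 yields 0, matching the
// $sgpr0 = S_MOV_B32 64 and $sgpr0 = S_MOV_B32 0 results in the new tests.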