diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 15bf2d27da142..69708c47f6c9c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3573,54 +3573,93 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
 
     Register DstReg = UseMI.getOperand(0).getReg();
-    unsigned OpSize = getOpSize(UseMI, 0);
-    bool Is16Bit = OpSize == 2;
-    bool Is64Bit = OpSize == 8;
-    bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-    unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
-                                           : AMDGPU::V_MOV_B32_e32
-                                 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
-                                           : AMDGPU::S_MOV_B32;
-
-    std::optional<int64_t> SubRegImm =
-        extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg());
-
-    APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
-              /*isSigned=*/true, /*implicitTrunc=*/true);
-
-    if (RI.isAGPR(*MRI, DstReg)) {
-      if (Is64Bit || !isInlineConstant(Imm))
-        return false;
-      NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
-    }
+    Register UseSubReg = UseMI.getOperand(1).getSubReg();
+
+    const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
+
+    bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
+                   RI.getSubRegIdxSize(UseSubReg) == 16;
 
     if (Is16Bit) {
-      if (isVGPRCopy)
+      if (RI.hasVGPRs(DstRC))
         return false; // Do not clobber vgpr_hi16
 
-      if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+      if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
         return false;
-
-      UseMI.getOperand(0).setSubReg(0);
-      if (DstReg.isPhysical()) {
-        DstReg = RI.get32BitRegister(DstReg);
-        UseMI.getOperand(0).setReg(DstReg);
-      }
-      assert(UseMI.getOperand(1).getReg().isVirtual());
     }
 
     MachineFunction *MF = UseMI.getMF();
 
-    const MCInstrDesc &NewMCID = get(NewOpc);
-    const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF);
-    if (DstReg.isPhysical()) {
-      if (!NewDefRC->contains(DstReg))
-        return false;
-    } else if (!MRI->constrainRegClass(DstReg, NewDefRC))
+    unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
+    MCRegister MovDstPhysReg =
+        DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
+
+    std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
+
+    // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
+    for (unsigned MovOp :
+         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
+      const MCInstrDesc &MovDesc = get(MovOp);
+
+      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
+      if (Is16Bit) {
+        // We just need to find a correctly sized register class, so the
+        // subregister index compatibility doesn't matter since we're statically
+        // extracting the immediate value.
+        MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
+        if (!MovDstRC)
+          continue;
+
+        if (MovDstPhysReg) {
+          // FIXME: We probably should not do this. If there is a live value in
+          // the high half of the register, it will be corrupted.
+          MovDstPhysReg =
+              RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
+          if (!MovDstPhysReg)
+            continue;
+        }
+      }
+
+      // Result class isn't the right size, try the next instruction.
+      if (MovDstPhysReg) {
+        if (!MovDstRC->contains(MovDstPhysReg))
+          return false;
+      } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
+        // TODO: This will be overly conservative in the case of 16-bit virtual
+        // SGPRs. We could hack up the virtual register uses to use a compatible
+        // 32-bit class.
+        continue;
+      }
+
+      const MCOperandInfo &OpInfo = MovDesc.operands()[1];
+
+      // Ensure the interpreted immediate value is a valid operand in the new
+      // mov.
+      //
+      // FIXME: isImmOperandLegal should have form that doesn't require existing
+      // MachineInstr or MachineOperand
+      if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
+          !isInlineConstant(*SubRegImm, OpInfo.OperandType))
+        break;
+
+      NewOpc = MovOp;
+      break;
+    }
+
+    if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
      return false;
 
+    if (Is16Bit) {
+      UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+      if (MovDstPhysReg)
+        UseMI.getOperand(0).setReg(MovDstPhysReg);
+      assert(UseMI.getOperand(1).getReg().isVirtual());
+    }
+
+    const MCInstrDesc &NewMCID = get(NewOpc);
     UseMI.setDesc(NewMCID);
-    UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+    UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
     UseMI.addImplicitDefUseOperands(*MF);
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index b611bf2c49a2f..21455a9f5074f 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -188,8 +188,7 @@ body: |
 
     ; GCN-LABEL: name: fold_sreg_64_to_sreg_64
    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-    ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B64_]]
     %0:sreg_64 = S_MOV_B64 1311768467750121200
     %1:sreg_64 = COPY killed %0
     SI_RETURN_TO_EPILOG %1
@@ -761,8 +760,8 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b32_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]]
     %0:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
     %1:av_32 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
@@ -800,9 +799,67 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
     %1:av_64_align2 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
 ...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].lo16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG %1
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].hi16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 64
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
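
A minimal standalone sketch (not part of the patch) of the immediate extraction the new 16-bit path relies on: the copy's sub-register index selects which bits of the defining mov's immediate survive, so the COPY can be rewritten into a plain mov of the extracted value. The enum and helper names below are illustrative stand-ins, not the real SIInstrInfo/SIRegisterInfo API.

```cpp
#include <cstdint>
#include <optional>

// Hypothetical stand-in for the lo16/hi16 sub-register indices.
enum class SubRegIdx { None, Lo16, Hi16 };

// Extract the piece of a 64-bit immediate selected by a sub-register index,
// sign-extending 16-bit pieces the way a 32-bit mov of that value would.
std::optional<int64_t> extractSubregImmSketch(int64_t Imm, SubRegIdx Idx) {
  switch (Idx) {
  case SubRegIdx::None:
    return Imm; // whole-register copy keeps the full immediate
  case SubRegIdx::Lo16:
    return static_cast<int16_t>(Imm); // bits [0,16)
  case SubRegIdx::Hi16:
    return static_cast<int16_t>(Imm >> 16); // bits [16,32)
  }
  return std::nullopt;
}

// For S_MOV_B64 64: Lo16 -> 64 and Hi16 -> 0, matching the S_MOV_B32 64 and
// S_MOV_B32 0 expected by the new phys_sgpr16 tests above.
```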