diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cf391856bf733..30e3179f8eb7d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -724,24 +724,39 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const { const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg); + unsigned Size = RI.getRegSizeInBits(*RC); + const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); + unsigned SrcSize = RI.getRegSizeInBits(*SrcRC); + + // The rest of copyPhysReg assumes Src and Dst are the same size. + // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can + // we remove Fix16BitCopies and this code block? + if (Fix16BitCopies) { + if (((Size == 16) != (SrcSize == 16))) { + if (ST.hasTrue16BitInsts()) { + // Non-VGPR Src and Dst will later be expanded back to 32 bits. + MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg; + MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16); + RegToFix = SubReg; + } else { + MCRegister &RegToFix = (Size == 16) ? DestReg : SrcReg; + MCRegister Super = RI.get32BitRegister(RegToFix); + assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix || + RI.getSubReg(Super, AMDGPU::hi16) == RegToFix); + RegToFix = Super; + } - // FIXME: This is hack to resolve copies between 16 bit and 32 bit - // registers until all patterns are fixed. - if (Fix16BitCopies && - ((RI.getRegSizeInBits(*RC) == 16) ^ - (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) { - MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; - MCRegister Super = RI.get32BitRegister(RegToFix); - assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); - RegToFix = Super; - - if (DestReg == SrcReg) { - // Insert empty bundle since ExpandPostRA expects an instruction here. 
- BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); - return; + if (DestReg == SrcReg) { + // Identity copy. Insert empty bundle since ExpandPostRA expects an + // instruction here. + BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); + return; + } + RC = RI.getPhysRegBaseClass(DestReg); + Size = RI.getRegSizeInBits(*RC); + SrcRC = RI.getPhysRegBaseClass(SrcReg); + SrcSize = RI.getRegSizeInBits(*SrcRC); } - - RC = RI.getPhysRegBaseClass(DestReg); } if (RC == &AMDGPU::VGPR_32RegClass) { @@ -865,10 +880,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - const unsigned Size = RI.getRegSizeInBits(*RC); if (Size == 16) { - assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || - AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || + assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); @@ -906,6 +919,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (ST.hasTrue16BitInsts()) { + if (IsSGPRSrc) { + assert(SrcLow); + SrcReg = NewSrcReg; + } + // Use the smaller instruction encoding if possible. 
+ if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) && + (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg) + .addReg(SrcReg); + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg) + .addImm(0) // src0_modifiers + .addReg(SrcReg) + .addImm(0); // op_sel + } + return; + } + if (IsSGPRSrc && !ST.hasSDWAScalar()) { if (!DstLow || !SrcLow) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, @@ -932,7 +964,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { if (ST.hasMovB64()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) @@ -1288,7 +1319,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) return AMDGPU::COPY; - if (RI.getRegSizeInBits(*DstRC) == 32) { + if (RI.getRegSizeInBits(*DstRC) == 16) { + // Assume hi bits are unneeded. Only _e64 true16 instructions are legal + // before RA. + return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64; + } else if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 00e4701e33bf5..734db326fb77d 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -656,6 +656,7 @@ let SubtargetPredicate = isGFX11Plus in { getVOP1Pat64.ret, /*VOP1Only=*/ 1>; + defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; @@ -804,6 +805,7 @@ defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a, defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b, "V_FFBH_I32", "v_cls_i32">; defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>; +defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x01c, "v_mov_b16">; defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">; defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">; defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;