[AMDGPU][True16] Support emitting copies between different register sizes.

Differential Revision: https://reviews.llvm.org/D156105
kosarev committed Sep 26, 2023
1 parent 431969e commit 758df22
Showing 2 changed files with 58 additions and 21 deletions.
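
In short, the change lets SIInstrInfo::copyPhysReg handle a physical-register copy whose two sides differ in width: one side a 16-bit half, the other a full 32-bit register. A rough sketch of the new normalization step, paraphrasing the SIInstrInfo.cpp diff below (the register names are made up for illustration; this is not the upstream code verbatim):

  // Assumed example: 16-bit VGPR destination half, 32-bit SGPR source.
  MCRegister DestReg = AMDGPU::VGPR0_LO16;
  MCRegister SrcReg = AMDGPU::SGPR4;
  if (ST.hasTrue16BitInsts()) {
    // Shrink the 32-bit side to its lo16 half; non-VGPR operands get
    // expanded back to 32 bits later in copyPhysReg.
    SrcReg = RI.getSubReg(SrcReg, AMDGPU::lo16);
  } else {
    // Without true16 instructions, widen the 16-bit side to its
    // containing 32-bit register instead.
    DestReg = RI.get32BitRegister(DestReg);
  }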
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp: 56 additions & 21 deletions (77 changes)
@@ -724,24 +724,39 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc) const {
   const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
+  unsigned Size = RI.getRegSizeInBits(*RC);
+  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
+  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
 
-  // FIXME: This is hack to resolve copies between 16 bit and 32 bit
-  // registers until all patterns are fixed.
-  if (Fix16BitCopies &&
-      ((RI.getRegSizeInBits(*RC) == 16) ^
-       (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) {
-    MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
-    MCRegister Super = RI.get32BitRegister(RegToFix);
-    assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
-    RegToFix = Super;
+  // The rest of copyPhysReg assumes Src and Dst size are the same size.
+  // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
+  // we remove Fix16BitCopies and this code block?
+  if (Fix16BitCopies) {
+    if (((Size == 16) != (SrcSize == 16))) {
+      if (ST.hasTrue16BitInsts()) {
+        // Non-VGPR Src and Dst will later be expanded back to 32 bits.
+        MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
+        MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
+        RegToFix = SubReg;
+      } else {
+        MCRegister &RegToFix = (Size == 16) ? DestReg : SrcReg;
+        MCRegister Super = RI.get32BitRegister(RegToFix);
+        assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix ||
+               RI.getSubReg(Super, AMDGPU::hi16) == RegToFix);
+        RegToFix = Super;
+      }
 
-    if (DestReg == SrcReg) {
-      // Insert empty bundle since ExpandPostRA expects an instruction here.
-      BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
-      return;
-    }
-
-    RC = RI.getPhysRegBaseClass(DestReg);
+      if (DestReg == SrcReg) {
+        // Identity copy. Insert empty bundle since ExpandPostRA expects an
+        // instruction here.
+        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+        return;
+      }
+      RC = RI.getPhysRegBaseClass(DestReg);
+      Size = RI.getRegSizeInBits(*RC);
+      SrcRC = RI.getPhysRegBaseClass(SrcReg);
+      SrcSize = RI.getRegSizeInBits(*SrcRC);
+    }
   }
 
   if (RC == &AMDGPU::VGPR_32RegClass) {
@@ -865,10 +880,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
-  const unsigned Size = RI.getRegSizeInBits(*RC);
   if (Size == 16) {
-    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
-           AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+    assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
            AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
            AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
 
@@ -906,6 +919,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
+    if (ST.hasTrue16BitInsts()) {
+      if (IsSGPRSrc) {
+        assert(SrcLow);
+        SrcReg = NewSrcReg;
+      }
+      // Use the smaller instruction encoding if possible.
+      if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
+          (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
+            .addReg(SrcReg);
+      } else {
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
+            .addImm(0) // src0_modifiers
+            .addReg(SrcReg)
+            .addImm(0); // op_sel
+      }
+      return;
+    }
+
     if (IsSGPRSrc && !ST.hasSDWAScalar()) {
       if (!DstLow || !SrcLow) {
         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
@@ -932,7 +964,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
-  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
   if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
     if (ST.hasMovB64()) {
       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
@@ -1288,7 +1319,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
 
   if (RI.isAGPRClass(DstRC))
     return AMDGPU::COPY;
-  if (RI.getRegSizeInBits(*DstRC) == 32) {
+  if (RI.getRegSizeInBits(*DstRC) == 16) {
+    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
+    // before RA.
+    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
+  } else if (RI.getRegSizeInBits(*DstRC) == 32) {
     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
     return AMDGPU::S_MOV_B64;
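
For reference, a hypothetical call site of the updated getMovOpcode for a 16-bit VGPR destination could look like the sketch below; the operand order of the t16 _e64 form (src0_modifiers, source, op_sel) mirrors the copyPhysReg hunk above, and the variable names are illustrative rather than taken from the commit:

  const TargetRegisterClass *DstRC = &AMDGPU::VGPR_16RegClass;
  unsigned Opc = TII->getMovOpcode(DstRC); // V_MOV_B16_t16_e64 for a VGPR class
  BuildMI(MBB, I, DL, TII->get(Opc), DstReg)
      .addImm(0)       // src0_modifiers
      .addReg(SrcReg)  // 16-bit source register
      .addImm(0);      // op_sel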
llvm/lib/Target/AMDGPU/VOP1Instructions.td: 2 additions & 0 deletions (2 changes)
@@ -656,6 +656,7 @@ let SubtargetPredicate = isGFX11Plus in {
                                    getVOP1Pat64<int_amdgcn_permlane64,
                                                 VOP_MOVRELS>.ret,
                                    /*VOP1Only=*/ 1>;
+  defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
   defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
   defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
   defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
@@ -804,6 +805,7 @@ defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a,
 defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b,
   "V_FFBH_I32", "v_cls_i32">;
 defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>;
+defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x01c, "v_mov_b16">;
 defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">;
 defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">;
 defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;
