[AMDGPU] gfx1010 allows VOP3 to have a literal
Differential Revision: https://reviews.llvm.org/D61413

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359756 91177308-0d34-0410-b5e6-96231b3b80d8
rampitec committed May 2, 2019
1 parent 2090ec9 commit ffc5401
Showing 18 changed files with 629 additions and 120 deletions.
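In short: gfx1010 adds a 32-bit literal slot to the VOP3 and VOP3P encodings, gated by the FeatureVOP3Literal subtarget feature, so a source operand that is not an inline constant can still be encoded as a single literal dword. A minimal sketch of that decision, written against the existing AMDGPUBaseInfo helper and not taken from this commit:

// Sketch only: can a 32-bit VOP3 source immediate be encoded at all?
// Inline constants fit in the regular src field; anything else needs the
// GFX10 literal dword (FeatureVOP3Literal / ST.hasVOP3Literal()).
#include "Utils/AMDGPUBaseInfo.h"

static bool canEncodeVOP3Imm(int32_t Imm, bool HasInv2PiInlineImm,
                             bool HasVOP3Literal) {
  if (llvm::AMDGPU::isInlinableLiteral32(Imm, HasInv2PiInlineImm))
    return true;           // e.g. integers -16..64 or 0.5/1.0/2.0/4.0 use the src field
  return HasVOP3Literal;   // otherwise a literal dword must follow the encoding
}

Only one such literal is allowed per instruction; the assembler-side check for that is the new validateVOP3Literal() below.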
13 changes: 2 additions & 11 deletions lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -65,10 +65,6 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
>;

def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
>;

//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -203,12 +199,6 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;

// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
// nodes in TargetSelectionDAG.td.
def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;

def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;

def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
@@ -249,7 +239,8 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;

// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
[SDNPOptInGlue]>;

// Single or double precision division fixup.
// Special case divide fixup and flags(src0 = Quotient, src1 =
74 changes: 64 additions & 10 deletions lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -236,7 +236,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
}

bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const {
return isRegClass(RCID) || isInlinableImm(type);
return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type);
}

bool isRegOrImmWithInt16InputMods() const {
@@ -461,7 +461,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
}

bool isVSrcB32() const {
return isVCSrcF32() || isLiteralImm(MVT::i32) || isExpr();
return isVCSrcF32() || isLiteralImm(MVT::i32);
}

bool isVSrcB64() const {
@@ -473,12 +473,11 @@ class AMDGPUOperand : public MCParsedAsmOperand {
}

bool isVSrcV2B16() const {
llvm_unreachable("cannot happen");
return isVSrcB16();
return isVSrcB16() || isLiteralImm(MVT::v2i16);
}

bool isVSrcF32() const {
return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
return isVCSrcF32() || isLiteralImm(MVT::f32);
}

bool isVSrcF64() const {
@@ -490,8 +489,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
}

bool isVSrcV2F16() const {
llvm_unreachable("cannot happen");
return isVSrcF16();
return isVSrcF16() || isLiteralImm(MVT::v2f16);
}

bool isKImmFP32() const {
@@ -1145,6 +1143,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
bool validateLdsDirect(const MCInst &Inst);
bool validateVOP3Literal(const MCInst &Inst) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1287,6 +1286,8 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
return &APFloat::IEEEhalf();
default:
llvm_unreachable("unsupported fp type");
@@ -1419,8 +1420,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
return false;
}

// We allow fp literals with f16x2 operands assuming that the specified
// literal goes into the lower half and the upper half is zero. We also
// require that the literal may be losslessly converted to f16.
MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
(type == MVT::v2i16)? MVT::i16 : type;

APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
return canLosslesslyConvertToFPType(FPLiteral, type);
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
}

bool AMDGPUOperand::isRegClass(unsigned RCID) const {
@@ -1535,7 +1542,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -1562,6 +1571,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -2419,7 +2430,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
case 2: {
const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) {
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
} else {
return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
@@ -2919,6 +2932,42 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
return NumLiterals <= 1;
}

// VOP3 literal is only allowed in GFX10+ and only one can be used
bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)))
return true;

const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

unsigned NumLiterals = 0;
uint32_t LiteralValue;

for (int OpIdx : OpIndices) {
if (OpIdx == -1) break;

const MCOperand &MO = Inst.getOperand(OpIdx);
if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx))
continue;

if (!isInlineConstant(Inst, OpIdx)) {
uint32_t Value = static_cast<uint32_t>(MO.getImm());
if (NumLiterals == 0 || LiteralValue != Value) {
LiteralValue = Value;
++NumLiterals;
}
}
}

return !NumLiterals ||
(NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]);
}

bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc) {
if (!validateLdsDirect(Inst)) {
@@ -2931,6 +2980,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"only one literal operand is allowed");
return false;
}
if (!validateVOP3Literal(Inst)) {
Error(IDLoc,
"invalid literal operand");
return false;
}
if (!validateConstantBusLimitations(Inst)) {
Error(IDLoc,
"invalid operand (violates constant bus restrictions)");
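The counting loop in validateVOP3Literal() above treats repeated uses of the same 32-bit value as one literal, since the hardware encodes a single literal dword that every matching source reads. A standalone sketch of just that rule, assuming the non-inline immediates have already been collected:

// Sketch of the one-literal rule used by validateVOP3Literal().
#include <cstdint>
#include <initializer_list>

static bool onlyOneVOP3Literal(std::initializer_list<uint32_t> NonInlineImms,
                               bool HasVOP3Literal) {
  unsigned NumLiterals = 0;
  uint32_t LiteralValue = 0;
  for (uint32_t Value : NonInlineImms) {
    if (NumLiterals == 0 || LiteralValue != Value) {
      LiteralValue = Value;   // a repeat of the same value is not counted again
      ++NumLiterals;
    }
  }
  return NumLiterals == 0 || (NumLiterals == 1 && HasVOP3Literal);
}

For example, onlyOneVOP3Literal({0x12345678, 0x12345678}, true) passes, while two different literal values fail even when the feature is present.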
8 changes: 8 additions & 0 deletions lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -618,6 +618,14 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
if (!isUInt<16>(Op.getImm()) &&
STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
printImmediate32(Op.getImm(), STI, O);
break;
}
LLVM_FALLTHROUGH;
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
printImmediateV216(Op.getImm(), STI, O);
5 changes: 5 additions & 0 deletions lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -249,6 +249,11 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);

case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
LLVM_FALLTHROUGH;
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
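The fall-through above keeps the packed handling for immediates that fit in 16 bits. Assuming the usual semantics of isInlinableLiteralV216() (both halves equal and themselves an inline 16-bit constant), a packed value with two different halves is not inlinable, and with a nonzero upper half it is now emitted through getLit32Encoding() as a plain 32-bit literal when FeatureVOP3Literal is set. For instance:

// Illustrative only; helper semantics as assumed in the lead-in above.
#include "Utils/AMDGPUBaseInfo.h"
#include <cassert>

static void v216LiteralExamples(bool HasInv2PiInlineImm) {
  // <1.0h, 1.0h> packed as 0x3C003C00: both halves equal and inline, no literal.
  assert(llvm::AMDGPU::isInlinableLiteralV216(0x3C003C00, HasInv2PiInlineImm));
  // <1.0h, 2.0h> packed as 0x40003C00: halves differ, so with FeatureVOP3Literal
  // this goes out through getLit32Encoding() as a plain 32-bit literal.
  assert(!llvm::AMDGPU::isInlinableLiteralV216(0x40003C00, HasInv2PiInlineImm));
}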
13 changes: 10 additions & 3 deletions lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -165,13 +165,16 @@ FunctionPass *llvm::createSIFoldOperandsPass() {

static bool updateOperand(FoldCandidate &Fold,
const SIInstrInfo &TII,
const TargetRegisterInfo &TRI) {
const TargetRegisterInfo &TRI,
const GCNSubtarget &ST) {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());

if (Fold.isImm()) {
if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
ST.hasInv2PiInlineImm())) {
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
// already set.
unsigned Opcode = MI->getOpcode();
@@ -192,6 +195,8 @@ static bool updateOperand(FoldCandidate &Fold,
// Only apply the following transformation if that operand requires
// a packed immediate.
switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
// If upper part is all zero we do not need op_sel_hi.
@@ -203,6 +208,8 @@ static bool updateOperand(FoldCandidate &Fold,
return true;
}
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
return true;
}
break;
default:
@@ -891,7 +898,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);

for (FoldCandidate &Fold : FoldList) {
if (updateOperand(Fold, *TII, *TRI)) {
if (updateOperand(Fold, *TII, *TRI, *ST)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
60 changes: 34 additions & 26 deletions lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2549,19 +2549,12 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,

return false;
}
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
if (isUInt<16>(Imm)) {
int16_t Trunc = static_cast<int16_t>(Imm);
return ST.has16BitInsts() &&
AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
}
if (!(Imm & 0xffff)) {
return ST.has16BitInsts() &&
AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
}
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
default:
llvm_unreachable("invalid bitwidth");
@@ -2603,7 +2596,8 @@ static bool compareMachineOp(const MachineOperand &Op0,

bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const {
const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];

assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

@@ -2616,7 +2610,15 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (MO.isImm() && isInlineConstant(MO, OpInfo))
return RI.opCanUseInlineConstant(OpInfo.OperandType);

return RI.opCanUseLiteralConstant(OpInfo.OperandType);
if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
return false;

if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
return true;

const MachineFunction *MF = MI.getParent()->getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
return ST.hasVOP3Literal();
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
@@ -3600,17 +3602,14 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineOperand &Src1 = MI.getOperand(Src1Idx);

// If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
// we need to only have one constant bus use.
//
// Note we do not need to worry about literal constants here. They are
// disabled for the operand type for instructions because they will always
// violate the one constant bus use rule.
// we need to only have one constant bus use before GFX10.
bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1) {
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
MachineOperand &Src0 = MI.getOperand(Src0Idx);

if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
if (Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
legalizeOpWithMove(MI, Src0Idx);
}

@@ -3702,10 +3701,8 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
@@ -5732,18 +5729,29 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
SIEncodingFamily Gen = subtargetEncodingFamily(ST);

if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
ST.getGeneration() >= AMDGPUSubtarget::GFX9)
ST.getGeneration() == AMDGPUSubtarget::GFX9)
Gen = SIEncodingFamily::GFX9;

if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
: SIEncodingFamily::SDWA;
// Adjust the encoding family to GFX80 for D16 buffer instructions when the
// subtarget has UnpackedD16VMem feature.
// TODO: remove this when we discard GFX80 encoding.
if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
Gen = SIEncodingFamily::GFX80;

if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
switch (ST.getGeneration()) {
default:
Gen = SIEncodingFamily::SDWA;
break;
case AMDGPUSubtarget::GFX9:
Gen = SIEncodingFamily::SDWA9;
break;
case AMDGPUSubtarget::GFX10:
Gen = SIEncodingFamily::SDWA10;
break;
}
}

int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

// -1 means that Opcode is already a native instruction.
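Tying the SIInstrInfo.cpp change back to the assembler check: isImmOperandLegal() now accepts a non-inline immediate on a VOP3/VOP3P source operand only when the subtarget reports hasVOP3Literal(); literals on the other VALU encodings were already allowed. A compact sketch of that decision with all inputs passed in explicitly (parameter names are illustrative, not LLVM API):

// Sketch of the legality rule added in isImmOperandLegal() above.
static bool immLegalOnSrcOperand(bool IsInlineConstant,
                                 bool OperandAcceptsLiterals,
                                 bool IsVOP3SrcOperand,
                                 bool HasVOP3Literal) {
  if (IsInlineConstant)
    return true;                  // inline constants are always encodable
  if (!OperandAcceptsLiterals)
    return false;                 // operand class only takes inline constants
  // Non-VOP3 encodings already carried a literal dword; VOP3 needs GFX10.
  return !IsVOP3SrcOperand || HasVOP3Literal;
}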