213 changes: 206 additions & 7 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
ImmTySdwaSrc1Sel,
ImmTySdwaDstUnused,
ImmTyDMask,
ImmTyDim,
ImmTyUNorm,
ImmTyDA,
ImmTyR128A16,
Expand Down Expand Up @@ -296,6 +297,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
bool isClampSI() const { return isImmTy(ImmTyClampSI); }
bool isOModSI() const { return isImmTy(ImmTyOModSI); }
bool isDMask() const { return isImmTy(ImmTyDMask); }
bool isDim() const { return isImmTy(ImmTyDim); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128A16() const { return isImmTy(ImmTyR128A16); }
Expand Down Expand Up @@ -695,6 +697,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
case ImmTyDMask: OS << "DMask"; break;
case ImmTyDim: OS << "Dim"; break;
case ImmTyUNorm: OS << "UNorm"; break;
case ImmTyDA: OS << "DA"; break;
case ImmTyR128A16: OS << "R128A16"; break;
Expand Down Expand Up @@ -926,6 +929,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
// Selects which extra operand syntaxes parseOperand() accepts for the operand
// it is about to parse.
enum OperandMode {
// Regular parsing; this is the default argument of parseOperand().
OperandMode_Default,
// Additionally accept a bracketed, comma-separated register list
// ("[reg, reg, ...]"). ParseInstruction() selects this mode on GFX10 for the
// operand of an "image_*" mnemonic that is parsed when Operands.size() == 2.
OperandMode_NSA,
};

using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>;

Expand Down Expand Up @@ -1065,7 +1072,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseDirective(AsmToken DirectiveID) override;
OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic,
OperandMode Mode = OperandMode_Default);
StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
Expand Down Expand Up @@ -1133,7 +1141,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGAtomicDMask(const MCInst &Inst);
bool validateMIMGGatherDMask(const MCInst &Inst);
bool validateMIMGDataSize(const MCInst &Inst);
bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
bool validateLdsDirect(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
Expand Down Expand Up @@ -1211,6 +1221,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);

OperandMatchResultTy parseDim(OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
AMDGPUOperand::Ptr defaultRowMask() const;
AMDGPUOperand::Ptr defaultBankMask() const;
Expand Down Expand Up @@ -2565,6 +2576,46 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
return (VDataSize / 4) == DataSize + TFESize;
}

/// Check that the address operand(s) of a GFX10 MIMG instruction provide
/// exactly as many dwords as the base opcode and the dim modifier require.
///
/// \param Inst the matched instruction to validate.
/// \returns true when \p Inst is not a MIMG instruction, when the target is
/// not GFX10, or when the address size matches; false on a size mismatch or
/// when the dim encoding has no MIMGDimInfo entry.
bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
  const unsigned Opc = Inst.getOpcode();
  const MCInstrDesc &Desc = MII.get(Opc);

  // Only GFX10 MIMG instructions are checked here.
  if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10())
    return true;

  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
  int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);

  assert(VAddr0Idx != -1);
  assert(SrsrcIdx != -1);
  assert(DimIdx != -1);
  assert(SrsrcIdx > VAddr0Idx);

  unsigned Dim = Inst.getOperand(DimIdx).getImm();
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
  // Without dimension info the required address size cannot be computed;
  // reject the instruction instead of dereferencing a null pointer. This
  // mirrors the null check on the same lookup in
  // SIInstrInfo::verifyInstruction.
  if (!DimInfo)
    return false;

  // More than one operand slot between vaddr0 and srsrc means the opcode
  // takes its address registers individually (NSA form).
  bool IsNSA = SrsrcIdx - VAddr0Idx > 1;

  // Dwords actually supplied: one per address operand in NSA form, otherwise
  // the size of the single vaddr0 register operand in dwords.
  unsigned VAddrSize =
      IsNSA ? SrsrcIdx - VAddr0Idx
            : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;

  // Dwords required by the base opcode for this dimension.
  unsigned AddrSize = BaseOpcode->NumExtraArgs +
                      (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
                      (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
                      (BaseOpcode->LodOrClampOrMip ? 1 : 0);

  // In the non-NSA form, sizes above 4 dwords are rounded up to 8 and sizes
  // above 8 to 16 before comparing against the register operand size.
  if (!IsNSA) {
    if (AddrSize > 8)
      AddrSize = 16;
    else if (AddrSize > 4)
      AddrSize = 8;
  }

  return VAddrSize == AddrSize;
}

bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {

const unsigned Opc = Inst.getOpcode();
Expand Down Expand Up @@ -2621,6 +2672,24 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}

/// Check that the dim operand of a MIMG instruction, if the opcode has one,
/// holds a value in the range [0, 8).
///
/// \param Inst the matched instruction to validate.
/// \returns true when \p Inst is not a MIMG instruction, when the opcode has
/// no dim operand, or when the dim value is in range; false otherwise. The
/// -1 default that cvtMIMG inserts for a missing dim modifier is out of range
/// and is therefore rejected.
bool AMDGPUAsmParser::validateMIMGDim(const MCInst &Inst) {
  const unsigned Opc = Inst.getOpcode();
  const MCInstrDesc &Desc = MII.get(Opc);

  if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
    return true;

  int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
  if (DimIdx < 0)
    return true;

  // Keep the full 64-bit immediate: 'long' is only 32 bits wide on LLP64
  // targets and would truncate the value before the range check.
  const int64_t Imm = Inst.getOperand(DimIdx).getImm();
  return Imm >= 0 && Imm < 8;
}

static bool IsRevOpcode(const unsigned Opcode)
{
switch (Opcode) {
Expand Down Expand Up @@ -2853,11 +2922,20 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"d16 modifier is not supported on this GPU");
return false;
}
if (!validateMIMGDim(Inst)) {
Error(IDLoc, "dim modifier is required on this GPU");
return false;
}
if (!validateMIMGDataSize(Inst)) {
Error(IDLoc,
"image data size does not match dmask and tfe");
return false;
}
if (!validateMIMGAddrSize(Inst)) {
Error(IDLoc,
"image address size does not match dim and a16");
return false;
}
if (!validateMIMGAtomicDMask(Inst)) {
Error(IDLoc,
"invalid atomic image dmask");
Expand Down Expand Up @@ -3217,6 +3295,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
if (IVersion.Major < 10)
return getParser().Error(IDRange.Start, "directive requires gfx10+",
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val,
ValRange);
} else if (ID == ".amdhsa_memory_ordered") {
if (IVersion.Major < 10)
return getParser().Error(IDRange.Start, "directive requires gfx10+",
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val,
ValRange);
} else if (ID == ".amdhsa_forward_progress") {
if (IVersion.Major < 10)
return getParser().Error(IDRange.Start, "directive requires gfx10+",
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val,
ValRange);
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
Expand Down Expand Up @@ -3370,6 +3466,22 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
return TokError(Err.str());
}
Lex();

if (ID == "enable_wgp_mode") {
if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) && !isGFX10())
return TokError("enable_wgp_mode=1 is only allowed on GFX10+");
}

if (ID == "enable_mem_ordered") {
if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) && !isGFX10())
return TokError("enable_mem_ordered=1 is only allowed on GFX10+");
}

if (ID == "enable_fwd_progress") {
if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) && !isGFX10())
return TokError("enable_fwd_progress=1 is only allowed on GFX10+");
}

return false;
}

Expand Down Expand Up @@ -3669,7 +3781,8 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
}

OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
OperandMode Mode) {
// Try to parse with a custom parser
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);

Expand All @@ -3683,6 +3796,35 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
getLexer().is(AsmToken::EndOfStatement))
return ResTy;

if (Mode == OperandMode_NSA && getLexer().is(AsmToken::LBrac)) {
unsigned Prefix = Operands.size();
SMLoc LBraceLoc = getTok().getLoc();
Parser.Lex(); // eat the '['

for (;;) {
ResTy = parseReg(Operands);
if (ResTy != MatchOperand_Success)
return ResTy;

if (getLexer().is(AsmToken::RBrac))
break;

if (getLexer().isNot(AsmToken::Comma))
return MatchOperand_ParseFail;
Parser.Lex();
}

if (Operands.size() - Prefix > 1) {
Operands.insert(Operands.begin() + Prefix,
AMDGPUOperand::CreateToken(this, "[", LBraceLoc));
Operands.push_back(AMDGPUOperand::CreateToken(this, "]",
getTok().getLoc()));
}

Parser.Lex(); // eat the ']'
return MatchOperand_Success;
}

ResTy = parseRegOrImm(Operands);

if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
Expand Down Expand Up @@ -3736,8 +3878,13 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
Name = parseMnemonicSuffix(Name);
Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));

bool IsMIMG = Name.startswith("image_");

while (!getLexer().is(AsmToken::EndOfStatement)) {
OperandMatchResultTy Res = parseOperand(Operands, Name);
OperandMode Mode = OperandMode_Default;
if (IsMIMG && isGFX10() && Operands.size() == 2)
Mode = OperandMode_NSA;
OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);

// Eat the comma or space if there is one.
if (getLexer().is(AsmToken::Comma))
Expand Down Expand Up @@ -5275,14 +5422,16 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
Op.addRegOperands(Inst, 1);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
} else if (!Op.isToken()) {
llvm_unreachable("unexpected operand type");
}
}

bool IsGFX10 = isGFX10();

addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
if (IsGFX10)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
if (IsGFX10)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
Expand All @@ -5291,7 +5440,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
if (!IsGFX10)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}

Expand Down Expand Up @@ -5389,7 +5539,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr},
{"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
Expand All @@ -5404,6 +5554,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
{"dim", AMDGPUOperand::ImmTyDim, false, nullptr},
{"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
{"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
{"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
Expand Down Expand Up @@ -5472,7 +5623,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.Type == AMDGPUOperand::ImmTyNegHi) {
res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
Op.ConvertResult);
} else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) {
} else if (Op.Type == AMDGPUOperand::ImmTyDim) {
res = parseDim(Operands);
} else if (Op.Type == AMDGPUOperand::ImmTyFORMAT && !isGFX10()) {
res = parseDfmtNfmt(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
Expand Down Expand Up @@ -5758,6 +5911,52 @@ bool AMDGPUOperand::isU16Imm() const {
return isImm() && isUInt<16>(getImm());
}

/// Parse a "dim:<name>" modifier and append it to \p Operands as an ImmTyDim
/// immediate carrying the dimension's encoding.
///
/// The name may carry an "SQ_RSRC_IMG_" prefix, which is stripped before the
/// lookup by asm suffix.
///
/// \returns MatchOperand_NoMatch when the target is not GFX10 or the current
/// token is not the identifier "dim"; MatchOperand_ParseFail when "dim" is
/// not followed by ':' and a known dimension name; MatchOperand_Success
/// otherwise.
OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
  if (!isGFX10())
    return MatchOperand_NoMatch;

  const SMLoc StartLoc = Parser.getTok().getLoc();

  const bool AtDimKeyword = getLexer().is(AsmToken::Identifier) &&
                            getLexer().getTok().getString() == "dim";
  if (!AtDimKeyword)
    return MatchOperand_NoMatch;
  Parser.Lex(); // eat "dim"

  if (getLexer().isNot(AsmToken::Colon))
    return MatchOperand_ParseFail;
  Parser.Lex(); // eat ':'

  // A name such as "1D" is lexed as the integer "1" followed by the
  // identifier "D"; glue the two pieces back together, but only when they
  // are directly adjacent in the input.
  std::string Name;
  if (getLexer().is(AsmToken::Integer)) {
    const SMLoc IntEnd = getLexer().getTok().getEndLoc();
    Name = getLexer().getTok().getString();
    Parser.Lex();
    if (getLexer().getTok().getLoc() != IntEnd)
      return MatchOperand_ParseFail;
  }

  if (getLexer().isNot(AsmToken::Identifier))
    return MatchOperand_ParseFail;
  Name += getLexer().getTok().getString();

  // Accept both the bare suffix and the long "SQ_RSRC_IMG_<suffix>" spelling.
  const StringRef LongPrefix = "SQ_RSRC_IMG_";
  StringRef Suffix = Name;
  if (Suffix.startswith(LongPrefix))
    Suffix = Suffix.substr(LongPrefix.size());

  const AMDGPU::MIMGDimInfo *Info = AMDGPU::getMIMGDimInfoByAsmSuffix(Suffix);
  if (!Info)
    return MatchOperand_ParseFail;

  // The identifier is consumed only once it is known to be valid.
  Parser.Lex();

  Operands.push_back(AMDGPUOperand::CreateImm(this, Info->Encoding, StartLoc,
                                              AMDGPUOperand::ImmTyDim));
  return MatchOperand_Success;
}

OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
using namespace AMDGPU::DPP;
Expand Down
146 changes: 106 additions & 40 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,26 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}

if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
Res = convertMIMGInst(MI);
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int RsrcIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
if (VAddr0Idx >= 0 && NSAArgs > 0) {
unsigned NSAWords = (NSAArgs + 3) / 4;
if (Bytes.size() < 4 * NSAWords) {
Res = MCDisassembler::Fail;
} else {
for (unsigned i = 0; i < NSAArgs; ++i) {
MI.insert(MI.begin() + VAddr0Idx + 1 + i,
decodeOperand_VGPR_32(Bytes[i]));
}
Bytes = Bytes.slice(4 * NSAWords);
}
}

if (Res)
Res = convertMIMGInst(MI);
}

if (Res && IsSDWA)
Expand Down Expand Up @@ -339,17 +358,18 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}

// Note that MIMG format provides no information about VADDR size.
// Consequently, decoded instructions always show address
// as if it has 1 dword, which could be not really so.
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show address as if it
// has 1 dword, which could be not really so.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {

int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst);

int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdata);

int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);

Expand All @@ -362,16 +382,42 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
assert(DMaskIdx != -1);
assert(TFEIdx != -1);

const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
bool IsAtomic = (VDstIdx != -1);
bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;

unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
return MCDisassembler::Success;
bool IsNSA = false;
unsigned AddrSize = Info->VAddrDwords;

if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
unsigned DimIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
const AMDGPU::MIMGDimInfo *Dim =
AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());

AddrSize = BaseOpcode->NumExtraArgs +
(BaseOpcode->Gradients ? Dim->NumGradients : 0) +
(BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
(BaseOpcode->LodOrClampOrMip ? 1 : 0);
IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA;
if (!IsNSA) {
if (AddrSize > 8)
AddrSize = 16;
else if (AddrSize > 4)
AddrSize = 8;
} else {
if (AddrSize > Info->VAddrDwords) {
// The NSA encoding does not contain enough operands for the combination
// of base opcode / dimension. Should this be an error?
return MCDisassembler::Success;
}
}
}

unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask);
if (DstSize == 1)
return MCDisassembler::Success;
unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
unsigned DstSize = IsGather4 ? 4 : std::max(countPopulation(DMask), 1u);

bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
if (D16 && AMDGPU::hasPackedD16(STI)) {
Expand All @@ -382,44 +428,64 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (MI.getOperand(TFEIdx).getImm())
return MCDisassembler::Success;

int NewOpcode = -1;
if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
return MCDisassembler::Success;

if (IsGather4) {
if (D16 && AMDGPU::hasPackedD16(STI))
NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2);
else
int NewOpcode =
AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
if (NewOpcode == -1)
return MCDisassembler::Success;

// Widen the register to the correct number of enabled channels.
unsigned NewVdata = AMDGPU::NoRegister;
if (DstSize != Info->VDataDwords) {
auto DataRCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;

// Get first subregister of VData
unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;

NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
&MRI.getRegClass(DataRCID));
if (NewVdata == AMDGPU::NoRegister) {
// It's possible to encode this such that the low register + enabled
// components exceeds the register count.
return MCDisassembler::Success;
} else {
NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize);
if (NewOpcode == -1)
}
}

unsigned NewVAddr0 = AMDGPU::NoRegister;
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA &&
AddrSize != Info->VAddrDwords) {
unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg();
unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0);
VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0;

auto AddrRCID = MCII->get(NewOpcode).OpInfo[VAddr0Idx].RegClass;
NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0,
&MRI.getRegClass(AddrRCID));
if (NewVAddr0 == AMDGPU::NoRegister)
return MCDisassembler::Success;
}

auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
MI.setOpcode(NewOpcode);

// Get first subregister of VData
unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
if (NewVdata != AMDGPU::NoRegister) {
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);

// Widen the register to the correct number of enabled channels.
auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
&MRI.getRegClass(RCID));
if (NewVdata == AMDGPU::NoRegister) {
// It's possible to encode this such that the low register + enabled
// components exceeds the register count.
return MCDisassembler::Success;
if (IsAtomic) {
// Atomic operations have an additional operand (a copy of data)
MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
}
}

MI.setOpcode(NewOpcode);
// vaddr will be always appear as a single VGPR. This will look different than
// how it is usually emitted because the number of register components is not
// in the instruction encoding.
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);

if (IsAtomic) {
// Atomic operations have an additional operand (a copy of data)
MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
if (NewVAddr0 != AMDGPU::NoRegister) {
MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0);
} else if (IsNSA) {
assert(AddrSize <= Info->VAddrDwords);
MI.erase(MI.begin() + VAddr0Idx + AddrSize,
MI.begin() + VAddr0Idx + Info->VAddrDwords);
}

return MCDisassembler::Success;
Expand Down
20 changes: 18 additions & 2 deletions llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,18 @@ void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
}
}

void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned Dim = MI->getOperand(OpNo).getImm();
O << " dim:SQ_RSRC_IMG_";

const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
if (DimInfo)
O << DimInfo->AsmSuffix;
else
O << Dim;
}

void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "unorm");
Expand Down Expand Up @@ -254,8 +266,12 @@ void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (unsigned Val = MI->getOperand(OpNo).getImm()) {
O << " dfmt:" << (Val & 15);
O << ", nfmt:" << (Val >> 4);
if (AMDGPU::isGFX10(STI))
O << " format:" << Val;
else {
O << " dfmt:" << (Val & 15);
O << ", nfmt:" << (Val >> 4);
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
Expand Down
24 changes: 22 additions & 2 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
Expand Down Expand Up @@ -273,7 +274,25 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
}

if (bytes > 4)
// NSA encoding.
if (AMDGPU::isGFX10(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr0);
int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::srsrc);
assert(vaddr0 >= 0 && srsrc > vaddr0);
unsigned NumExtraAddrs = srsrc - vaddr0 - 1;
unsigned NumPadding = (-NumExtraAddrs) & 3;

for (unsigned i = 0; i < NumExtraAddrs; ++i)
OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i),
Fixups, STI));
for (unsigned i = 0; i < NumPadding; ++i)
OS.write(0);
}

if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) ||
(bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]))
return;

// Check for additional literals in SRC0/1/2 (Op 1/2/3)
Expand Down Expand Up @@ -428,7 +447,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
if (Enc != ~0U &&
(Enc != 255 || Desc.getSize() == 4 || Desc.getSize() == 8))
return Enc;

} else if (MO.isImm())
Expand Down
401 changes: 315 additions & 86 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td

Large diffs are not rendered by default.

79 changes: 64 additions & 15 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,13 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
// GFX9 added a 13-bit signed offset. When using regular flat instructions,
// the sign bit is ignored and is treated as a 12-bit unsigned offset.

// GFX10 shrank the signed offset to 12 bits. When using regular flat
// instructions, the sign bit is also ignored and the offset is treated as an
// 11-bit unsigned offset.

if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;

// Just r + i
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
Expand Down Expand Up @@ -2828,8 +2835,9 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,

}

if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
if ((Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||
Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) &&
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
}
Expand Down Expand Up @@ -4656,7 +4664,7 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
}

static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
SDValue *GLC, SDValue *SLC) {
SDValue *GLC, SDValue *SLC, SDValue *DLC) {
auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());

uint64_t Value = CachePolicyConst->getZExtValue();
Expand All @@ -4669,6 +4677,10 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
*SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x2;
}
if (DLC) {
*DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x4;
}

return Value == 0;
}
Expand Down Expand Up @@ -4786,6 +4798,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;

SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
Expand Down Expand Up @@ -4924,7 +4937,22 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
VAddrs.push_back(Op.getOperand(AddrIdx + i));
}

SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
// If the register allocator cannot place the address registers contiguously
// without introducing moves, then using the non-sequential address encoding
// is always preferable, since it saves VALU instructions and is usually a
// wash in terms of code size or even better.
//
// However, we currently have no way of hinting to the register allocator that
// MIMG addresses should be placed contiguously when it is possible to do so,
// so force non-NSA for the common 2-address case as a heuristic.
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
bool UseNSA =
ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
SDValue VAddr;
if (!UseNSA)
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);

SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
Expand Down Expand Up @@ -4987,45 +5015,66 @@ SDValue SITargetLowering::lowerImage(SDValue Op,

SDValue GLC;
SDValue SLC;
SDValue DLC;
if (BaseOpcode->Atomic) {
GLC = True; // TODO no-return optimization
if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
IsGFX10 ? &DLC : nullptr))
return Op;
} else {
if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
IsGFX10 ? &DLC : nullptr))
return Op;
}

SmallVector<SDValue, 14> Ops;
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
Ops.push_back(VAddr);
if (UseNSA) {
for (const SDValue &Addr : VAddrs)
Ops.push_back(Addr);
} else {
Ops.push_back(VAddr);
}
Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
if (BaseOpcode->Sampler)
Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
if (IsGFX10)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
if (IsGFX10)
Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
Ops.push_back(IsA16 && // a16 or r128
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
Ops.push_back(TFE); // tfe
Ops.push_back(LWE); // lwe
Ops.push_back(DimInfo->DA ? True : False);
if (!IsGFX10)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
if (isa<MemSDNode>(Op))
Ops.push_back(Op.getOperand(0)); // chain

int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
int NumVAddrDwords =
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;

if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
if (IsGFX10) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
NumVDataDwords, NumVAddrDwords);
}
assert(Opcode != -1);

MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
Expand Down
32 changes: 26 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -246,38 +246,58 @@ class VINTRPe <bits<2> op> : Enc32 {
let Inst{31-26} = 0x32; // encoding
}

class MIMGe <bits<7> op> : Enc64 {
class MIMGe : Enc64 {
bits<8> vdata;
bits<4> dmask;
bits<1> unorm;
bits<1> glc;
bits<1> da;
bits<1> r128;
bits<1> tfe;
bits<1> lwe;
bits<1> slc;
bit d16;
bits<8> vaddr;
bits<7> srsrc;
bits<7> ssamp;

let Inst{11-8} = dmask;
let Inst{12} = unorm;
let Inst{13} = glc;
let Inst{14} = da;
let Inst{15} = r128;
let Inst{16} = tfe;
let Inst{17} = lwe;
let Inst{24-18} = op;
let Inst{25} = slc;
let Inst{31-26} = 0x3c;
let Inst{39-32} = vaddr;
let Inst{47-40} = vdata;
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
let Inst{63} = d16;
}

// MIMG encoding variant that adds, on top of MIMGe, a 7-bit opcode, the da
// bit and a single 8-bit vaddr field.
class MIMGe_gfx6789 <bits<7> op> : MIMGe {
bits<8> vaddr;
bits<1> da;

// da occupies bit 14, the opcode bits 24-18 and vaddr bits 39-32.
let Inst{14} = da;
let Inst{24-18} = op;
let Inst{39-32} = vaddr;
}

// MIMG encoding variant that adds, on top of MIMGe, an 8-bit opcode and the
// vaddr0, dim, nsa, dlc and a16 fields.
// NOTE(review): nsa presumably counts the extra address dwords that follow
// the 64-bit base encoding -- confirm against MIMGInstructions.td.
class MIMGe_gfx10 <bits<8> op> : MIMGe {
bits<8> vaddr0;
bits<3> dim;
bits<2> nsa;
bits<1> dlc;
bits<1> a16 = 0; // TODO: this should be an operand

// The 8-bit opcode is split: its top bit goes into bit 0 of the encoding and
// its low seven bits into bits 24-18.
let Inst{0} = op{7};
let Inst{2-1} = nsa;
let Inst{5-3} = dim;
let Inst{7} = dlc;
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr0;
// a16 is currently a fixed 0 (see the field default above).
let Inst{62} = a16;
}

class EXPe : Enc64 {
bits<4> en;
bits<6> tgt;
Expand Down
63 changes: 60 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3223,6 +3223,53 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}

if (isMIMG(MI)) {
const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
if (DimOp) {
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
AMDGPU::OpName::vaddr0);
int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
const AMDGPU::MIMGDimInfo *Dim =
AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());

if (!Dim) {
ErrInfo = "dim is out of range";
return false;
}

bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
unsigned AddrWords = BaseOpcode->NumExtraArgs +
(BaseOpcode->Gradients ? Dim->NumGradients : 0) +
(BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
(BaseOpcode->LodOrClampOrMip ? 1 : 0);

unsigned VAddrWords;
if (IsNSA) {
VAddrWords = SRsrcIdx - VAddr0Idx;
} else {
const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
if (AddrWords > 8)
AddrWords = 16;
else if (AddrWords > 4)
AddrWords = 8;
else if (AddrWords == 3 && VAddrWords == 4) {
// CodeGen uses the V4 variant of instructions for three addresses,
// because the selection DAG does not support non-power-of-two types.
AddrWords = 4;
}
}

if (VAddrWords != AddrWords) {
ErrInfo = "bad vaddr size";
return false;
}
}
}

const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
if (DppCt) {
using namespace AMDGPU::DPP;
Expand Down Expand Up @@ -5356,25 +5403,35 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return DescSize; // No operands.

if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
return DescSize + 4;
return isVOP3(MI) ? 12 : (DescSize + 4);

int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return DescSize;

if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
return DescSize + 4;
return isVOP3(MI) ? 12 : (DescSize + 4);

int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx == -1)
return DescSize;

if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
return DescSize + 4;
return isVOP3(MI) ? 12 : (DescSize + 4);

return DescSize;
}

// Check whether we have extra NSA words.
if (isMIMG(MI)) {
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
if (VAddr0Idx < 0)
return 8;

int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
}

switch (Opc) {
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,7 @@ def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;

def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>;

def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
Expand Down
100 changes: 100 additions & 0 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class SIShrinkInstructions : public MachineFunctionPass {
public:
static char ID;

void shrinkMIMG(MachineInstr &MI);

public:
SIShrinkInstructions() : MachineFunctionPass(ID) {
}
Expand Down Expand Up @@ -211,6 +213,96 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
}
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
//
// The rewrite is performed only when
//  - MI has a MIMG table entry whose encoding is MIMGEncGfx10NSA,
//  - its address operands name consecutive hardware VGPR indices,
//  - the (possibly widened) register tuple does not extend past index 255, and
//  - an opcode with MIMGEncGfx10Default exists for the resulting address size.
// In every other case MI is left unmodified.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  // The lookup hands back a pointer; never dereference it without an entry.
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  // Choose the register tuple class for the merged address operand. Sizes of
  // 5..8 dwords are rounded up to 8 and anything larger to 16.
  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords <= 8) {
    RC = &AMDGPU::VReg_256RegClass;
    NewAddrDwords = 8;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  // The merged operand is undef only if every original address is undef.
  bool IsUndef = true;
  // It is kill only if every original address is kill and the tuple was not
  // widened beyond the original operands.
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());

    // Address i must live in hardware register VgprBase + i.
    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Resolve the replacement opcode before mutating MI, so that a failed
  // lookup (-1) leaves the instruction and its operand ties intact.
  int NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  if (NewOpcode == -1)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  // A named operand index of -1 means the opcode has no such operand; treat a
  // missing tfe/lwe as disabled rather than indexing operands with -1.
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  // Switch to the non-NSA opcode and turn vaddr0 into the whole tuple.
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  // Drop the remaining per-dword address operands; each removal shifts the
  // next one into position VAddr0Idx + 1.
  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.RemoveOperand(VAddr0Idx + 1);

  // Re-tie the implicit operand to vdata. Its index moved down by the number
  // of address operands removed above.
  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
Expand Down Expand Up @@ -597,6 +689,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}

if (TII->isMIMG(MI.getOpcode()) &&
ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
MF.getProperties().hasProperty(
MachineFunctionProperties::Property::NoVRegs)) {
shrinkMIMG(MI);
continue;
}

if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ struct MIMGDimInfo {
uint8_t NumCoords;
uint8_t NumGradients;
bool DA;
uint8_t Encoding;
const char *AsmSuffix;
};

LLVM_READONLY
Expand Down
64 changes: 43 additions & 21 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}atomic_swap_1d:
; GCN: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -11,7 +13,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_1d:
; GCN: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -20,7 +23,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_sub_1d:
; GCN: image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -29,7 +33,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_smin_1d:
; GCN: image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -38,7 +43,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_umin_1d:
; GCN: image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -47,7 +53,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_smax_1d:
; GCN: image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -56,7 +63,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_umax_1d:
; GCN: image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -65,7 +73,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_and_1d:
; GCN: image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -74,7 +83,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_or_1d:
; GCN: image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -83,7 +93,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_xor_1d:
; GCN: image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -92,7 +103,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_inc_1d:
; GCN: image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -101,7 +113,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_dec_1d:
; GCN: image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -110,7 +123,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_cmpswap_1d:
; GCN: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc{{$}}
; GFX6789: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc{{$}}
; GFX10: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc ;
define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -119,7 +133,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_2d:
; GCN: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm glc ;
define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -128,7 +143,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_3d:
; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc ;
define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -137,7 +153,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_cube:
; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX6789: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX10: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc ;
define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -146,7 +163,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_1darray:
; GCN: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX6789: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX10: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm glc ;
define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -155,7 +173,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_2darray:
; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX6789: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX10: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc ;
define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -164,7 +183,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_2dmsaa:
; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc{{$}}
; GFX6789: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc{{$}}
; GFX10: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc ;
define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -173,7 +193,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_2darraymsaa:
; GCN: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX6789: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da{{$}}
; GFX10: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc ;
define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -182,7 +203,8 @@ main_body:
}

; GCN-LABEL: {{^}}atomic_add_1d_slc:
; GCN: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc{{$}}
; GFX6789: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc{{$}}
; GFX10: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc slc ;
define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
Expand Down
20 changes: 15 additions & 5 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}image_load_f16:
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:
%tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -13,6 +15,7 @@ main_body:
; GCN-LABEL: {{^}}image_load_v2f16:
; UNPACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps float @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:
%tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -23,6 +26,7 @@ main_body:
; GCN-LABEL: {{^}}image_load_v4f16:
; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -33,6 +37,7 @@ main_body:
; GCN-LABEL: {{^}}image_load_mip_v4f16:
; UNPACKED: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_mip_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -43,6 +48,7 @@ main_body:
; GCN-LABEL: {{^}}image_load_3d_v2f16:
; UNPACKED: image_load v[0:1], v[0:3], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_load v0, v[0:3], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_load v0, v[0:2], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm d16{{$}}
define amdgpu_ps float @image_load_3d_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
main_body:
%tex = call <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32 3, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -51,7 +57,8 @@ main_body:
}

; GCN-LABEL: {{^}}image_store_f16
; GCN: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX89: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
main_body:
call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
Expand All @@ -63,6 +70,7 @@ main_body:
; UNPACKED: v_and_b32_e32
; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) {
main_body:
%data = bitcast float %in to <2 x half>
Expand All @@ -77,6 +85,7 @@ main_body:
; UNPACKED: v_and_b32_e32
; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
main_body:
%data = bitcast <2 x float> %in to <4 x half>
Expand All @@ -91,6 +100,7 @@ main_body:
; UNPACKED: v_and_b32_e32
; UNPACKED: image_store_mip v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16{{$}}
define amdgpu_ps void @image_store_mip_1d_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %mip, <2 x float> %in) {
main_body:
%data = bitcast <2 x float> %in to <4 x half>
Expand Down
8 changes: 5 additions & 3 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
; UNPACKED: image_gather4_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x4 d16{{$}}
; PACKED: image_gather4_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x4 d16{{$}}
; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}
define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32 4, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
Expand Down
56 changes: 37 additions & 19 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
Original file line number Diff line number Diff line change
@@ -1,136 +1,154 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}gather4_2d:
; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_cube:
; GCN: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 da{{$}}
; GFX6789: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 da{{$}}
; GFX10: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ;
define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_2darray:
; GCN: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 da{{$}}
; GFX6789: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 da{{$}}
; GFX10: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ;
define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_c_2d:
; GCN: image_gather4_c v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_c v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_cl_2d:
; GCN: image_gather4_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_c_cl_2d:
; GCN: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_b_2d:
; GCN: image_gather4_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_c_b_2d:
; GCN: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_b_cl_2d:
; GCN: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_c_b_cl_2d:
; GCN: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the gather4.c.b.cl 2D intrinsic.
; Passes %bias, %zcompare, %s, %t and %clamp (five float address operands) plus
; the inreg %rsrc / %samp descriptors and returns the <4 x float> result. The
; expected assembly uses an eight-register address range (v[0:7]) for these
; five operands.
define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_l_2d:
; GCN: image_gather4_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the gather4.l 2D intrinsic.
; Passes %s, %t and %lod (three float address operands) plus the inreg
; %rsrc / %samp descriptors and returns the <4 x float> result. The leading
; i32 1 lines up with the dmask value 0x1 in the expected assembly.
define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_c_l_2d:
; GCN: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the gather4.c.l 2D intrinsic.
; Passes %zcompare, %s, %t and %lod (four float address operands) plus the
; inreg %rsrc / %samp descriptors and returns the <4 x float> result. The
; leading i32 1 lines up with the dmask value 0x1 in the expected assembly.
define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_lz_2d:
; GCN: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the gather4.lz 2D intrinsic.
; Passes only the %s and %t coordinates plus the inreg %rsrc / %samp
; descriptors and returns the <4 x float> result. The leading i32 1 lines up
; with the dmask value 0x1 in the expected assembly.
define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_c_lz_2d:
; GCN: image_gather4_c_lz v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX6789: image_gather4_c_lz v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}}
; GFX10: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the gather4.c.lz 2D intrinsic.
; Passes %zcompare, %s and %t (three float address operands) plus the inreg
; %rsrc / %samp descriptors and returns the <4 x float> result. The leading
; i32 1 lines up with the dmask value 0x1 in the expected assembly.
define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_2d_dmask_2:
; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2{{$}}
; GFX6789: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2{{$}}
; GFX10: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the plain gather4 2D intrinsic with a leading
; operand of i32 2, which lines up with the dmask value 0x2 in the expected
; assembly. Passes %s and %t plus the inreg %rsrc / %samp descriptors and
; returns the <4 x float> result.
define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_2d_dmask_4:
; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4{{$}}
; GFX6789: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4{{$}}
; GFX10: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D ;
; amdgpu_ps test function for the plain gather4 2D intrinsic with a leading
; operand of i32 4, which lines up with the dmask value 0x4 in the expected
; assembly. Passes %s and %t plus the inreg %rsrc / %samp descriptors and
; returns the <4 x float> result.
define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}gather4_2d_dmask_8:
; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8{{$}}
; GFX6789: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8{{$}}
; GFX10: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D ;
define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down
11 changes: 7 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}getlod_1d:
; GCN: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}}
; PRE-GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}}
; GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <4 x float> @getlod_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
Expand All @@ -11,7 +13,8 @@ main_body:
}

; GCN-LABEL: {{^}}getlod_2d:
; GCN: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3{{$}}
; PRE-GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3{{$}}
; GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_2D
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <2 x float> @getlod_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
Expand Down
91 changes: 91 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NONSA %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NSA %s

; GCN-LABEL: {{^}}sample_2d:
;
; TODO: use NSA here
; GCN: v_mov_b32_e32 v2, v0
;
; GCN: image_sample v[0:3], v[1:2],
; amdgpu_ps test function for the 2D image sample intrinsic.
; The coordinates are declared as (%t, %s) but handed to the intrinsic as
; (%s, %t), so the address operands are not in declaration order. The expected
; assembly for both configurations still shows a register copy followed by a
; contiguous v[1:2] address range.
define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}sample_3d:
; NONSA: v_mov_b32_e32 v3, v0
; NONSA: image_sample v[0:3], v[1:4],
; NSA: image_sample v[0:3], [v1, v2, v0],
; amdgpu_ps test function for the 3D image sample intrinsic.
; The coordinates are declared as (%r, %s, %t) but handed to the intrinsic as
; (%s, %t, %r), i.e. rotated relative to declaration order. The expected
; assembly differs between the two configurations of this test (a copy plus a
; contiguous range in one, a bracketed register list in the other).
define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}sample_d_3d:
; NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
; amdgpu_ps test function for the 3D image sample.d intrinsic with nine float
; address operands. The parameters are declared in the order
; (s, r, t, dsdh, dtdv, dsdv, drdv, drdh, dtdh) but handed to the intrinsic as
; (dsdh, dtdh, drdh, dsdv, dtdv, drdv, s, t, r), so none of the operands sit
; in declaration order.
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

; GCN-LABEL: {{^}}sample_contig_nsa:
; GCN: image_sample_c_l v0, v[0:7],
; NSA: image_sample v1, [v6, v7, v5],
; amdgpu_ps test function issuing two scalar-result sample calls and returning
; both results packed into a <2 x float>.
; First call (sample.c.l.3d) receives %zcompare, %s1, %t1, %r1, %lod in the same
; order as they are declared. Second call (sample.3d) receives (%s2, %t2, %r2)
; although they are declared as (%r2, %s2, %t2), i.e. out of declaration order.
define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
%v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
; Pack the two scalar results into lanes 0 and 1 of the returned vector.
%r.0 = insertelement <2 x float> undef, float %v1, i32 0
%r = insertelement <2 x float> %r.0, float %v2, i32 1
ret <2 x float> %r
}

; GCN-LABEL: {{^}}sample_nsa_nsa:
; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
; NSA: image_sample v1, [v6, v7, v5],
; amdgpu_ps test function issuing two scalar-result sample calls and returning
; both results packed into a <2 x float>.
; First call (sample.c.l.3d) passes %lod last although it is declared first, so
; its operands are out of declaration order. Second call (sample.3d) receives
; (%s2, %t2, %r2) although they are declared as (%r2, %s2, %t2), so it is out
; of declaration order as well.
define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
%v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
; Pack the two scalar results into lanes 0 and 1 of the returned vector.
%r.0 = insertelement <2 x float> undef, float %v1, i32 0
%r = insertelement <2 x float> %r.0, float %v2, i32 1
ret <2 x float> %r
}

; GCN-LABEL: {{^}}sample_nsa_contig:
; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
; NSA: image_sample v1, v[5:7],
; amdgpu_ps test function issuing two scalar-result sample calls and returning
; both results packed into a <2 x float>.
; First call (sample.c.l.3d) passes %lod last although it is declared first, so
; its operands are out of declaration order. Second call (sample.3d) receives
; (%s2, %t2, %r2) in exactly the order they are declared.
define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
%v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
; Pack the two scalar results into lanes 0 and 1 of the returned vector.
%r.0 = insertelement <2 x float> undef, float %v1, i32 0
%r = insertelement <2 x float> %r.0, float %v2, i32 1
ret <2 x float> %r
}

; GCN-LABEL: {{^}}sample_contig_contig:
; GCN: image_sample_c_l v0, v[0:7],
; NSA: image_sample v1, v[5:7],
; NONSA: image_sample v1, v[5:8],
; amdgpu_ps test function issuing two scalar-result sample calls and returning
; both results packed into a <2 x float>.
; Both calls receive their float address operands in exactly the order the
; parameters are declared: (%zcompare, %s1, %t1, %r1, %lod) for sample.c.l.3d
; and (%s2, %t2, %r2) for sample.3d.
define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
%v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
; Pack the two scalar results into lanes 0 and 1 of the returned vector.
%r.0 = insertelement <2 x float> undef, float %v1, i32 0
%r = insertelement <2 x float> %r.0, float %v2, i32 1
ret <2 x float> %r
}


; Declarations of the image sample intrinsics called by the functions in this
; test. The first group returns <4 x float>, the second group returns a single
; float; all of them reference attribute group #1.
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

declare float @llvm.amdgcn.image.sample.3d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

; Attribute group shared by every intrinsic declaration above.
attributes #1 = { nounwind readonly }
12 changes: 8 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}image_sample_2d_f16:
; GCN: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16{{$}}
; GFX89: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16{{$}}
; GFX10: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16{{$}}
define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
main_body:
%tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
Expand All @@ -26,6 +28,7 @@ main_body:
; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16:
; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
; GFX10: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16{{$}}
define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
main_body:
%tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
Expand All @@ -52,6 +55,7 @@ main_body:
; GCN-LABEL: {{^}}image_sample_b_2d_v4f16:
; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
; GFX10: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16{{$}}
define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
Expand Down
206 changes: 134 additions & 72 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll

Large diffs are not rendered by default.

21 changes: 13 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED,PREGFX10,PREGFX10-UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s

; GCN-LABEL: {{^}}tbuffer_load_d16_x:
; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0
; amdgpu_ps test function that loads a single half through the raw tbuffer
; load intrinsic and returns it. Apart from the inreg %rsrc descriptor all
; operands are immediates (0, 0, 22, 0). The value 22 appears verbatim as
; format 22 in the gfx10 expectation and as dfmt 6 / nfmt 1 in the older
; expectation, which is consistent with 22 == 6 + 1 * 16 (presumably dfmt in
; the low four bits and nfmt above them -- confirm against the intrinsic docs).
define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
main_body:
%data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0)
ret half %data
}

; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]

; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], format:22, 0
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
main_body:
Expand All @@ -24,10 +27,12 @@ main_body:
}

; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]

; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
main_body:
Expand Down
30 changes: 17 additions & 13 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
Original file line number Diff line number Diff line change
@@ -1,36 +1,39 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED,PREGFX10,PREGFX10-UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s


; GCN-LABEL: {{^}}tbuffer_store_d16_x:
; GCN: s_load_dwordx4
; GCN: s_load_dword s[[S_LO:[0-9]+]]
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
; GCN: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; GCN-DAG: s_load_dwordx4
; GCN-DAG: s_load_dword s[[S_LO:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; GFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], format:33, 0
; amdgpu_kernel test function that stores a single half through the raw
; tbuffer store intrinsic. %rsrc and %data are ordinary kernel arguments; the
; remaining operands are immediates (0, 0, 33, 0). The value 33 appears
; verbatim as format 33 in the gfx10 expectation and as dfmt 1 / nfmt 2 in the
; older expectation, which is consistent with 33 == 1 + 2 * 16 (presumably dfmt
; in the low four bits and nfmt above them -- confirm against the intrinsic
; docs).
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
}

; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}},
; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0

; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], format:33, 0
; amdgpu_kernel test function that stores a <2 x half> through the raw tbuffer
; store intrinsic. %rsrc and %data are ordinary kernel arguments; the remaining
; operands are immediates (0, 0, 33, 0), where 33 matches format 33 in the
; gfx10 expectation and dfmt 1 / nfmt 2 in the older expectations.
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
}

; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}},

; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
Expand All @@ -40,12 +43,13 @@ main_body:

; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0


; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:33, 0
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
Expand Down
34 changes: 22 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE,PREGFX10 %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,PREGFX10 %s
;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}tbuffer_store:
; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0
; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
; PREGFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 glc
; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:44, 0
; GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], format:61, 0 glc
; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 slc
; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 glc dlc
define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
main_body:
%in1 = bitcast <4 x float> %1 to <4 x i32>
Expand All @@ -14,12 +19,13 @@ main_body:
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 44, i32 0)
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 61, i32 1)
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 2)
call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 0)
call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 5)
ret void
}

; GCN-LABEL: {{^}}tbuffer_store_immoffs:
; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, 0 offset:42
define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
%in1 = bitcast <4 x float> %1 to <4 x i32>
Expand All @@ -28,7 +34,8 @@ main_body:
}

; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, {{s[0-9]+}} offset:42
define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
Expand All @@ -37,7 +44,8 @@ main_body:
}

; GCN-LABEL: {{^}}buffer_store_ofs:
; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:115, 0 offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
Expand All @@ -46,7 +54,8 @@ main_body:
}

; GCN-LABEL: {{^}}buffer_store_x1:
; GCN: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0
; PREGFX10: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0
; GFX10: tbuffer_store_format_x v0, off, s[0:3], format:125, 0
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) {
main_body:
%data.i = bitcast float %data to i32
Expand All @@ -55,7 +64,8 @@ main_body:
}

; GCN-LABEL: {{^}}buffer_store_x2:
; GCN: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0
; PREGFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0
; GFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], format:33, 0
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) {
main_body:
%data.i = bitcast <2 x float> %data to <2 x i32>
Expand Down
24 changes: 14 additions & 10 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED,PREGFX10,PREGFX10-UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s

; GCN-LABEL: {{^}}tbuffer_load_d16_x:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen
define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
main_body:
%data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
Expand All @@ -13,10 +15,11 @@ main_body:

; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]

; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
main_body:
Expand All @@ -27,10 +30,11 @@ main_body:

; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]

; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
main_body:
Expand Down
Loading