192 changes: 191 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
"Have scratch_* flat memory instructions"
>;

// Subtarget has the scalar s_scratch_* flat memory instructions.
// Sets GCNSubtarget::ScalarFlatScratchInsts, read back through
// hasScalarFlatScratchInsts() and the HasScalarFlatScratchInsts predicate.
def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
"ScalarFlatScratchInsts",
"true",
"Have s_scratch_* flat memory instructions"
>;

def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
Expand Down Expand Up @@ -115,12 +121,72 @@ def FeatureXNACK : SubtargetFeature<"xnack",
"Enable XNACK support"
>;

// Selects CU wavefront execution mode. Sets GCNSubtarget::EnableCuMode
// (isCuModeEnabled()). When set, hasLDSMisalignedBug() reports false and
// initDefaultAMDKernelCodeT emits WGP_MODE = 0 for ISA major version >= 10.
def FeatureCuMode : SubtargetFeature<"cumode",
"EnableCuMode",
"true",
"Enable CU wavefront execution mode"
>;

// Hardware bug flag; sets GCNSubtarget::SGPRInitBug.
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;

// Hardware bug flag; sets GCNSubtarget::LDSMisalignedBug. The accessor
// hasLDSMisalignedBug() additionally requires that CU mode is NOT enabled,
// so the raw feature bit alone does not imply the workaround is active.
// Enabled by FeatureISAVersion10_1_0.
def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
"LDSMisalignedBug",
"true",
"Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
>;

// Hardware hazard flag; sets GCNSubtarget::HasVcmpxPermlaneHazard
// (hasVcmpxPermlaneHazard()). Member of FeatureGroup.GFX10_1_Bugs.
// NOTE(review): the description replaces the former "TODO: describe me"
// placeholder; wording is derived from the feature name and the hazard
// recognizer's handling - confirm against the hardware documentation.
def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard",
"HasVcmpxPermlaneHazard",
"true",
"V_CMPX followed by V_PERMLANE* hazard"
>;

// Hardware hazard flag; sets GCNSubtarget::HasVMEMtoScalarWriteHazard
// (hasVMEMtoScalarWriteHazard()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard",
"HasVMEMtoScalarWriteHazard",
"true",
"VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution."
>;

// Hardware hazard flag; sets GCNSubtarget::HasSMEMtoVectorWriteHazard
// (hasSMEMtoVectorWriteHazard()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard",
"HasSMEMtoVectorWriteHazard",
"true",
"s_load_dword followed by v_cmp page faults"
>;

// Hardware bug flag; sets GCNSubtarget::HasInstFwdPrefetchBug
// (hasInstFwdPrefetchBug()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
"HasInstFwdPrefetchBug",
"true",
"S_INST_PREFETCH instruction causes shader to hang"
>;

// Hardware hazard flag; sets GCNSubtarget::HasVcmpxExecWARHazard
// (hasVcmpxExecWARHazard()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
"V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)"
>;

// Hardware hazard flag; sets GCNSubtarget::HasLdsBranchVmemWARHazard
// (hasLdsBranchVmemWARHazard()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard",
"HasLdsBranchVmemWARHazard",
"true",
"Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
>;

// Hardware bug flag; sets GCNSubtarget::HasNSAtoVMEMBug
// (hasNSAtoVMEMBug()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
"HasNSAtoVMEMBug",
"true",
"MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero"
>;

// Hardware bug flag; sets GCNSubtarget::HasFlatSegmentOffsetBug
// (hasFlatSegmentOffsetBug()). Member of FeatureGroup.GFX10_1_Bugs.
def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
"HasFlatSegmentOffsetBug",
"true",
"GFX10 bug, inst_offset ignored in flat segment"
>;

class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
Expand Down Expand Up @@ -155,6 +221,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"Additional instructions for GFX9+"
>;

// Marks availability of the GFX10+ instruction set. Sets
// GCNSubtarget::GFX10Insts and is implied by FeatureGFX10. Assembler
// predicates in this file test it both positively (isGFX10Plus, HasSDWA10)
// and negatively (isGFX6GFX7, isGFX7Only, isGFX6GFX7GFX8GFX9).
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"GFX10Insts",
"true",
"Additional instructions for GFX10+"
>;

def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
"GFX7GFX8GFX9Insts",
"true",
Expand Down Expand Up @@ -257,6 +329,12 @@ def FeatureR128A16 : SubtargetFeature<"r128-a16",
"Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
>;

// NSA encoding support for image instructions. Sets
// GCNSubtarget::HasNSAEncoding (hasNSAEncoding()); enabled by
// FeatureISAVersion10_1_0 rather than by the FeatureGFX10 generation set.
def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
"HasNSAEncoding",
"true",
"Support NSA encoding for image instructions"
>;

def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"HasIntClamp",
"true",
Expand Down Expand Up @@ -299,6 +377,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
"Enable SRAM ECC"
>;

// Sets GCNSubtarget::HasNoSdstCMPX (hasNoSdstCMPX()). Implied by
// FeatureGFX10; selects between the HasNoSdstCMPX / HasSdstCMPX predicates.
def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx",
"HasNoSdstCMPX",
"true",
"V_CMPX does not write VCC/SGPR in addition to EXEC"
>;

// Sets GCNSubtarget::HasVscnt (hasVscnt()). Implied by FeatureGFX10.
def FeatureVscnt : SubtargetFeature<"vscnt",
"HasVscnt",
"true",
"Has separate store vscnt counter"
>;

// Sets GCNSubtarget::HasRegisterBanking (hasRegisterBanking()).
// Implied by FeatureGFX10.
def FeatureRegisterBanking : SubtargetFeature<"register-banking",
"HasRegisterBanking",
"true",
"Has register banking"
>;

// Sets GCNSubtarget::HasVOP3Literal (hasVOP3Literal()).
// Implied by FeatureGFX10.
def FeatureVOP3Literal : SubtargetFeature<"vop3-literal",
"HasVOP3Literal",
"true",
"Can use one literal in VOP3"
>;

// Sets GCNSubtarget::HasNoDataDepHazard (hasNoDataDepHazard()).
// Implied by FeatureGFX10.
def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
"HasNoDataDepHazard",
"true",
"Does not need SW waitstates"
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
Expand Down Expand Up @@ -487,7 +595,24 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
FeatureScalarAtomics, FeatureR128A16
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16
]
>;

// Generation feature set for GFX10: the list below is what selecting the
// "gfx10" generation implies.
// Note that FeatureGCN3Encoding is NOT in this list; the GFX10-aware
// assembler predicates in this file (isGFX6GFX7GFX10, isGFX7GFX10,
// HasSDWA10) depend on "!FeatureGCN3Encoding" holding for GFX10.
// Per-chip bug/hazard flags are not listed here either; they are added by
// the ISA version sets (see FeatureISAVersion10_1_0).
def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
"gfx10",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureFlatAddressSpace,
FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P,
FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
FeatureVOP3Literal, FeatureNoDataDepHazard,
FeatureDoesNotSupportSRAMECC
]
>;

Expand Down Expand Up @@ -601,6 +726,34 @@ def FeatureISAVersion9_0_9 : FeatureSet<
FeatureXNACK,
FeatureCodeObjectV3]>;

// Named lists of subtarget features that are shared between ISA version
// definitions. Members are spliced into a FeatureSet with !listconcat
// (see FeatureISAVersion10_1_0).
// TODO: Organize more features into groups.
def FeatureGroup {
// Bugs present on gfx10.1.
list<SubtargetFeature> GFX10_1_Bugs = [
FeatureVcmpxPermlaneHazard,
FeatureVMEMtoScalarWriteHazard,
FeatureSMEMtoVectorWriteHazard,
FeatureInstFwdPrefetchBug,
FeatureVcmpxExecWARHazard,
FeatureLdsBranchVmemWARHazard,
FeatureNSAtoVMEMBug,
FeatureFlatSegmentOffsetBug
];
}

// Feature set for ISA version 10.1.0 (used by the "gfx1010" processor
// model): the shared gfx10.1 bug list concatenated with the GFX10
// generation features and the chip-specific additions listed here.
def FeatureISAVersion10_1_0 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
[FeatureGFX10,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureNSAEncoding,
FeatureWavefrontSize64,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureLdsMisalignedBug,
FeatureCodeObjectV3])>;

//===----------------------------------------------------------------------===//

def AMDGPUInstrInfo : InstrInfo {
Expand Down Expand Up @@ -687,10 +840,21 @@ def isGFX6 :
// Matches SOUTHERN_ISLANDS and SEA_ISLANDS only. The assembler side must
// exclude GFX10 explicitly (!FeatureGFX10Insts) because "!FeatureGCN3Encoding"
// alone is also satisfied by GFX10.
def isGFX6GFX7 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">;

// Matches SOUTHERN_ISLANDS, SEA_ISLANDS and GFX10. For the assembler this
// is expressed as the absence of FeatureGCN3Encoding.
def isGFX6GFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
AssemblerPredicate<"!FeatureGCN3Encoding">;

// Matches SEA_ISLANDS only. The assembler side excludes GFX10 explicitly,
// since GFX10 also has FeatureCIInsts without FeatureGCN3Encoding.
def isGFX7Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">;

// Matches SEA_ISLANDS and GFX10: CI instructions available and no GCN3
// encoding.
def isGFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">;

def isGFX7GFX8GFX9 :
Expand All @@ -699,6 +863,13 @@ def isGFX7GFX8GFX9 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">;

// Matches every GCN generation before GFX10. For the assembler this is
// simply the absence of FeatureGFX10Insts.
def isGFX6GFX7GFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"!FeatureGFX10Insts">;

// Matches SEA_ISLANDS and every later generation (generation >= SEA_ISLANDS).
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<"FeatureCIInsts">;
Expand All @@ -724,13 +895,19 @@ def isGFX8GFX9 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">;

// Matches GFX10 and every later generation (generation >= GFX10).
def isGFX10Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
AssemblerPredicate<"FeatureGFX10Insts">;

// Subtarget supports the flat address space.
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<"FeatureFlatAddressSpace">;

// Availability of the flat global / scratch / scalar-scratch instruction
// families; each pairs the GCNSubtarget accessor with its feature bit.
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<"FeatureFlatGlobalInsts">;
def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
AssemblerPredicate<"FeatureFlatScratchInsts">;
def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">,
AssemblerPredicate<"FeatureScalarFlatScratchInsts">;
// Note: the assembler side keys off FeatureGFX9Insts; no dedicated
// D16 load/store feature bit is used here.
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;

Expand Down Expand Up @@ -766,6 +943,10 @@ def HasSDWA9 :
Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">;

// SDWA on GFX10. The codegen predicate is the same hasSDWA() check used by
// HasSDWA9; only the assembler predicate distinguishes the two, by
// requiring FeatureGFX10Insts and the absence of FeatureGCN3Encoding.
def HasSDWA10 :
Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">;

// DPP support.
// NOTE(review): the assembler predicate requires FeatureGCN3Encoding, but
// FeatureGFX10 lists FeatureDPP without FeatureGCN3Encoding, so this
// predicate does not match GFX10 in the assembler - confirm that is intended.
def HasDPP : Predicate<"Subtarget->hasDPP()">,
AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">;

Expand All @@ -778,9 +959,18 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
// Availability of the mad-mix instructions (FeatureMadMixInsts). Note that
// FeatureGFX10 lists FeatureFmaMixInsts instead of this feature.
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
AssemblerPredicate<"FeatureMadMixInsts">;

// Availability of scalar store instructions (FeatureScalarStores).
def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">,
AssemblerPredicate<"FeatureScalarStores">;

// Availability of scalar atomic instructions (FeatureScalarAtomics).
def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
AssemblerPredicate<"FeatureScalarAtomics">;

// True when FeatureNoSdstCMPX is set ("V_CMPX does not write VCC/SGPR in
// addition to EXEC"). Exact complement of HasSdstCMPX.
def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">,
AssemblerPredicate<"FeatureNoSdstCMPX">;

// True when FeatureNoSdstCMPX is NOT set. Exact complement of HasNoSdstCMPX;
// both the codegen and the assembler conditions are negated.
def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">,
AssemblerPredicate<"!FeatureNoSdstCMPX">;

// LDS bank count checks. These are codegen-only predicates: no
// AssemblerPredicate is attached.
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
Expand Down
36 changes: 35 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,

HasApertureRegs(false),
EnableXNACK(false),
EnableCuMode(false),
TrapHandler(false),

EnableHugePrivateBuffer(false),
Expand All @@ -196,6 +197,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
CIInsts(false),
GFX8Insts(false),
GFX9Insts(false),
GFX10Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
Expand All @@ -212,20 +214,37 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAOutModsVOPC(false),
HasDPP(false),
HasR128A16(false),
HasNSAEncoding(false),
HasDLInsts(false),
HasDot1Insts(false),
HasDot2Insts(false),
EnableSRAMECC(false),
DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
ScalarFlatScratchInsts(false),
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),

ScalarizeGlobal(false),

HasVcmpxPermlaneHazard(false),
HasVMEMtoScalarWriteHazard(false),
HasSMEMtoVectorWriteHazard(false),
HasInstFwdPrefetchBug(false),
HasVcmpxExecWARHazard(false),
HasLdsBranchVmemWARHazard(false),
HasNSAtoVMEMBug(false),
HasFlatSegmentOffsetBug(false),

FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
Expand All @@ -243,6 +262,8 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize();
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
if (!WorkGroupsPerCu)
return 0;
unsigned MaxWaves = getMaxWavesPerEU();
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
Expand All @@ -251,6 +272,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
if (!WorkGroupsPerCu)
return 0;
unsigned MaxWaves = getMaxWavesPerEU();
unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
Expand All @@ -271,7 +294,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
return std::make_pair(getWavefrontSize() * 2,
std::max(getWavefrontSize() * 4, 256u));
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
Expand Down Expand Up @@ -496,7 +520,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}

/// Whether the V_MAD_F16 pseudo instruction maps to an MC opcode on this
/// subtarget. pseudoToMCOpcode() yielding -1 is treated as "not available".
bool GCNSubtarget::hasMadF16() const {
  const auto MCOpcode = InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16);
  return MCOpcode != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 10;

if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
return 10;
Expand Down Expand Up @@ -543,6 +574,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

if (MFI.hasFlatScratchInit()) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
Expand Down
100 changes: 95 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ class AMDGPUSubtarget {
SOUTHERN_ISLANDS = 4,
SEA_ISLANDS = 5,
VOLCANIC_ISLANDS = 6,
GFX9 = 7
GFX9 = 7,
GFX10 = 8
};

private:
Expand Down Expand Up @@ -293,6 +294,7 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
bool UnalignedBufferAccess;
bool HasApertureRegs;
bool EnableXNACK;
bool EnableCuMode;
bool TrapHandler;

// Used as options.
Expand All @@ -313,6 +315,7 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
bool CIInsts;
bool GFX8Insts;
bool GFX9Insts;
bool GFX10Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
Expand All @@ -329,24 +332,41 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
bool HasSDWAOutModsVOPC;
bool HasDPP;
bool HasR128A16;
bool HasNSAEncoding;
bool HasDLInsts;
bool HasDot1Insts;
bool HasDot2Insts;
bool EnableSRAMECC;
bool DoesNotSupportSRAMECC;
bool HasNoSdstCMPX;
bool HasVscnt;
bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool ScalarFlatScratchInsts;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
bool LDSMisalignedBug;
bool HasVertexCache;
short TexVTXClauseSize;
bool ScalarizeGlobal;

bool HasVcmpxPermlaneHazard;
bool HasVMEMtoScalarWriteHazard;
bool HasSMEMtoVectorWriteHazard;
bool HasInstFwdPrefetchBug;
bool HasVcmpxExecWARHazard;
bool HasLdsBranchVmemWARHazard;
bool HasNSAtoVMEMBug;
bool HasFlatSegmentOffsetBug;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;

Expand Down Expand Up @@ -583,6 +603,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return EnableXNACK;
}

/// \returns true if the "cumode" subtarget feature (CU wavefront execution
/// mode) is enabled.
bool isCuModeEnabled() const {
return EnableCuMode;
}

/// \returns the FlatAddressSpace feature flag.
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}
Expand All @@ -599,6 +623,14 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return FlatScratchInsts;
}

/// \returns true if the subtarget has the s_scratch_* flat memory
/// instructions ("scalar-flat-scratch-insts" feature).
bool hasScalarFlatScratchInsts() const {
return ScalarFlatScratchInsts;
}

/// \returns true if the "flat-segment-offset-bug" feature is set
/// (inst_offset ignored in flat segment).
bool hasFlatSegmentOffsetBug() const {
return HasFlatSegmentOffsetBug;
}

/// \returns true for generations strictly newer than GFX9 (i.e. GFX10 among
/// the generations currently enumerated).
bool hasFlatLgkmVMemCountInOrder() const {
return getGeneration() > GFX9;
}
Expand Down Expand Up @@ -654,10 +686,6 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return HasSDWAOutModsVOPC;
}

bool vmemWriteNeedsExpWaitcnt() const {
return getGeneration() < SEA_ISLANDS;
}

/// \returns the HasDLInsts feature flag.
bool hasDLInsts() const {
return HasDLInsts;
}
Expand All @@ -674,6 +702,30 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return EnableSRAMECC;
}

/// \returns true if the "no-sdst-cmpx" feature is set (V_CMPX does not write
/// VCC/SGPR in addition to EXEC).
bool hasNoSdstCMPX() const {
return HasNoSdstCMPX;
}

/// \returns true if the "vscnt" feature is set (separate store vscnt
/// counter).
bool hasVscnt() const {
return HasVscnt;
}

/// \returns true if the "register-banking" feature is set.
bool hasRegisterBanking() const {
return HasRegisterBanking;
}

/// \returns true if the "vop3-literal" feature is set (one literal may be
/// used in VOP3).
bool hasVOP3Literal() const {
return HasVOP3Literal;
}

/// \returns true if the "no-data-dep-hazard" feature is set (software
/// waitstates are not needed).
bool hasNoDataDepHazard() const {
return HasNoDataDepHazard;
}

/// \returns true only for generations older than SEA_ISLANDS.
bool vmemWriteNeedsExpWaitcnt() const {
return getGeneration() < SEA_ISLANDS;
}

// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
Expand Down Expand Up @@ -782,6 +834,12 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return HasR128A16;
}

/// \returns true if the "nsa-encoding" feature is set (NSA encoding for
/// image instructions).
bool hasNSAEncoding() const {
return HasNSAEncoding;
}

/// \returns true if V_MAD_F16 has an MC opcode on this subtarget. Defined
/// out of line because it queries InstrInfo.
bool hasMadF16() const;

/// \returns the EnableSIScheduler option flag.
bool enableSIScheduler() const {
return EnableSIScheduler;
}
Expand Down Expand Up @@ -816,6 +874,38 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
getGeneration() <= AMDGPUSubtarget::GFX9;
}

/// \returns true if the "vcmpx-permlane-hazard" feature is set.
bool hasVcmpxPermlaneHazard() const {
return HasVcmpxPermlaneHazard;
}

/// \returns true if the "vmem-to-scalar-write-hazard" feature is set (VMEM
/// followed by a scalar write to EXEC, M0 or an SGPR).
bool hasVMEMtoScalarWriteHazard() const {
return HasVMEMtoScalarWriteHazard;
}

/// \returns true if the "smem-to-vector-write-hazard" feature is set.
bool hasSMEMtoVectorWriteHazard() const {
return HasSMEMtoVectorWriteHazard;
}

/// \returns true if the "lds-misaligned-bug" feature is set and CU mode is
/// not enabled; enabling CU mode always yields false.
bool hasLDSMisalignedBug() const {
  if (EnableCuMode)
    return false;
  return LDSMisalignedBug;
}

/// \returns true if the "inst-fwd-prefetch-bug" feature is set
/// (S_INST_PREFETCH causes the shader to hang).
bool hasInstFwdPrefetchBug() const {
return HasInstFwdPrefetchBug;
}

/// \returns true if the "vcmpx-exec-war-hazard" feature is set.
bool hasVcmpxExecWARHazard() const {
return HasVcmpxExecWARHazard;
}

/// \returns true if the "lds-branch-vmem-war-hazard" feature is set.
bool hasLdsBranchVmemWARHazard() const {
return HasLdsBranchVmemWARHazard;
}

/// \returns true if the "nsa-to-vmem-bug" feature is set (MIMG-NSA followed
/// by VMEM fails if EXEC_LO or EXEC_HI equals zero).
bool hasNSAtoVMEMBug() const {
return HasNSAtoVMEMBug;
}

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
return AMDGPU::isGFX9(getSTI());
}

/// Convenience wrapper: forwards to AMDGPU::isGFX10() with this parser's
/// subtarget info.
bool isGFX10() const {
return AMDGPU::isGFX10(getSTI());
}

/// \returns true if FeatureInv2PiInlineImm is set in the parser's feature
/// bits.
bool hasInv2PiInlineImm() const {
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
Expand Down Expand Up @@ -1407,7 +1411,7 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg32();
else if (AsmParser->isGFX9())
else if (AsmParser->isGFX9() || AsmParser->isGFX10())
return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type);
else
return false;
Expand Down Expand Up @@ -2953,7 +2957,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (getParser().parseIdentifier(KernelName))
return true;

kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor();
kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI());

StringSet<> Seen;

Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
FeatureISAVersion9_0_9.Features
>;

//===----------------------------------------------------------------------===//
// GCN GFX10.
//===----------------------------------------------------------------------===//

// gfx1010: scheduled with GFX10SpeedModel and configured with the
// ISA version 10.1.0 feature set.
def : ProcessorModel<"gfx1010", GFX10SpeedModel,
FeatureISAVersion10_1_0.Features
>;
79 changes: 46 additions & 33 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,39 +60,40 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
AMDGPU::GPUKind AK;

switch (ElfMach) {
case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}

StringRef GPUName = getArchNameAMDGCN(AK);
Expand Down Expand Up @@ -139,6 +140,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}

Expand Down Expand Up @@ -324,6 +326,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
if (IVersion.Major >= 10) {
PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE);
PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED);
PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS);
}
PRINT_FIELD(
OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
compute_pgm_rsrc2,
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/SIDefines.h
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,15 @@ enum DppCtrl : unsigned {
#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
#define C_00B848_IEEE_MODE 0xFF7FFFFF
// Register 00B848 fields added for GFX10: WGP_MODE (bit 29), MEM_ORDERED
// (bit 30) and FWD_PROGRESS (bit 31). For each field:
//   S_* places a 1-bit value at the field position,
//   G_* extracts the field from a register value,
//   C_* is the AND-mask that clears the field.
#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29)
#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1)
#define C_00B848_WGP_MODE 0xDFFFFFFF
#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30)
#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1)
#define C_00B848_MEM_ORDERED 0xBFFFFFFF
// FWD_PROGRESS occupies bit 31, so the shift is performed on an unsigned
// operand. Shifting a signed int 1 into the sign bit produces a negative
// value, which sign-extends and sets all upper bits when OR'ed into a wider
// (e.g. 64-bit) register image.
#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1u) << 31)
#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1)
#define C_00B848_FWD_PROGRESS 0x7FFFFFFF


// Helpers for setting FLOAT_MODE
Expand Down Expand Up @@ -553,6 +562,15 @@ enum DppCtrl : unsigned {
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)

// Register offsets (R_*) and single-bit field setters (S_*) for the
// per-stage W32_EN bits: HS/GS/VS in VGT_SHADER_STAGES_EN (bits 21-23),
// PS in SPI_PS_IN_CONTROL (bit 15), CS in COMPUTE_DISPATCH_INITIATOR (bit 15).
// NOTE(review): W32_EN presumably selects wave32 execution for the stage -
// confirm against the register specification.
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22)
#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23)
#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8
#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15)
#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800
#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15)

#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
} // End namespace llvm
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5591,7 +5591,9 @@ enum SIEncodingFamily {
SDWA = 2,
SDWA9 = 3,
GFX80 = 4,
GFX9 = 5
GFX9 = 5,
GFX10 = 6,
SDWA10 = 7
};

static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
Expand All @@ -5604,6 +5606,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
case AMDGPUSubtarget::GFX9:
return SIEncodingFamily::VI;
case AMDGPUSubtarget::GFX10:
return SIEncodingFamily::GFX10;
}
llvm_unreachable("Unknown subtarget generation!");
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def SIEncodingFamily {
int SDWA9 = 3;
int GFX80 = 4;
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
}

//===----------------------------------------------------------------------===//
Expand Down
24 changes: 12 additions & 12 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
}

foreach Index = 0-15 in {
def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>;
def TTMP#Index : SIReg<"", 0>;
def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>;
def TTMP#Index : SIReg<"", 0>;
}

multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
Expand Down Expand Up @@ -311,8 +311,8 @@ class TmpRegTuples<string tgt,
getSubRegs<size>.ret>;

foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>;
}

foreach Index = {0, 4, 8, 12} in {
Expand All @@ -321,7 +321,7 @@ foreach Index = {0, 4, 8, 12} in {
_TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#
_TTMP#!add(Index,2)#
_TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>;
_TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>;
}

foreach Index = {0, 4, 8} in {
Expand All @@ -338,7 +338,7 @@ foreach Index = {0, 4, 8} in {
_TTMP#!add(Index,4)#
_TTMP#!add(Index,5)#
_TTMP#!add(Index,6)#
_TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>;
_TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>;
}

def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi :
Expand All @@ -348,12 +348,12 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,
TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>;

def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 :
def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 :
TmpRegTuplesBase<0, 16,
[TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9,
TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9,
TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9,
TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>;
[TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10,
TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10,
TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;


// VGPR 32-bit registers
Expand Down
33 changes: 33 additions & 0 deletions llvm/lib/Target/AMDGPU/SISchedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def WriteDouble : SchedWrite;
// half rate f64 instruction (same as v_add_f64)
def WriteDoubleAdd : SchedWrite;

// Conversion to or from f64 instruction
def WriteDoubleCvt : SchedWrite;

// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;

Expand All @@ -61,6 +64,7 @@ class SISchedMachineModel : SchedMachineModel {

// Scheduling machine models. Each one is populated by a matching
// "let SchedModel = ... in { ... }" block further down in this file.
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;

// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
Expand All @@ -81,6 +85,9 @@ def HWVMEM : ProcResource<1> {
def HWVALU : ProcResource<1> {
let BufferSize = 1;
}
// Used only by GFX10SpeedModel, where most write resources reserve it in
// addition to their execution unit.
def HWRC : ProcResource<1> { // Register destination cache
let BufferSize = 1;
}

class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
int latency> : WriteRes<write, resources> {
Expand Down Expand Up @@ -124,6 +131,7 @@ defm : SICommonWriteRes;
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;

def : InstRW<[WriteCopy], (instrs COPY)>;

Expand All @@ -136,7 +144,32 @@ defm : SICommonWriteRes;
def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = SIQuarterSpeedModel

// Scheduling entries for GFX10SpeedModel. Every anonymous HWWriteRes record
// below ties one SchedWrite to the processor resources it uses and a latency.
let SchedModel = GFX10SpeedModel in {

// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
// The seven writes in this group all use HWVALU together with HWRC.
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>;

// Remaining write classes. WriteBranch and WriteBarrier list HWBranch alone;
// every other entry pairs its unit with HWRC.
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;

// COPY is scheduled as WriteCopy, the same mapping the SIQuarterSpeedModel
// block uses.
def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX10SpeedModel
98 changes: 60 additions & 38 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,11 +435,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.kernarg_segment_alignment = 4;
Header.group_segment_alignment = 4;
Header.private_segment_alignment = 4;

if (Version.Major >= 10) {
Header.compute_pgm_resource_registers |=
S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
S_00B848_MEM_ORDERED(1);
}
}

amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
const MCSubtargetInfo *STI) {
IsaVersion Version = getIsaVersion(STI->getCPU());

amdhsa::kernel_descriptor_t KD;
memset(&KD, 0, sizeof(KD));

AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
Expand All @@ -449,6 +459,13 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
if (Version.Major >= 10) {
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE,
STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1);
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
}
return KD;
}

Expand Down Expand Up @@ -679,6 +696,10 @@ bool isGFX9(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}

/// Returns whether the FeatureGFX10 bit is set in \p STI's feature bits.
bool isGFX10(const MCSubtargetInfo &STI) {
  const auto &Features = STI.getFeatureBits();
  return Features[AMDGPU::FeatureGFX10];
}

/// Returns whether the FeatureGCN3Encoding bit is set in \p STI's feature
/// bits.
bool isGCN3Encoding(const MCSubtargetInfo &STI) {
  const auto &Features = STI.getFeatureBits();
  return Features[AMDGPU::FeatureGCN3Encoding];
}
Expand All @@ -704,46 +725,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
CASE_CI_VI(FLAT_SCR) \
CASE_CI_VI(FLAT_SCR_LO) \
CASE_CI_VI(FLAT_SCR_HI) \
CASE_VI_GFX9(TTMP0) \
CASE_VI_GFX9(TTMP1) \
CASE_VI_GFX9(TTMP2) \
CASE_VI_GFX9(TTMP3) \
CASE_VI_GFX9(TTMP4) \
CASE_VI_GFX9(TTMP5) \
CASE_VI_GFX9(TTMP6) \
CASE_VI_GFX9(TTMP7) \
CASE_VI_GFX9(TTMP8) \
CASE_VI_GFX9(TTMP9) \
CASE_VI_GFX9(TTMP10) \
CASE_VI_GFX9(TTMP11) \
CASE_VI_GFX9(TTMP12) \
CASE_VI_GFX9(TTMP13) \
CASE_VI_GFX9(TTMP14) \
CASE_VI_GFX9(TTMP15) \
CASE_VI_GFX9(TTMP0_TTMP1) \
CASE_VI_GFX9(TTMP2_TTMP3) \
CASE_VI_GFX9(TTMP4_TTMP5) \
CASE_VI_GFX9(TTMP6_TTMP7) \
CASE_VI_GFX9(TTMP8_TTMP9) \
CASE_VI_GFX9(TTMP10_TTMP11) \
CASE_VI_GFX9(TTMP12_TTMP13) \
CASE_VI_GFX9(TTMP14_TTMP15) \
CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \
CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
CASE_VI_GFX9_GFX10(TTMP0) \
CASE_VI_GFX9_GFX10(TTMP1) \
CASE_VI_GFX9_GFX10(TTMP2) \
CASE_VI_GFX9_GFX10(TTMP3) \
CASE_VI_GFX9_GFX10(TTMP4) \
CASE_VI_GFX9_GFX10(TTMP5) \
CASE_VI_GFX9_GFX10(TTMP6) \
CASE_VI_GFX9_GFX10(TTMP7) \
CASE_VI_GFX9_GFX10(TTMP8) \
CASE_VI_GFX9_GFX10(TTMP9) \
CASE_VI_GFX9_GFX10(TTMP10) \
CASE_VI_GFX9_GFX10(TTMP11) \
CASE_VI_GFX9_GFX10(TTMP12) \
CASE_VI_GFX9_GFX10(TTMP13) \
CASE_VI_GFX9_GFX10(TTMP14) \
CASE_VI_GFX9_GFX10(TTMP15) \
CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \
CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \
CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \
CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \
CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \
CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \
CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \
CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \
CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \
CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \
CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \
CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \
CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
}

// Forward-mapping case for registers with separate CI and VI variants:
// yields node##_ci when STI is CI and node##_vi otherwise.
// NOTE(review): the assert is emitted *before* the `case` label. When these
// expansions are chained inside a switch, each assert follows the previous
// expansion's `return`, so it looks unreachable -- confirm whether it was
// meant to sit after the label.
#define CASE_CI_VI(node) \
assert(!isSI(STI)); \
case node: return isCI(STI) ? node##_ci : node##_vi;

#define CASE_VI_GFX9(node) \
case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
// Forward-mapping case for registers with a VI variant and a shared
// GFX9/GFX10 variant: yields node##_gfx9_gfx10 when STI is GFX9 or GFX10,
// and node##_vi otherwise.
#define CASE_VI_GFX9_GFX10(node) \
case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi;

unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
if (STI.getTargetTriple().getArch() == Triple::r600)
Expand All @@ -752,17 +773,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
}

#undef CASE_CI_VI
#undef CASE_VI_GFX9
#undef CASE_VI_GFX9_GFX10

#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node;
#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node;

/// Maps a subtarget-specific register variant back to its generic register.
///
/// The body is the MAP_REG2REG expansion evaluated with the reverse CASE_*
/// definitions directly above, under which each listed variant of a register
/// (for example node##_ci or node##_vi) returns the plain `node`.
unsigned mc2PseudoReg(unsigned Reg) {
MAP_REG2REG
}

#undef CASE_CI_VI
#undef CASE_VI_GFX9
#undef CASE_VI_GFX9_GFX10
#undef MAP_REG2REG

bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
Expand Down Expand Up @@ -1030,5 +1051,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
/// Reports whether lookupSourceOfDivergence() returns a non-null entry for
/// \p IntrID.
bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
  const bool Found = lookupSourceOfDivergence(IntrID) != nullptr;
  return Found;
}

} // namespace AMDGPU
} // namespace llvm
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const MCSubtargetInfo *STI);

amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
const MCSubtargetInfo *STI);

bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
Expand Down Expand Up @@ -398,6 +399,7 @@ bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);

/// Returns true if \p Reg is a scalar register.
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV
COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP),
COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE),
COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE),
COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE),
COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED),
COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS),
// TODO: bulky
// TODO: cdbg_user
COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx904 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX904 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx909 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1010 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s

; ARCH-R600: Arch: r600
; ARCH-GCN: Arch: amdgcn
Expand Down Expand Up @@ -87,6 +88,7 @@
; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
; ALL: ]

define amdgpu_kernel void @elf_header() {
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1010 %s

; HSA: .hsa_code_object_version 2,1
; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
Expand All @@ -42,3 +43,4 @@
; HSA-GFX904: .hsa_code_object_isa 9,0,4,"AMD","AMDGPU"
; HSA-GFX906: .hsa_code_object_isa 9,0,6,"AMD","AMDGPU"
; HSA-GFX909: .hsa_code_object_isa 9,0,9,"AMD","AMDGPU"
; HSA-GFX1010: .hsa_code_object_isa 10,1,0,"AMD","AMDGPU"
1 change: 1 addition & 0 deletions llvm/tools/llvm-readobj/ELFDumper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1275,6 +1275,7 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_SRAM_ECC)
};
Expand Down