Skip to content
This repository has been archived by the owner on Apr 23, 2020. It is now read-only.

Commit

Permalink
AMDGPU: Add Vega12 and Vega20
Browse files Browse the repository at this point in the history
Changes by
  Matt Arsenault
  Konstantin Zhuravlyov

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@331215 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
arsenm committed Apr 30, 2018
1 parent be8f7c9 commit ac9b3ef
Show file tree
Hide file tree
Showing 43 changed files with 2,133 additions and 322 deletions.
18 changes: 14 additions & 4 deletions docs/AMDGPUUsage.rst
Expand Up @@ -200,6 +200,16 @@ names from both the *Processor* and *Alternative Processor* can be used.
- Radeon Instinct MI25
``gfx902`` ``amdgcn`` APU - xnack - Ryzen 3 2200G
[on] - Ryzen 5 2400G
``gfx904`` ``amdgcn`` dGPU - xnack *TBA*
[off]
.. TODO
Add product
names.
``gfx906`` ``amdgcn`` dGPU - xnack *TBA*
[off]
.. TODO
Add product
names.
=========== =============== ============ ===== ========= ======= ==================

.. _amdgpu-target-features:
Expand Down Expand Up @@ -547,8 +557,8 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_MACH_AMDGCN_GFX810`` 0x02b ``gfx810``
``EF_AMDGPU_MACH_AMDGCN_GFX900`` 0x02c ``gfx900``
``EF_AMDGPU_MACH_AMDGCN_GFX902`` 0x02d ``gfx902``
*reserved* 0x02e Reserved.
*reserved* 0x02f Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX904`` 0x02e ``gfx904``
``EF_AMDGPU_MACH_AMDGCN_GFX906`` 0x02f ``gfx906``
*reserved* 0x030 Reserved.
================================= ========== =============================

Expand Down Expand Up @@ -765,7 +775,7 @@ The following relocation types are supported:
``R_AMDGPU_ABS32_HI`` Static, 2 ``word32`` (S + A) >> 32
Dynamic
``R_AMDGPU_ABS64`` Static, 3 ``word64`` S + A
Dynamic
Dynamic
``R_AMDGPU_REL32`` Static 4 ``word32`` S + A - P
``R_AMDGPU_REL64`` Static 5 ``word64`` S + A - P
``R_AMDGPU_ABS32`` Static, 6 ``word32`` S + A
Expand All @@ -784,7 +794,7 @@ the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``.

There is no current OS loader support for 32 bit programs and so
``R_AMDGPU_ABS32`` is not used.

.. _amdgpu-dwarf:

DWARF
Expand Down
8 changes: 4 additions & 4 deletions include/llvm/BinaryFormat/ELF.h
Expand Up @@ -687,7 +687,7 @@ enum : unsigned {

// AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_FIRST = 0x020,
EF_AMDGPU_MACH_AMDGCN_LAST = 0x02d,
EF_AMDGPU_MACH_AMDGCN_LAST = 0x02f,
// AMDGCN GFX6.
EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
Expand All @@ -705,12 +705,12 @@ enum : unsigned {
// AMDGCN GFX9.
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,

// Reserved for AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x02e,
EF_AMDGPU_MACH_AMDGCN_RESERVED2 = 0x02f,
EF_AMDGPU_MACH_AMDGCN_RESERVED3 = 0x030,
EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030,

// Indicates if the xnack target feature is enabled for all code contained in
// the object.
Expand Down
103 changes: 103 additions & 0 deletions include/llvm/IR/IntrinsicsAMDGPU.td
Expand Up @@ -1287,6 +1287,109 @@ def int_amdgcn_ds_bpermute :
GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// Two-element half-precision dot product accumulated into an f32.
//
// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
//
// Declared IntrNoMem and IntrSpeculatable, and bound to the GCC-style builtin
// __builtin_amdgcn_fdot2.
def int_amdgcn_fdot2 :
GCCBuiltin<"__builtin_amdgcn_fdot2">,
Intrinsic<
[llvm_float_ty], // %r: f32 result
[
llvm_v2f16_ty, // %a: first v2f16 multiplicand
llvm_v2f16_ty, // %b: second v2f16 multiplicand
llvm_float_ty // %c: f32 addend
],
[IntrNoMem, IntrSpeculatable]
>;

// Two-element 16-bit integer dot product accumulated into a 32-bit integer
// (the "s" variant; the signature below is written with i16/i32 types).
//
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
//
// Declared IntrNoMem and IntrSpeculatable, and bound to the GCC-style builtin
// __builtin_amdgcn_sdot2.
def int_amdgcn_sdot2 :
GCCBuiltin<"__builtin_amdgcn_sdot2">,
Intrinsic<
[llvm_i32_ty], // %r: i32 result
[
llvm_v2i16_ty, // %a: first v2i16 multiplicand
llvm_v2i16_ty, // %b: second v2i16 multiplicand
llvm_i32_ty // %c: i32 addend
],
[IntrNoMem, IntrSpeculatable]
>;

// Two-element 16-bit integer dot product accumulated into a 32-bit integer
// (the "u" variant; the signature below is written with u16/u32 pseudo-types).
//
// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
//
// The declared operand and result types are the same llvm_v2i16_ty/llvm_i32_ty
// used by int_amdgcn_sdot2; only the intrinsic name distinguishes the two.
def int_amdgcn_udot2 :
GCCBuiltin<"__builtin_amdgcn_udot2">,
Intrinsic<
[llvm_i32_ty], // %r: 32-bit result
[
llvm_v2i16_ty, // %a: first 2 x 16-bit multiplicand
llvm_v2i16_ty, // %b: second 2 x 16-bit multiplicand
llvm_i32_ty // %c: 32-bit addend
],
[IntrNoMem, IntrSpeculatable]
>;

// Four-element 8-bit integer dot product accumulated into a 32-bit integer
// (the "s" variant). Each vector operand is passed as a single i32 that holds
// the four 8-bit elements.
//
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
GCCBuiltin<"__builtin_amdgcn_sdot4">,
Intrinsic<
[llvm_i32_ty], // %r: i32 result
[
llvm_i32_ty, // %a: v4i8 carried in an i32
llvm_i32_ty, // %b: v4i8 carried in an i32
llvm_i32_ty // %c: i32 addend
],
[IntrNoMem, IntrSpeculatable]
>;

// Four-element 8-bit integer dot product accumulated into a 32-bit integer
// (the "u" variant). Each vector operand is passed as a single i32 that holds
// the four 8-bit elements.
//
// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
GCCBuiltin<"__builtin_amdgcn_udot4">,
Intrinsic<
[llvm_i32_ty], // %r: 32-bit result
[
llvm_i32_ty, // %a: v4u8 carried in an i32
llvm_i32_ty, // %b: v4u8 carried in an i32
llvm_i32_ty // %c: 32-bit addend
],
[IntrNoMem, IntrSpeculatable]
>;

// Eight-element 4-bit integer dot product accumulated into a 32-bit integer
// (the "s" variant). Each vector operand is passed as a single i32 that holds
// the eight 4-bit elements.
//
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
GCCBuiltin<"__builtin_amdgcn_sdot8">,
Intrinsic<
[llvm_i32_ty], // %r: i32 result
[
llvm_i32_ty, // %a: v8i4 carried in an i32
llvm_i32_ty, // %b: v8i4 carried in an i32
llvm_i32_ty // %c: i32 addend
],
[IntrNoMem, IntrSpeculatable]
>;

// Eight-element 4-bit integer dot product accumulated into a 32-bit integer
// (the "u" variant). Each vector operand is passed as a single i32 that holds
// the eight 4-bit elements.
//
// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
GCCBuiltin<"__builtin_amdgcn_udot8">,
Intrinsic<
[llvm_i32_ty], // %r: 32-bit result
[
llvm_i32_ty, // %a: v8u4 carried in an i32
llvm_i32_ty, // %b: v8u4 carried in an i32
llvm_i32_ty // %c: 32-bit addend
],
[IntrNoMem, IntrSpeculatable]
>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
Expand Down
2 changes: 2 additions & 0 deletions lib/ObjectYAML/ELFYAML.cpp
Expand Up @@ -400,6 +400,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
BCase(EF_AMDGPU_XNACK);
break;
case ELF::EM_X86_64:
Expand Down
31 changes: 31 additions & 0 deletions lib/Target/AMDGPU/AMDGPU.td
Expand Up @@ -127,6 +127,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
>;

// Subtarget feature "fma-mix-insts": when enabled, the subtarget's
// HasFmaMixInsts attribute is set to "true". This is the FMA counterpart of
// FeatureMadMixInsts (v_fma_mix_* instead of v_mad_mix_*).
def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"HasFmaMixInsts",
"true",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;

// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
Expand Down Expand Up @@ -310,6 +316,12 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
"Has unpacked d16 vmem instructions"
>;

// Subtarget feature "dl-insts": when enabled, the subtarget's HasDLInsts
// attribute is set to "true", marking the deep learning instructions as
// available.
def FeatureDLInsts : SubtargetFeature<"dl-insts",
"HasDLInsts",
"true",
"Has deep learning instructions"
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
Expand Down Expand Up @@ -606,6 +618,18 @@ def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
FeatureXNACK
]>;

// ISA version 9.0.4. Implies FeatureGFX9, FeatureLDSBankCount32 and
// FeatureFmaMixInsts; it lists FeatureFmaMixInsts rather than
// FeatureMadMixInsts.
def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
[FeatureGFX9,
FeatureLDSBankCount32,
FeatureFmaMixInsts]>;

// ISA version 9.0.6. Implies the same features as ISA version 9.0.4
// (FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts) plus HalfRate64Ops
// and FeatureDLInsts.
def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
[FeatureGFX9,
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
FeatureDLInsts]>;

//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -788,6 +812,13 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
AssemblerPredicate<"FeatureMovrel">;

// Predicate for the v_fma_mix_* instructions: checks
// Subtarget->hasFmaMixInsts() during selection and FeatureFmaMixInsts in the
// assembler.
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
AssemblerPredicate<"FeatureFmaMixInsts">;

// Predicate for the deep learning instructions: checks
// Subtarget->hasDLInsts() during selection and FeatureDLInsts in the
// assembler.
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<"FeatureDLInsts">;


def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;

Expand Down
23 changes: 15 additions & 8 deletions lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Expand Up @@ -215,7 +215,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
void SelectFMAD(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);

protected:
Expand Down Expand Up @@ -621,7 +621,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBRCOND(N);
return;
case ISD::FMAD:
SelectFMAD(N);
case ISD::FMA:
SelectFMAD_FMA(N);
return;
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
Expand Down Expand Up @@ -1728,9 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
MVT VT = N->getSimpleValueType(0);
if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
bool IsFMA = N->getOpcode() == ISD::FMA;
if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
!Subtarget->hasFmaMixInsts()) ||
((IsFMA && Subtarget->hasMadMixInsts()) ||
(!IsFMA && Subtarget->hasFmaMixInsts()))) {
SelectCode(N);
return;
}
Expand All @@ -1740,13 +1745,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
SDValue Src2 = N->getOperand(2);
unsigned Src0Mods, Src1Mods, Src2Mods;

// Avoid using v_mad_mix_f32 unless there is actually an operand using the
// conversion from f16.
// Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
// using the conversion from f16.
bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

assert(!Subtarget->hasFP32Denormals() &&
assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
Expand All @@ -1762,7 +1767,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
Zero, Zero
};

CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
CurDAG->SelectNodeTo(N,
IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
MVT::f32, Ops);
} else {
SelectCode(N);
}
Expand Down
3 changes: 2 additions & 1 deletion lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -939,7 +939,8 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// where this is OK to use.
bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
SrcVT.getScalarType() == MVT::f16;
}
Expand Down
2 changes: 2 additions & 0 deletions lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -148,6 +148,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasIntClamp(false),
HasVOP3PInsts(false),
HasMadMixInsts(false),
HasFmaMixInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
Expand All @@ -160,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
HasDLInsts(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
Expand Down
15 changes: 14 additions & 1 deletion lib/Target/AMDGPU/AMDGPUSubtarget.h
Expand Up @@ -72,7 +72,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
ISAVersion8_0_3,
ISAVersion8_1_0,
ISAVersion9_0_0,
ISAVersion9_0_2
ISAVersion9_0_1,
ISAVersion9_0_2,
ISAVersion9_0_4,
ISAVersion9_0_6
};

enum TrapHandlerAbi {
Expand Down Expand Up @@ -150,6 +153,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
bool HasIntClamp;
bool HasVOP3PInsts;
bool HasMadMixInsts;
bool HasFmaMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
Expand All @@ -162,6 +166,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
bool HasDLInsts;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
Expand Down Expand Up @@ -329,6 +334,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
return HasMadMixInsts;
}

/// \returns the HasFmaMixInsts flag, i.e. whether the "fma-mix-insts"
/// subtarget feature (v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16) is
/// enabled.
bool hasFmaMixInsts() const {
return HasFmaMixInsts;
}

bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
Expand Down Expand Up @@ -534,6 +543,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
return getGeneration() < SEA_ISLANDS;
}

/// \returns the HasDLInsts flag, i.e. whether the "dl-insts" subtarget
/// feature (deep learning instructions) is enabled.
bool hasDLInsts() const {
return HasDLInsts;
}

/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
Expand Down

0 comments on commit ac9b3ef

Please sign in to comment.