Skip to content

Commit

Permalink
[AMDGPU] - Add address space for strided buffers (#74471)
Browse files Browse the repository at this point in the history
This is an experimental address space for strided buffers. These buffers
can have structs as elements and
a stride > 1.
These pointers allow indexed access in units of stride, i.e., they
point at `buffer[index * stride]`.
Thus, we can use the `idxen` modifier for buffer loads.

We assign address space 9 to 192-bit buffer pointers which contain a
128-bit descriptor, a 32-bit offset and a 32-bit index. Essentially,
they are fat buffer pointers with an additional 32-bit index.
  • Loading branch information
OutOfCache committed Dec 15, 2023
1 parent 163aeca commit 32f9983
Show file tree
Hide file tree
Showing 68 changed files with 247 additions and 120 deletions.
5 changes: 3 additions & 2 deletions clang/lib/Basic/Targets/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ static const char *const DataLayoutStringR600 =

static const char *const DataLayoutStringAMDGCN =
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
"32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
"-ni:7:8";
"-ni:7:8:9";

const LangASMap AMDGPUTargetInfo::AMDGPUDefIsGenMap = {
llvm::AMDGPUAS::FLAT_ADDRESS, // Default
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGen/target-data.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,12 @@

// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
// R600SI: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
// R600SI: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"

// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
// R600SIDefault: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
// R600SIDefault: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"

// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s

// CHECK: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
// CHECK: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// Intentionally empty: the expectations in this test only inspect the
// module-level target datalayout string, so no function body is needed.
void foo(void)
{
}
49 changes: 31 additions & 18 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -703,23 +703,24 @@ supported for the ``amdgcn`` target.
.. table:: AMDGPU Address Spaces
:name: amdgpu-address-spaces-table

================================= =============== =========== ================ ======= ============================
.. 64-Bit Process Address Space
--------------------------------- --------------- ----------- ---------------- ------------------------------------
Address Space Name LLVM IR Address HSA Segment Hardware Address NULL Value
Space Number Name Name Size
================================= =============== =========== ================ ======= ============================
Generic 0 flat flat 64 0x0000000000000000
Global 1 global global 64 0x0000000000000000
Region 2 N/A GDS 32 *not implemented for AMDHSA*
Local 3 group LDS 32 0xFFFFFFFF
Constant 4 constant *same as global* 64 0x0000000000000000
Private 5 private scratch 32 0xFFFFFFFF
Constant 32-bit 6 *TODO* 0x00000000
Buffer Fat Pointer (experimental) 7 *TODO*
Buffer Resource (experimental) 8 *TODO*
Streamout Registers 128 N/A GS_REGS
================================= =============== =========== ================ ======= ============================
===================================== =============== =========== ================ ======= ============================
.. 64-Bit Process Address Space
------------------------------------- --------------- ----------- ---------------- ------------------------------------
Address Space Name LLVM IR Address HSA Segment Hardware Address NULL Value
Space Number Name Name Size
===================================== =============== =========== ================ ======= ============================
Generic 0 flat flat 64 0x0000000000000000
Global 1 global global 64 0x0000000000000000
Region 2 N/A GDS 32 *not implemented for AMDHSA*
Local 3 group LDS 32 0xFFFFFFFF
Constant 4 constant *same as global* 64 0x0000000000000000
Private 5 private scratch 32 0xFFFFFFFF
Constant 32-bit 6 *TODO* 0x00000000
Buffer Fat Pointer (experimental) 7 *TODO*
Buffer Resource (experimental) 8 *TODO*
Buffer Strided Pointer (experimental) 9 *TODO*
Streamout Registers 128 N/A GS_REGS
===================================== =============== =========== ================ ======= ============================

**Generic**
The generic address space is supported unless the *Target Properties* column
Expand Down Expand Up @@ -836,7 +837,7 @@ supported for the ``amdgcn`` target.
the backend.

The buffer descriptor used to construct a buffer fat pointer must be *raw*:
the stride must be 0, the "add tid" flag bust be 0, the swizzle enable bits
the stride must be 0, the "add tid" flag must be 0, the swizzle enable bits
must be off, and the extent must be measured in bytes. (On subtargets where
bounds checking may be disabled, buffer fat pointers may choose to enable
it or not).
Expand Down Expand Up @@ -864,6 +865,18 @@ supported for the ``amdgcn`` target.
(bits `127:96`). The specific interpretation of these fields varies by the
target architecture and is detailed in the ISA descriptions.

**Buffer Strided Pointer**
The buffer strided pointer is an experimental address space. It represents
a 128-bit buffer descriptor and a 32-bit offset, like the **Buffer Fat
Pointer**. Additionally, it contains an index into the buffer, which
allows the direct addressing of structured elements. These components appear
in that order, i.e., the descriptor comes first, then the 32-bit offset
followed by the 32-bit index.

The bits in the buffer descriptor must meet the following requirements:
the stride is the size of a structured element, the "add tid" flag must be 0,
and the swizzle enable bits must be off.

**Streamout Registers**
Dedicated registers used by the GS NGG Streamout Instructions. The register
file is modelled as a memory in a distinct address space because it is indexed
Expand Down
5 changes: 4 additions & 1 deletion llvm/include/llvm/Support/AMDGPUAddrSpace.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace llvm {
namespace AMDGPUAS {
enum : unsigned {
// The maximum value for flat, generic, local, private, constant and region.
MAX_AMDGPU_ADDRESS = 8,
MAX_AMDGPU_ADDRESS = 9,

FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
Expand All @@ -42,6 +42,9 @@ enum : unsigned {

BUFFER_RESOURCE = 8, ///< Address space for 128-bit buffer resources.

BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
///< pointers with an additional index.

/// Internal address spaces. Can be freely renumbered.
STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
/// end Internal address spaces.
Expand Down
10 changes: 7 additions & 3 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5207,17 +5207,21 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
// This goes before adding new address spaces to prevent incoherent string
// values.
if (!DL.contains("-ni") && !DL.starts_with("ni"))
Res.append("-ni:7:8");
// Update ni:7 to ni:7:8.
Res.append("-ni:7:8:9");
// Update ni:7 to ni:7:8:9.
if (DL.ends_with("ni:7"))
Res.append(":8");
Res.append(":8:9");
if (DL.ends_with("ni:7:8"))
Res.append(":9");

// Add sizing for address spaces 7, 8 and 9 (fat raw buffers, buffer
// resources and strided buffers). An empty data layout has already been
// upgraded to G1 by now.
if (!DL.contains("-p7") && !DL.starts_with("p7"))
Res.append("-p7:160:256:256:32");
if (!DL.contains("-p8") && !DL.starts_with("p8"))
Res.append("-p8:128:128");
if (!DL.contains("-p9") && !DL.startswith("p9"))
Res.append("-p9:192:256:256:32");

return Res;
}
Expand Down
27 changes: 14 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -410,24 +410,25 @@ inline bool isExtendedGlobalAddrSpace(unsigned AS) {
}

static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 8, "Addr space out of range");
static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 9, "Addr space out of range");

if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
return true;

// This array is indexed by address space value enum elements 0 ... to 8
// This array is indexed by address space value enum elements 0 ... to 9
// clang-format off
static const bool ASAliasRules[9][9] = {
/* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc */
/* Flat */ {true, true, false, true, true, true, true, true, true},
/* Global */ {true, true, false, false, true, false, true, true, true},
/* Region */ {false, false, true, false, false, false, false, false, false},
/* Group */ {true, false, false, true, false, false, false, false, false},
/* Constant */ {true, true, false, false, false, false, true, true, true},
/* Private */ {true, false, false, false, false, true, false, false, false},
/* Constant 32-bit */ {true, true, false, false, true, false, false, true, true},
/* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true},
/* Buffer Resource */ {true, true, false, false, true, false, true, true, true},
static const bool ASAliasRules[10][10] = {
/* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
/* Flat */ {true, true, false, true, true, true, true, true, true, true},
/* Global */ {true, true, false, false, true, false, true, true, true, true},
/* Region */ {false, false, true, false, false, false, false, false, false, false},
/* Group */ {true, false, false, true, false, false, false, false, false, false},
/* Constant */ {true, true, false, false, false, false, true, true, true, true},
/* Private */ {true, false, false, false, false, true, false, false, false, false},
/* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true},
/* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true},
/* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true},
/* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true},
};
// clang-format on

Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
const LLT BufferStridedPtr =
GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);

const LLT CodePtr = FlatPtr;

Expand Down Expand Up @@ -1113,7 +1115,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}

getActionDefinitionsBuilder(G_PTR_ADD)
.unsupportedFor({BufferFatPtr, RsrcPtr})
.unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
.legalIf(all(isPointer(0), sameSize(0, 1)))
.scalarize(0)
.scalarSameSizeAs(1, 0);
Expand Down Expand Up @@ -1403,7 +1405,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// The custom pointers (fat pointers, strided pointers, buffer resources)
// don't work with load and store at this level. Fat pointers should have been
// lowered to intrinsics before the translation to MIR.
Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
Actions.unsupportedIf(
typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

// Address space 8 pointers are handled by a 4xs32 load, bitcast, and
// ptrtoint. This is needed to account for the fact that we can't have i128
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,9 +539,10 @@ static StringRef computeDataLayout(const Triple &TT) {
// space 8) which cannot be non-trivially accessed by LLVM memory operations
// like getelementptr.
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
"-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
"v32:32-v48:64-v96:"
"128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
"G1-ni:7:8";
"G1-ni:7:8:9";
}

LLVM_READNONE
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,8 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
AddrSpace == AMDGPUAS::BUFFER_RESOURCE) {
AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
return 512;
}

Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1046,12 +1046,20 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
  // The buffer pointer address spaces get a dedicated vector-of-i32 type, but
  // only when the data layout reports the expected pointer width for them.
  switch (AS) {
  case AMDGPUAS::BUFFER_FAT_POINTER:
    // 160-bit fat pointer -> five 32-bit lanes.
    if (DL.getPointerSizeInBits(AS) == 160)
      return MVT::v5i32;
    break;
  case AMDGPUAS::BUFFER_STRIDED_POINTER:
    // 192-bit strided pointer -> six 32-bit lanes.
    if (DL.getPointerSizeInBits(AS) == 192)
      return MVT::v6i32;
    break;
  default:
    break;
  }

  // Every other address space (or an unexpected width) uses the generic
  // AMDGPU answer.
  return AMDGPUTargetLowering::getPointerTy(DL, AS);
}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
DL.getPointerSizeInBits(AS) == 160) ||
(AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
DL.getPointerSizeInBits(AS) == 192))
return MVT::v8i32;
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
Expand Down Expand Up @@ -1418,7 +1426,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,

if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) {
AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
Expand Down
70 changes: 70 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,73 @@ define void @test_8_5(ptr %p) {
load i8, ptr addrspace(3) @shm
ret void
}

; Alias queries pairing an addrspace(9) (buffer strided pointer) argument with
; a pointer in each of the address spaces 0 through 9. The expected results
; are "may alias" against spaces 0, 1, 4, 6, 7, 8 and 9, and "no alias"
; against spaces 2, 3 and 5.

; CHECK: MayAlias: i8 addrspace(9)* %p, i8* %p1
define void @test_9_0(ptr addrspace(9) %p, ptr addrspace(0) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(0) %p1
ret void
}

; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(1)* %p1
define void @test_9_1(ptr addrspace(9) %p, ptr addrspace(1) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(1) %p1
ret void
}

; CHECK: NoAlias: i8 addrspace(9)* %p, i8 addrspace(2)* %p1
define void @test_9_2(ptr addrspace(9) %p, ptr addrspace(2) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(2) %p1
ret void
}

; CHECK: NoAlias: i8 addrspace(9)* %p, i8 addrspace(3)* %p1
define void @test_9_3(ptr addrspace(9) %p, ptr addrspace(3) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(3) %p1
ret void
}

; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(4)* %p1
define void @test_9_4(ptr addrspace(9) %p, ptr addrspace(4) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(4) %p1
ret void
}

; CHECK: NoAlias: i8 addrspace(9)* %p, i8 addrspace(5)* %p1
define void @test_9_5(ptr addrspace(9) %p, ptr addrspace(5) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(5) %p1
ret void
}

; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(6)* %p1
define void @test_9_6(ptr addrspace(9) %p, ptr addrspace(6) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(6) %p1
ret void
}

; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(7)* %p1
define void @test_9_7(ptr addrspace(9) %p, ptr addrspace(7) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(7) %p1
ret void
}

; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(8)* %p1
define void @test_9_8(ptr addrspace(9) %p, ptr addrspace(8) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(8) %p1
ret void
}

; CHECK: MayAlias: i8 addrspace(9)* %p, i8 addrspace(9)* %p1
define void @test_9_9(ptr addrspace(9) %p, ptr addrspace(9) %p1) {
load i8, ptr addrspace(9) %p
load i8, ptr addrspace(9) %p1
ret void
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ceil.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"

declare float @_Z4ceilf(float)
declare <2 x float> @_Z4ceilDv2_f(<2 x float>)
Expand Down

0 comments on commit 32f9983

Please sign in to comment.