//===-- AMDGPU.td - AMDGPU dialect definitions *- tablegen -*------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef AMDGPU
#define AMDGPU
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
def AMDGPU_Dialect : Dialect {
let name = "amdgpu";
let cppNamespace = "::mlir::amdgpu";
let description = [{
The `AMDGPU` dialect provides wrappers around AMD-specific functionality
and LLVM intrinsics. These wrappers should be used in conjunction with
more generic dialects, such as `gpu` and `vector`, when generating LLVM IR
that will eventually be executed on AMD hardware.
}];
let emitAccessorPrefix = kEmitAccessorPrefix_Prefixed;
}
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
Op<AMDGPU_Dialect, mnemonic, traits> {}
/// Raw buffer load
def AMDGPU_RawBufferLoadOp :
AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
AttrSizedOperandSegments]>,
Arguments<(ins Arg<AnyMemRef, "buffer to load from", [MemRead]>:$memref,
Variadic<I32>:$indices,
DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
OptionalAttr<I32Attr>:$indexOffset,
Optional<I32>:$sgprOffset)>,
Results<(outs AnyTypeOf<[BF16, F16, F32, I32, I8,
VectorOfLengthAndType<[2, 4], [F32, I32]>,
VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value)> {
let summary = "Raw Buffer load, exposing GCN features";
let description = [{
The `amdgpu.raw_buffer_load` op is a wrapper around the buffer load intrinsics
available on AMD GPUs, including extensions in newer GPUs.
The index into the buffer is computed as for `memref.load` with the addition
of `indexOffset` and `sgprOffset` (which **may or may not** be considered
in bounds checks and includes any offset present on the memref type if it's
non-zero).
All indices and offsets are in units of the memref's data type and are
converted to bytes during lowering.
When a load is out of bounds, the instruction returns zero.
Partially out-of-bounds reads have chipset-dependent behavior: whether reading
2 elements starting at index 7 of a `memref<8xf32>` returns the last element
in the first vector component depends on the architecture.
The memref struct is converted into a buffer resource (a V#) and the arguments
are translated to intrinsic arguments as follows:
- The base address of the buffer is the base address of the memref
- The stride is 0 to enable raw mode
- The number of records is the size of the memref, in bytes
In the case of dynamically-shaped memrefs, this is computed at runtime
as `max_d (size(d) * stride(d)) * sizeof(elementType(memref))`
- The offset enable bit is 1, the index enable bit is 0.
- The thread ID addition bit is off
- If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is set
to 2 to disable bounds checks, otherwise it is 3
- The cache coherency bits are off
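
Example (an illustrative sketch derived from the assembly format below;
the SSA names and memref shapes are placeholders):

```mlir
// Scalar load with a wave-uniform SGPR offset.
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %src[%idx] sgprOffset %offset
  : memref<128xf32>, i32 -> f32
// Vector load from a 2-D buffer.
%1 = amdgpu.raw_buffer_load {boundsCheck = true} %src2[%i, %j]
  : memref<128x64xf32>, i32, i32 -> vector<4xf32>
```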
}];
let assemblyFormat = [{
attr-dict $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`
type($memref) `,` type($indices) `->` type($value)
}];
let hasVerifier = 1;
}
/// Raw buffer store
def AMDGPU_RawBufferStoreOp :
AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
AttrSizedOperandSegments]>,
Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8,
VectorOfLengthAndType<[2, 4], [F32, I32]>,
VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value,
Arg<AnyMemRef, "buffer to store to", [MemWrite]>:$memref,
Variadic<I32>:$indices,
DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
OptionalAttr<I32Attr>:$indexOffset,
Optional<I32>:$sgprOffset)> {
let summary = "Raw Buffer Store, exposing GCN features";
let description = [{
The `amdgpu.raw_buffer_store` op is a wrapper around the buffer store
intrinsics available on AMD GPUs, including extensions in newer GPUs.
The store index is computed as in `memref.store` with the addition of
`indexOffset` (which is included for uniformity with atomics and may be useful
when writing vectorized code) and `sgprOffset` (which is added after bounds
checks and implicitly includes the offset of the memref type if non-zero).
All index components are in terms of the elements of the memref, not bytes,
and are scaled up appropriately.
Out of bounds stores are ignored in hardware.
Whether a vector write that includes some in-bounds and some out-of-bounds
components is partially completed is chipset-dependent.
See `amdgpu.raw_buffer_load` for a description of how the underlying
instruction is constructed.
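
Example (an illustrative sketch; the SSA names and memref shape are
placeholders):

```mlir
// Store one f32 element, with bounds checking enabled.
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %dst[%idx]
  : f32 -> memref<128xf32>, i32
```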
}];
let assemblyFormat = [{
attr-dict $value `->` $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasVerifier = 1;
}
// Raw buffer atomic floating point add
def AMDGPU_RawBufferAtomicFaddOp :
AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
AttrSizedOperandSegments]>,
Arguments<(ins F32:$value,
Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
Variadic<I32>:$indices,
DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
OptionalAttr<I32Attr>:$indexOffset,
Optional<I32>:$sgprOffset)> {
let summary = "Raw Buffer Floating-point Atomic Add (MI-* only)";
let description = [{
The `amdgpu.raw_buffer_atomic_fadd` op is a wrapper around the
buffer-based atomic floating point addition available on the MI-* series
of AMD GPUs.
The index into the buffer is computed as for `memref.store` with the addition
of `indexOffset` (which is used to aid in emitting vectorized code) and,
if present, `sgprOffset` (which is added after bounds checks and includes
any non-zero offset on the memref type).
All indexing components are given in terms of the memref's element size, not
the byte lengths required by the intrinsic.
Out of bounds atomic operations are ignored in hardware.
See `amdgpu.raw_buffer_load` for a description of how the underlying
instruction is constructed.
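
Example (an illustrative sketch; the SSA names and memref shape are
placeholders):

```mlir
// Atomically add the f32 in %value to %dst[%idx]; out-of-bounds adds
// are dropped by the hardware.
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %dst[%idx]
  : f32 -> memref<128xf32>, i32
```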
}];
let assemblyFormat = [{
attr-dict $value `->` $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasVerifier = 1;
}
def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
let summary = "Barrier that includes a wait for LDS memory operations.";
let description = [{
`amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach
the barrier before any of them may proceed past it) and a wait for all
operations that affect the Local Data Store (LDS) issued from that workgroup
to complete before the workgroup may continue. Since the LDS is per-workgroup
memory, this barrier may be used, for example, to ensure all workitems have
written data to LDS before any workitem attempts to read from it.
Note that `lds_barrier` does **not** force reads from or writes to global memory
to complete before execution continues. Therefore, it should be used when
operations on global memory can be issued far in advance of when their results
are used (for example, by writing them to LDS).
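
Example (an illustrative sketch; `%lds` is assumed to be a workgroup-local
(address space 3) memref, and the surrounding ops and SSA names are
placeholders):

```mlir
// Each workitem stages its result in LDS ...
memref.store %val, %lds[%tid] : memref<256xf32, 3>
// ... and waits for all LDS writes in the workgroup before reading.
amdgpu.lds_barrier
%other = memref.load %lds[%peer] : memref<256xf32, 3>
```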
}];
let assemblyFormat = "attr-dict";
}
#endif // AMDGPU