Skip to content

Commit

Permalink
[AMDGPU] support image load/store a16
Browse files Browse the repository at this point in the history
Our a16 support was only enabled for sample/gather and buffer
load/store, but not for image load/store operations (which take an i16
as the pixel index rather than a half).

Fix our isel lowering and add test cases to prove it out.

Differential Revision: https://reviews.llvm.org/D53750

llvm-svn: 345710
  • Loading branch information
Neil Henning committed Oct 31, 2018
1 parent 262baa4 commit 63718b2
Show file tree
Hide file tree
Showing 7 changed files with 1,146 additions and 2 deletions.
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -4726,9 +4726,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// Check for 16 bit addresses and pack if true.
unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
if (VAddrVT.getScalarType() == MVT::f16 &&
const MVT VAddrScalarVT = VAddrVT.getScalarType();
if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
ST->hasFeature(AMDGPU::FeatureR128A16)) {
IsA16 = true;
const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
SDValue AddrLo, AddrHi;
// Push back extra arguments.
Expand All @@ -4747,7 +4749,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AddrHi = Op.getOperand(i + 1);
i++;
}
AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
{AddrLo, AddrHi});
AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
}
Expand Down
530 changes: 530 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll

Large diffs are not rendered by default.

128 changes: 128 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
@@ -0,0 +1,128 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; 1D image load addressed by a single i16 coordinate (lane 0 of %coords),
; called with dmask 1 and returning <4 x half>. The expected instruction has
; both the a16 and d16 modifiers, takes its address in v0 and writes v[0:1].
; GCN-LABEL: {{^}}load.f16.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; Same 1D i16-coordinate load as the dmask 1 case, but called with dmask 3.
; Only the dmask field of the expected a16 + d16 instruction changes (0x3).
; GCN-LABEL: {{^}}load.v2f16.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 1D i16-coordinate load returning <4 x half>, called with dmask 7.
; The expected a16 + d16 instruction shows dmask 0x7, address v0, result v[0:1].
; GCN-LABEL: {{^}}load.v3f16.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
define amdgpu_ps <4 x half> @load.v3f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 1D i16-coordinate load returning <4 x half>, called with dmask 15.
; The expected a16 + d16 instruction shows dmask 0xf, address v0, result v[0:1].
; GCN-LABEL: {{^}}load.v4f16.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
define amdgpu_ps <4 x half> @load.v4f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 2D image load with i16 x and y coordinates taken from lanes 0 and 1 of
; %coords, called with dmask 1 and returning <4 x half>. The expected a16 + d16
; instruction still uses the single address register v0 and writes v[0:1].
; GCN-LABEL: {{^}}load.f16.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 2D i16-coordinate load (x, y from %coords) returning <4 x half>, called with
; dmask 3. The expected a16 + d16 instruction shows dmask 0x3 with address v0.
; GCN-LABEL: {{^}}load.v2f16.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 2D i16-coordinate load (x, y from %coords) returning <4 x half>, called with
; dmask 7. The expected a16 + d16 instruction shows dmask 0x7 with address v0.
; GCN-LABEL: {{^}}load.v3f16.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
define amdgpu_ps <4 x half> @load.v3f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 2D i16-coordinate load (x, y from %coords) returning <4 x half>, called with
; dmask 15. The expected a16 + d16 instruction shows dmask 0xf with address v0.
; GCN-LABEL: {{^}}load.v4f16.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
define amdgpu_ps <4 x half> @load.v4f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 3D image load with three i16 coordinates: x and y come from %coords_lo and
; z from lane 0 of %coords_hi. Called with dmask 1 and returning <4 x half>.
; The expected a16 + d16 instruction takes the address register pair v[0:1].
; GCN-LABEL: {{^}}load.f16.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 3D i16-coordinate load (x, y from %coords_lo; z from %coords_hi) returning
; <4 x half>, called with dmask 3. The expected a16 + d16 instruction shows
; dmask 0x3 and the address register pair v[0:1].
; GCN-LABEL: {{^}}load.v2f16.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 3D i16-coordinate load (x, y from %coords_lo; z from %coords_hi) returning
; <4 x half>, called with dmask 7. The expected a16 + d16 instruction shows
; dmask 0x7 and the address register pair v[0:1].
; GCN-LABEL: {{^}}load.v3f16.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
define amdgpu_ps <4 x half> @load.v3f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; 3D i16-coordinate load (x, y from %coords_lo; z from %coords_hi) returning
; <4 x half>, called with dmask 15. The expected a16 + d16 instruction shows
; dmask 0xf and the address register pair v[0:1].
; GCN-LABEL: {{^}}load.v4f16.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm a16 d16
define amdgpu_ps <4 x half> @load.v4f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x half> %v
}

; Declarations of the <4 x half> image load intrinsics used by the tests in
; this file, in their i16-coordinate forms (1D, 2D and 3D). They use attribute
; group #1 (nounwind readonly), which is defined below.
declare <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32, i16, <8 x i32>, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
128 changes: 128 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -0,0 +1,128 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; 1D image load addressed by a single i16 coordinate (lane 0 of %coords),
; called with dmask 1 and returning <4 x float>. The expected instruction has
; the a16 modifier, takes its address in v0 and writes v[0:3].
; GCN-LABEL: {{^}}load.f32.1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 1D i16-coordinate load returning <4 x float>, called with dmask 3.
; The expected a16 instruction shows dmask 0x3, address v0, result v[0:3].
; GCN-LABEL: {{^}}load.v2f32.1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 1D i16-coordinate load returning <4 x float>, called with dmask 7.
; The expected a16 instruction shows dmask 0x7, address v0, result v[0:3].
; GCN-LABEL: {{^}}load.v3f32.1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 1D i16-coordinate load returning <4 x float>, called with dmask 15.
; The expected a16 instruction shows dmask 0xf, address v0, result v[0:3].
; GCN-LABEL: {{^}}load.v4f32.1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
define amdgpu_ps <4 x float> @load.v4f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 2D image load with i16 x and y coordinates taken from lanes 0 and 1 of
; %coords, called with dmask 1 and returning <4 x float>. The expected a16
; instruction still uses the single address register v0 and writes v[0:3].
; GCN-LABEL: {{^}}load.f32.2d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 2D i16-coordinate load (x, y from %coords) returning <4 x float>, called
; with dmask 3. The expected a16 instruction shows dmask 0x3 with address v0.
; GCN-LABEL: {{^}}load.v2f32.2d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 2D i16-coordinate load (x, y from %coords) returning <4 x float>, called
; with dmask 7. The expected a16 instruction shows dmask 0x7 with address v0.
; GCN-LABEL: {{^}}load.v3f32.2d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 2D i16-coordinate load (x, y from %coords) returning <4 x float>, called
; with dmask 15. The expected a16 instruction shows dmask 0xf with address v0.
; GCN-LABEL: {{^}}load.v4f32.2d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
define amdgpu_ps <4 x float> @load.v4f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
%v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 3D image load with three i16 coordinates: x and y come from %coords_lo and
; z from lane 0 of %coords_hi. Called with dmask 1 and returning <4 x float>.
; The expected a16 instruction takes the address register pair v[0:1].
; GCN-LABEL: {{^}}load.f32.3d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 3D i16-coordinate load (x, y from %coords_lo; z from %coords_hi) returning
; <4 x float>, called with dmask 3. The expected a16 instruction shows
; dmask 0x3 and the address register pair v[0:1].
; GCN-LABEL: {{^}}load.v2f32.3d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 3D i16-coordinate load (x, y from %coords_lo; z from %coords_hi) returning
; <4 x float>, called with dmask 7. The expected a16 instruction shows
; dmask 0x7 and the address register pair v[0:1].
; GCN-LABEL: {{^}}load.v3f32.3d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; 3D i16-coordinate load (x, y from %coords_lo; z from %coords_hi) returning
; <4 x float>, called with dmask 15. The expected a16 instruction shows
; dmask 0xf and the address register pair v[0:1].
; GCN-LABEL: {{^}}load.v4f32.3d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
define amdgpu_ps <4 x float> @load.v4f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
%z = extractelement <2 x i16> %coords_hi, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
}

; Declarations of the <4 x float> image load intrinsics used by the tests in
; this file, in their i16-coordinate forms (1D, 2D and 3D). They use attribute
; group #1 (nounwind readonly), which is defined below.
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

0 comments on commit 63718b2

Please sign in to comment.