[AMDGPU] support image load/store a16

Our a16 support was only enabled for sample/gather and buffer load/store, but not for image load/store operations (which take an i16 as the pixel index rather than a half). Fix our isel lowering and add test cases to prove it out. Differential Revision: https://reviews.llvm.org/D53750 llvm-svn: 345710
llvm · Oct 31, 2018 · 63718b2 · 63718b2
1 parent 262baa4
commit 63718b2
Show file tree

Hide file tree

Showing 7 changed files with 1,146 additions and 2 deletions.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4726,9 +4726,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   // Check for 16 bit addresses and pack if true.
   unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
   MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
-  if (VAddrVT.getScalarType() == MVT::f16 &&
+  const MVT VAddrScalarVT = VAddrVT.getScalarType();
+  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
       ST->hasFeature(AMDGPU::FeatureR128A16)) {
     IsA16 = true;
+    const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
     for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
       SDValue AddrLo, AddrHi;
       // Push back extra arguments.
@@ -4747,7 +4749,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
           AddrHi = Op.getOperand(i + 1);
           i++;
         }
-        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
+        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
                              {AddrLo, AddrHi});
         AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
       }

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load.f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+declare <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32, i16, i16, <8 x i32>, i32, i32) #2
+declare <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load.f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }