212 changes: 212 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
; GFX9-LABEL: sample_cd_1d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
; GFX9-LABEL: sample_cd_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0
; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
; GFX9-LABEL: sample_c_cd_1d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
; GFX9-LABEL: sample_c_cd_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1
; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
; GFX9-LABEL: sample_cd_cl_1d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
; GFX9-LABEL: sample_cd_cl_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
; GFX9-LABEL: sample_c_cd_cl_1d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
; GFX9-LABEL: sample_c_cd_cl_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v11, v7
; GFX9-NEXT: v_mov_b32_e32 v7, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0
; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
178 changes: 178 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
; VERDE-LABEL: sample_cd_1d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_cd_1d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
; VERDE-LABEL: sample_cd_2d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_cd_2d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
; VERDE-LABEL: sample_c_cd_1d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_cd_1d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
; VERDE-LABEL: sample_c_cd_2d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_cd_2d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) {
; VERDE-LABEL: sample_cd_cl_1d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_cd_cl_1d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
; VERDE-LABEL: sample_cd_cl_2d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_cd_cl_2d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) {
; VERDE-LABEL: sample_c_cd_cl_1d:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_cd_cl_1d:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
121 changes: 121 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s

define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_c_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
121 changes: 121 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_c_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
151 changes: 84 additions & 67 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; TONGA-LABEL: image_sample_2d_f16:
Expand Down Expand Up @@ -32,14 +33,14 @@ define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_2d_f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_2d_f16:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
ret half %tex
Expand Down Expand Up @@ -109,6 +110,22 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: global_store_dword v4, v3, s[12:13]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_sample_2d_f16_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s14, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: v_mov_b32_e32 v3, v5
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: global_store_b32 v4, v3, s[12:13]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ; return to shader part epilog
main_body:
%tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {half, i32} %tex, 0
Expand Down Expand Up @@ -138,11 +155,11 @@ define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_c_d_1d_v2f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%r = bitcast <2 x half> %tex to float
Expand Down Expand Up @@ -182,15 +199,15 @@ define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsr
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<2 x half>, i32} %tex, 0
Expand Down Expand Up @@ -233,14 +250,14 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v3f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_b_2d_v3f16:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
Expand Down Expand Up @@ -296,20 +313,20 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc,
; GFX9-NEXT: v_mov_b32_e32 v2, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<3 x half>, i32} %tex, 0
Expand Down Expand Up @@ -357,14 +374,14 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v4f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_b_2d_v4f16:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%r = bitcast <4 x half> %tex to <2 x float>
Expand Down Expand Up @@ -421,20 +438,20 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc,
; GFX9-NEXT: v_mov_b32_e32 v2, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<4 x half>, i32} %tex, 0
Expand Down
1,247 changes: 556 additions & 691 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll

Large diffs are not rendered by default.

222 changes: 107 additions & 115 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll

Large diffs are not rendered by default.

116 changes: 1 addition & 115 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
Expand Down Expand Up @@ -124,112 +125,6 @@ main_body:
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_c_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
ret <4 x float> %v
}

define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
Expand Down Expand Up @@ -278,15 +173,6 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, h
declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
; GFX9-LABEL: store_f16_1d:
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
; GFX9-LABEL: store_f32_1d:
Expand Down
234 changes: 191 additions & 43 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s

; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
Expand Down Expand Up @@ -38,26 +39,49 @@ main_body:
}

define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_a16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s15, s12
; GCN-NEXT: s_mov_b32 s12, s9
; GCN-NEXT: s_lshr_b32 s9, s7, 16
; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GCN-NEXT: s_pack_ll_b32_b16 s7, s9, s8
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: s_mov_b32 s14, s11
; GCN-NEXT: s_mov_b32 s13, s10
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
; GFX10-LABEL: image_bvh_intersect_ray_a16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s15, s12
; GFX10-NEXT: s_mov_b32 s12, s9
; GFX10-NEXT: s_lshr_b32 s9, s7, 16
; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s7, s9, s8
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: v_mov_b32_e32 v6, s6
; GFX10-NEXT: v_mov_b32_e32 v7, s7
; GFX10-NEXT: s_mov_b32 s14, s11
; GFX10-NEXT: s_mov_b32 s13, s10
; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_bvh_intersect_ray_a16:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: s_lshr_b32 s2, s7, 16
; GFX11-NEXT: s_lshr_b32 s3, s5, 16
; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s7
; GFX11-NEXT: s_pack_ll_b32_b16 s4, s6, s8
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: v_mov_b32_e32 v4, s2
; GFX11-NEXT: v_mov_b32_e32 v5, s4
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: v_mov_b32_e32 v7, s1
; GFX11-NEXT: s_mov_b32 s15, s12
; GFX11-NEXT: s_mov_b32 s14, s11
; GFX11-NEXT: s_mov_b32 s13, s10
; GFX11-NEXT: s_mov_b32 s12, s9
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
Expand Down Expand Up @@ -89,27 +113,51 @@ main_body:
}

define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_a16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s14, s12
; GCN-NEXT: s_mov_b32 s12, s10
; GCN-NEXT: s_lshr_b32 s10, s8, 16
; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GCN-NEXT: s_pack_ll_b32_b16 s8, s10, s9
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: v_mov_b32_e32 v8, s8
; GCN-NEXT: s_mov_b32 s15, s13
; GCN-NEXT: s_mov_b32 s13, s11
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
; GFX10-LABEL: image_bvh64_intersect_ray_a16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s14, s12
; GFX10-NEXT: s_mov_b32 s12, s10
; GFX10-NEXT: s_lshr_b32 s10, s8, 16
; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s9
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: v_mov_b32_e32 v6, s6
; GFX10-NEXT: v_mov_b32_e32 v7, s7
; GFX10-NEXT: v_mov_b32_e32 v8, s8
; GFX10-NEXT: s_mov_b32 s15, s13
; GFX10-NEXT: s_mov_b32 s13, s11
; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_lshr_b32 s0, s8, 16
; GFX11-NEXT: s_lshr_b32 s3, s6, 16
; GFX11-NEXT: v_mov_b32_e32 v7, s1
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0
; GFX11-NEXT: s_pack_ll_b32_b16 s1, s6, s8
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9
; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: v_mov_b32_e32 v2, s5
; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: v_mov_b32_e32 v5, s3
; GFX11-NEXT: v_mov_b32_e32 v8, s2
; GFX11-NEXT: s_mov_b32 s15, s13
; GFX11-NEXT: s_mov_b32 s14, s12
; GFX11-NEXT: s_mov_b32 s13, s11
; GFX11-NEXT: s_mov_b32 s12, s10
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
Expand Down Expand Up @@ -172,6 +220,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 4.0
; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v7, 1.0
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT: flat_load_b32 v9, v[0:1]
; GFX11-NEXT: flat_load_b32 v10, v[2:3]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
Expand Down Expand Up @@ -240,6 +315,30 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
Expand Down Expand Up @@ -312,6 +411,32 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000
; GFX11-NEXT: v_mov_b32_e32 v4, 4.0
; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v7, 1.0
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7
; GFX11-NEXT: v_mov_b32_e32 v10, 0x102
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
Expand Down Expand Up @@ -376,6 +501,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX11-NEXT: v_mov_b32_e32 v7, 0x102
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
Expand Down
490 changes: 490 additions & 0 deletions llvm/test/CodeGen/AMDGPU/merge-image-load-gfx11.mir

Large diffs are not rendered by default.

1,013 changes: 1,013 additions & 0 deletions llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir

Large diffs are not rendered by default.