1,430 changes: 1,301 additions & 129 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll


@@ -688,7 +688,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
%0:sgpr(<3 x s64>) = G_IMPLICIT_DEF
%1:sgpr(<3 x s64>) = G_IMPLICIT_DEF
@@ -296,7 +296,7 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr3_sgpr4_sgpr5
; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8
; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_96 = COPY $sgpr9_sgpr10_sgpr11
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11
; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2
; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5
; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8
@@ -332,7 +332,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_192 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_192 = COPY $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_384 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2
; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5
; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8
138 changes: 63 additions & 75 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll


12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -1,7 +1,7 @@
# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
# Check that %11 and %20 have been coalesced.
# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]]
# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]]
# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG:[0-9]+]]
# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG]]
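# Note on the FileCheck idiom above: %[[REG:[0-9]+]] captures the virtual
# register number at the first IMAGE_SAMPLE, and the bare %[[REG]] on the
# next line matches only if the second IMAGE_SAMPLE reads that same
# register, which is what proves %11 and %20 were each coalesced. The
# V16 -> V11 suffix change reflects the vaddr operand shrinking from a
# padded 16-register tuple to an exact 11-register one (the new vreg_352
# class used below).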

---
name: main
@@ -17,9 +17,9 @@ registers:
- { id: 6, class: sgpr_128 }
- { id: 7, class: sgpr_512 }
- { id: 9, class: vreg_512 }
- { id: 11, class: vreg_512 }
- { id: 11, class: vreg_352 }
- { id: 18, class: vgpr_32 }
- { id: 20, class: vreg_512 }
- { id: 20, class: vreg_352 }
- { id: 27, class: vgpr_32 }
liveins:
- { reg: '$sgpr2_sgpr3', virtual-reg: '%0' }
@@ -61,7 +61,7 @@ body: |
%11.sub6 = COPY %1
%11.sub7 = COPY %1
%11.sub8 = COPY %1
dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
dead %18 = IMAGE_SAMPLE_C_D_O_V1_V11 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
%20.sub1 = COPY %2
%20.sub2 = COPY %2
%20.sub3 = COPY %2
@@ -70,6 +70,6 @@ body: |
%20.sub6 = COPY %2
%20.sub7 = COPY %2
%20.sub8 = COPY %2
dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
dead %27 = IMAGE_SAMPLE_C_D_O_V1_V11 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32))
...
18 changes: 15 additions & 3 deletions llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -171,14 +171,20 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s26, -1
; SI-NEXT: s_mov_b32 s27, 0xe8f000
; SI-NEXT: s_add_u32 s24, s24, s3
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_addc_u32 s25, s25, 0
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s22, s10
; SI-NEXT: s_mov_b32 s23, s11
@@ -197,24 +203,30 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
;
; VI-LABEL: test_copy_v4i8_x4:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s22, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s22, s10
; VI-NEXT: s_mov_b32 s23, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
20 changes: 16 additions & 4 deletions llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
@@ -7,43 +7,55 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
; RRLIST-LABEL: sccClobber:
; RRLIST: ; %bb.0: ; %entry
; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; RRLIST-NEXT: v_mov_b32_e32 v2, 0
; RRLIST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; RRLIST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; RRLIST-NEXT: s_mov_b32 s22, -1
; RRLIST-NEXT: s_mov_b32 s23, 0xe00000
; RRLIST-NEXT: s_add_u32 s20, s20, s3
; RRLIST-NEXT: s_waitcnt lgkmcnt(0)
; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44
; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0
; RRLIST-NEXT: s_addc_u32 s21, s21, 0
; RRLIST-NEXT: s_waitcnt lgkmcnt(0)
; RRLIST-NEXT: s_min_i32 s4, s16, 0
; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; RRLIST-NEXT: s_min_i32 s4, s16, 0
; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec
; RRLIST-NEXT: s_cselect_b32 s0, s16, s17
; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3]
; RRLIST-NEXT: s_cselect_b32 s0, s4, s0
; RRLIST-NEXT: v_mov_b32_e32 v2, 0
; RRLIST-NEXT: v_mov_b32_e32 v0, s0
; RRLIST-NEXT: global_store_dword v2, v0, s[14:15]
; RRLIST-NEXT: s_endpgm
;
; FAST-LABEL: sccClobber:
; FAST: ; %bb.0: ; %entry
; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; FAST-NEXT: v_mov_b32_e32 v2, 0
; FAST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; FAST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; FAST-NEXT: s_mov_b32 s22, -1
; FAST-NEXT: s_mov_b32 s23, 0xe00000
; FAST-NEXT: s_add_u32 s20, s20, s3
; FAST-NEXT: s_waitcnt lgkmcnt(0)
; FAST-NEXT: s_load_dword s16, s[8:9], 0x0
; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44
; FAST-NEXT: s_load_dword s17, s[10:11], 0x0
; FAST-NEXT: s_addc_u32 s21, s21, 0
; FAST-NEXT: s_waitcnt lgkmcnt(0)
; FAST-NEXT: s_min_i32 s4, s16, 0
; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; FAST-NEXT: s_min_i32 s4, s16, 0
; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; FAST-NEXT: s_and_b64 s[0:1], vcc, exec
; FAST-NEXT: s_cselect_b32 s0, s16, s17
; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3]
; FAST-NEXT: s_cselect_b32 s0, s4, s0
; FAST-NEXT: v_mov_b32_e32 v2, 0
; FAST-NEXT: v_mov_b32_e32 v0, s0
; FAST-NEXT: global_store_dword v2, v0, s[14:15]
; FAST-NEXT: s_endpgm
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -310,7 +310,7 @@ define <4 x i64> @v4i64_func_void() #0 {
; GCN-LABEL: {{^}}v5i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx2 v[8:9], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i64> @v5i64_func_void() #0 {
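; A <5 x i64> return value is ten 32-bit dwords, so the loads above split
; as 4 + 4 + 2: the tail load shrinks from a padded dwordx4 into v[8:11]
; to an exact dwordx2 into v[8:9].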
7 changes: 2 additions & 5 deletions llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -385,22 +385,19 @@ bb7: ; preds = %bb4, %bb1
; GCN: s_load_dword [[ARG:s[0-9]+]]

; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: s_waitcnt
; MOVREL: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
; MOVREL: s_mov_b32 m0, -1


; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; IDXMODE: s_waitcnt
; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
; IDXMODE: s_set_gpr_idx_off
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
; IDXMODE: s_set_gpr_idx_off

39 changes: 23 additions & 16 deletions llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -610,10 +610,16 @@ entry:
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
; GCN-LABEL: double5_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: s_mov_b32 s19, 0xe80000
; GCN-NEXT: s_add_u32 s16, s16, s3
; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84
; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s12, 4
; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
@@ -622,10 +628,8 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
; GCN-NEXT: s_cmp_eq_u32 s12, 0
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: s_cselect_b32 s8, 0x3ff00000, s1
; GCN-NEXT: s_cselect_b32 s9, 0, s0
; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1
; GCN-NEXT: s_cselect_b32 s14, 0, s0
; GCN-NEXT: s_cmp_eq_u32 s12, 3
; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
; GCN-NEXT: s_cselect_b32 s1, 0, s6
@@ -636,23 +640,26 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
; GCN-NEXT: s_add_u32 s0, s10, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_addc_u32 s1, s11, 0
; GCN-NEXT: v_mov_b32_e32 v7, s1
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v0, s9
; GCN-NEXT: v_mov_b32_e32 v1, s8
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: v_mov_b32_e32 v4, s10
; GCN-NEXT: s_add_u32 s0, s10, 32
; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_add_u32 s0, s10, 32
; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NEXT: v_mov_b32_e32 v5, s11
; GCN-NEXT: s_addc_u32 s1, s11, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
670 changes: 623 additions & 47 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll


9 changes: 6 additions & 3 deletions llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -1,19 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
; Make sure the expected regmask is generated for sub/superregisters.

; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
define void @csr() #0 {
call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
ret void
}
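; Clobbering a single VGPR also clobbers every register tuple that
; overlaps it, so the expected csr list names all super-registers of v0.
; The entries added here are the 9-, 10-, 11- and 12-register tuples
; ($vgpr0..$vgpr8 through $vgpr0..$vgpr11) that come with the new
; 288-384 bit register classes.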

; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @subregs_for_super() #0 {
call void asm sideeffect "", "~{v0},~{v1}"() #0
ret void
}

; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @clobbered_reg_with_sub() #0 {
call void asm sideeffect "", "~{v[0:1]}"() #0
ret void
@@ -44,3 +45,5 @@ define void @vcc() #0 {
i8* bitcast (void ()* @vcc to i8*)]

attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
102 changes: 50 additions & 52 deletions llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -2286,57 +2286,56 @@ define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5
;
; VI-LABEL: v5i64_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_u32 s8, s2, 16
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: s_addc_u32 s9, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: s_add_u32 s12, s8, 32
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_addc_u32 s13, s9, 0
; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: s_add_u32 s4, s8, 16
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s9, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5i64_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5i64_arg:
@@ -2429,57 +2428,56 @@ define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out,
;
; VI-LABEL: v5f64_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_u32 s8, s2, 16
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: s_addc_u32 s9, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: s_add_u32 s12, s8, 32
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_addc_u32 s13, s9, 0
; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: s_add_u32 s4, s8, 16
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s9, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5f64_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5f64_arg:
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -32,9 +32,9 @@ main_body:
}

; GCN-LABEL: {{^}}sample_d_3d:
; GFX1010-NSA: image_sample_d v[0:3], v[7:22],
; GFX1010-NSA: image_sample_d v[0:3], v[7:15],
; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
; GFX11-NSA: image_sample_d v[0:3], v[7:22],
; GFX11-NSA: image_sample_d v[0:3], v[7:15],
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1568,19 +1568,19 @@ main_body:
define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
; VERDE-LABEL: sample_c_d_o_2darray_V1:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
; VERDE-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_d_o_2darray_V1:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
; GFX6789-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_c_d_o_2darray_V1:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
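; The vaddr here is 1 offset + 1 zcompare + 4 derivatives (dsdh, dtdh,
; dsdv, dtdv) + 3 coordinates (s, t, slice) = 9 VGPRs, so the operand is
; now the exact v[0:8] tuple instead of a padded v[0:15].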
@@ -1593,7 +1593,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v9, 0
; VERDE-NEXT: v_mov_b32_e32 v10, v9
; VERDE-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
; VERDE-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da
; VERDE-NEXT: s_mov_b32 s15, 0xf000
; VERDE-NEXT: s_mov_b32 s14, -1
; VERDE-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1608,7 +1608,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX6789-NEXT: v_mov_b32_e32 v12, v11
; GFX6789-NEXT: v_mov_b32_e32 v9, v11
; GFX6789-NEXT: v_mov_b32_e32 v10, v12
; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: v_mov_b32_e32 v0, v9
; GFX6789-NEXT: global_store_dword v11, v10, s[12:13]
Expand All @@ -1621,7 +1621,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX10-NEXT: v_mov_b32_e32 v12, v11
; GFX10-NEXT: v_mov_b32_e32 v9, v11
; GFX10-NEXT: v_mov_b32_e32 v10, v12
; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v9
; GFX10-NEXT: global_store_dword v11, v10, s[12:13]
Expand All @@ -1633,7 +1633,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v9
; GFX11-NEXT: global_store_b32 v11, v10, s[12:13]
@@ -1650,19 +1650,19 @@ main_body:
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
; VERDE-LABEL: sample_c_d_o_2darray_V2:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: ; return to shader part epilog
;
; GFX6789-LABEL: sample_c_d_o_2darray_V2:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
@@ -1676,7 +1676,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; VERDE-NEXT: v_mov_b32_e32 v9, 0
; VERDE-NEXT: v_mov_b32_e32 v10, v9
; VERDE-NEXT: v_mov_b32_e32 v11, v9
; VERDE-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da
; VERDE-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: v_mov_b32_e32 v0, v9
; VERDE-NEXT: v_mov_b32_e32 v1, v10
Expand All @@ -1688,7 +1688,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX6789-NEXT: v_mov_b32_e32 v9, 0
; GFX6789-NEXT: v_mov_b32_e32 v10, v9
; GFX6789-NEXT: v_mov_b32_e32 v11, v9
; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da
; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: v_mov_b32_e32 v0, v9
; GFX6789-NEXT: v_mov_b32_e32 v1, v10
Expand All @@ -1700,7 +1700,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, v9
; GFX10-NEXT: v_mov_b32_e32 v11, v9
; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v9
; GFX10-NEXT: v_mov_b32_e32 v1, v10
Expand All @@ -1712,7 +1712,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX11-NEXT: v_mov_b32_e32 v9, 0
; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v11
; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll
@@ -186,7 +186,7 @@ main_body:
}

; GCN-LABEL: {{^}}sample_c_d_cl_o_2d:
; GCN: image_sample_c_d_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}}
; GCN: image_sample_c_d_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_c_d_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -250,7 +250,7 @@ main_body:
}

; GCN-LABEL: {{^}}sample_c_cd_cl_o_2d:
; GCN: image_sample_c_cd_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}}
; GCN: image_sample_c_cd_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_c_cd_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -20,7 +20,7 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
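; The ray payload is 1 node_ptr + 1 ray_extent + 3 origin + 3 dir +
; 3 inv_dir = 11 VGPRs, hence the exact v[0:10] tuple in place of the old
; padded v[0:15]. The bvh64 variant below carries a 64-bit node pointer
; (12 VGPRs, v[0:11]), and the a16 forms, which pack direction components
; as halves, appear to need only 9 (v[0:8]).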
@@ -90,7 +90,7 @@ main_body:
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -128,7 +128,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr,
; GFX10-NEXT: v_mov_b32_e32 v8, s8
; GFX10-NEXT: s_mov_b32 s15, s13
; GFX10-NEXT: s_mov_b32 s13, s11
; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[12:15] a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -182,7 +182,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -208,7 +208,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -370,7 +370,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -396,7 +396,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
@@ -461,7 +461,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -484,7 +484,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
98 changes: 77 additions & 21 deletions llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -9,7 +9,7 @@
; GCN: s_load_dword s{{[0-9]+}}

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load i32, i32 addrspace(4)* %in
store i32 %ld, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
; GCN: s_load_dwordx2

; EG: VTX_READ_64
define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -31,7 +31,7 @@ entry:
; GCN: s_load_dwordx4

; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(4)* %in
store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
; GCN: s_load_dwordx4

; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -54,21 +54,77 @@ entry:

; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}constant_load_v9i32:
; GCN: s_load_dword
; GCN: s_load_dwordx8

; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_32
define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <9 x i32>, <9 x i32> addrspace(4)* %in
store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
ret void
}
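; A <9 x i32> constant load legalizes as an 8-dword block plus the
; remainder, hence s_load_dwordx8 + s_load_dword above. The v10/v11/v12
; variants below follow the same pattern: 8 + 2, 8 + 4 (the 3-dword tail
; presumably rounds up because there is no s_load_dwordx3), and 8 + 4.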

; FUNC-LABEL: {{^}}constant_load_v10i32:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx8

; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <10 x i32>, <10 x i32> addrspace(4)* %in
store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}constant_load_v11i32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx8

; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <11 x i32>, <11 x i32> addrspace(4)* %in
store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}constant_load_v12i32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx8

; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <12 x i32>, <12 x i32> addrspace(4)* %in
store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}constant_load_v16i32:
; GCN: s_load_dwordx16

; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -83,7 +139,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
; EG: CF_END
; EG: VTX_READ_32
define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load i32, i32 addrspace(4)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -100,7 +156,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out,
; EG: VTX_READ_32
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load i32, i32 addrspace(4)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -110,7 +166,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out,
; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
; GCN: s_load_dword
; GCN: store_dwordx2
define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -121,7 +177,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(
; GCN: s_load_dword s[[LO:[0-9]+]]
; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
; GCN: store_dwordx2
define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(4)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -131,7 +187,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(
; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: store_dwordx4
define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -145,7 +201,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(
; GCN-DAG: s_ashr_i32

; GCN: store_dwordx4
define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(4)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -157,7 +213,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(

; GCN: store_dwordx4
; GCN: store_dwordx4
define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -174,7 +230,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(

; GCN: store_dwordx4
; GCN: store_dwordx4
define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(4)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -193,7 +249,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -221,7 +277,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(4)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -242,7 +298,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(
; GCN: store_dwordx4
; GCN: store_dwordx4
; GCN: store_dwordx4
define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
Expand All @@ -269,7 +325,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspa
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(4)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -321,7 +377,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspa
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4

define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -372,7 +428,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspa
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -424,7 +480,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspa
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @constant_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 {
define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(4)* %in
store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
ret void
88 changes: 82 additions & 6 deletions llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -10,7 +10,7 @@
; GCN-HSA: flat_load_dword

; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load float, float addrspace(1)* %in
store float %tmp0, float addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
; GCN-HSA: flat_load_dwordx2

; R600: VTX_READ_64
define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
; GCNX3-HSA: flat_load_dwordx3

; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
; GCN-HSA: flat_load_dwordx4

; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
@@ -62,13 +62,89 @@ entry:

; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v9f32:
; GCN-NOHSA: buffer_load_dword
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dword
; GCN-HSA: flat_load_dwordx4

; R600: VTX_READ_128
; R600: VTX_READ_32
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <9 x float>, <9 x float> addrspace(1)* %in
store <9 x float> %tmp0, <9 x float> addrspace(1)* %out
ret void
}
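; An illustrative note, not generated output: <9 x float> is 9 x 32 = 288
; bits, which the checks above expect to be covered as 128 + 128 + 32,
; i.e. two dwordx4 loads plus a single dword load.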


; FUNC-LABEL: {{^}}global_load_v10f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx2

; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in
store <10 x float> %tmp0, <10 x float> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v11f32:
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx3
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx3

; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in
store <11 x float> %tmp0, <11 x float> addrspace(1)* %out
ret void
}
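; The subtarget split differs here because SI has no dwordx3 buffer loads:
; 11 x 32 = 352 bits is covered as 128 + 128 + 96 where dwordx3 exists
; (GCNX3, HSA), while SI rounds the 96-bit tail up to a third dwordx4.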

; FUNC-LABEL: {{^}}global_load_v12f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in
store <12 x float> %tmp0, <12 x float> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v16f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
@@ -84,7 +160,7 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
102 changes: 81 additions & 21 deletions llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -10,7 +10,7 @@
; GCN-HSA: {{flat|global}}_load_dword

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load i32, i32 addrspace(1)* %in
store i32 %ld, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx2

; EG: VTX_READ_64
define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
; GCNX3-HSA: {{flat|global}}_load_dwordx3

; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
; GCN-HSA: {{flat|global}}_load_dwordx4

; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -62,13 +62,73 @@ entry:

; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v9i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dword
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <9 x i32>, <9 x i32> addrspace(1)* %in
store <9 x i32> %ld, <9 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v10i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx2
define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <10 x i32>, <10 x i32> addrspace(1)* %in
store <10 x i32> %ld, <10 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v11i32:
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx3
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx3
define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <11 x i32>, <11 x i32> addrspace(1)* %in
store <11 x i32> %ld, <11 x i32> addrspace(1)* %out
ret void
}


; FUNC-LABEL: {{^}}global_load_v12i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_load_dwordx4
define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <12 x i32>, <12 x i32> addrspace(1)* %in
store <12 x i32> %ld, <12 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}global_load_v16i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
@@ -84,7 +144,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -100,7 +160,7 @@ entry:
; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]

; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load i32, i32 addrspace(1)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -119,7 +179,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i3
; EG: VTX_READ_32
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load i32, i32 addrspace(1)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -132,7 +192,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i3

; GCN-HSA: {{flat|global}}_load_dword
; GCN-HSA: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -145,7 +205,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -158,7 +218,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)

; GCN-HSA: {{flat|global}}_load_dwordx2
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -174,7 +234,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)

; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -189,7 +249,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)
; GCN-HSA: {{flat|global}}_load_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -210,7 +270,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)

; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -233,7 +293,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -265,7 +325,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -311,7 +371,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)
; GCN-DAG: v_ashrrev_i32
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -346,7 +406,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
; GCN-HSA: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -446,7 +506,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4

define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -513,7 +573,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -581,7 +641,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
define amdgpu_kernel void @global_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
store <32 x i32> %ld, <32 x i32> addrspace(1)* %out
ret void
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir
@@ -714,7 +714,7 @@ body: |


# GFX11-LABEL: name: image_sample_c_d_cl_o_merged_v1v3
# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V16_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V9_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3

@@ -726,9 +726,9 @@ body: |
%2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
%3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0
%4:vgpr_32 = COPY %2.sub3
%5:vreg_512 = IMPLICIT_DEF
%6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
%7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
%5:vreg_288 = IMPLICIT_DEF
%6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
%7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
...
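# A rough count of why V9 fits (my reading of the MIMG address layout, not
# something this test asserts): sample_c_d_cl_o in 2D packs offset (1) +
# compare (1) + four derivatives + two coordinates + clamp (1) = 9 address
# dwords, so the new 288-bit tuple holds the payload exactly instead of
# padding out to vreg_512.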
---

30 changes: 27 additions & 3 deletions llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -5,14 +5,19 @@
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s26, -1
; SI-NEXT: s_mov_b32 s27, 0xe8f000
; SI-NEXT: s_add_u32 s24, s24, s3
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
@@ -34,6 +39,7 @@ define amdgpu_kernel void @select_f16(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_addc_u32 s25, s25, 0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
@@ -46,14 +52,19 @@ define amdgpu_kernel void @select_f16(
;
; VI-LABEL: select_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s26, -1
; VI-NEXT: s_mov_b32 s27, 0xe80000
; VI-NEXT: s_add_u32 s24, s24, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s6
; VI-NEXT: s_mov_b32 s17, s7
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s8
; VI-NEXT: s_mov_b32 s21, s9
@@ -75,6 +86,7 @@ define amdgpu_kernel void @select_f16(
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_addc_u32 s25, s25, 0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
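; The added s_mov/s_add_u32/s_addc_u32 lines build the private-segment
; (scratch) buffer descriptor, as I read the boilerplate: SCRATCH_RSRC_DWORD0/1
; seed the 64-bit base, -1 fills NUM_RECORDS, 0xe8f000 (SI) or 0xe80000 (VI)
; supplies the format/flags word, and the add/addc pair folds the wave's
; scratch offset from s3 into the base.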
@@ -420,14 +432,19 @@ entry:
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s26, -1
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s27, 0xe8f000
; SI-NEXT: s_add_u32 s24, s24, s3
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
@@ -445,6 +462,7 @@ define amdgpu_kernel void @select_v2f16(
; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_addc_u32 s25, s25, 0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
@@ -474,8 +492,13 @@
;
; VI-LABEL: select_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: s_mov_b32 s26, -1
; VI-NEXT: s_mov_b32 s27, 0xe80000
; VI-NEXT: s_add_u32 s24, s24, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s14, s2
@@ -499,6 +522,7 @@ define amdgpu_kernel void @select_v2f16(
; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_addc_u32 s25, s25, 0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
16 changes: 14 additions & 2 deletions llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -67,14 +67,19 @@ entry:
define amdgpu_kernel void @madak_f16_use_2(
; SI-LABEL: madak_f16_use_2:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s22, -1
; SI-NEXT: s_mov_b32 s23, 0xe8f000
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_add_u32 s20, s20, s3
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s8
; SI-NEXT: s_mov_b32 s17, s9
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
@@ -91,6 +96,7 @@ define amdgpu_kernel void @madak_f16_use_2(
; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_addc_u32 s21, s21, 0
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -106,14 +112,19 @@
;
; VI-LABEL: madak_f16_use_2:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s22, -1
; VI-NEXT: s_mov_b32 s23, 0xe80000
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: s_add_u32 s20, s20, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: s_mov_b32 s17, s9
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
@@ -130,6 +141,7 @@ define amdgpu_kernel void @madak_f16_use_2(
; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_addc_u32 s21, s21, 0
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
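; A side note on the constants (mine, not the test's): the v_madak_f16
; literal 0x4900 is 10.0 in IEEE half precision, and 0x41200000 on the SI
; path is the same 10.0 as a single-precision float, since SI promotes
; f16 arithmetic to f32.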
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
@@ -8,11 +8,11 @@ body: |
; GCN-LABEL: name: waitcnt-check-inorder
; GCN: S_WAITCNT 0
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
S_ENDPGM 0
...
---
@@ -22,11 +22,11 @@ body: |
; GCN-LABEL: name: waitcnt-check-vs-vmem
; GCN: S_WAITCNT 0
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_WAITCNT 16240
; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
@@ -37,11 +37,11 @@ body: |
; GCN-LABEL: name: waitcnt-check-vs-mimg-samp
; GCN: S_WAITCNT 0
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_WAITCNT 16240
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
; GCN-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
S_ENDPGM 0
...
@@ -54,10 +54,10 @@ body: |
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 16240
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
S_ENDPGM 0
...
---
@@ -69,9 +69,9 @@ body: |
; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
; GCN-NEXT: S_WAITCNT 16240
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
; GCN-NEXT: S_ENDPGM 0
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
S_ENDPGM 0
...
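# Payload arithmetic behind the shrunken tuples in this file, as I count
# the gfx10 BVH operands (not asserted by the test): node_ptr (1) +
# ray_extent (1) + ray_origin (3) + ray_dir (3) + ray_inv_dir (3) = 11
# address dwords for image_bvh_intersect_ray, hence vgpr4 through vgpr14;
# the 64-bit node pointer of the bvh64 form makes it 12.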
16 changes: 8 additions & 8 deletions llvm/test/MC/AMDGPU/gfx1013.s
@@ -1,28 +1,28 @@
// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck %s

image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11]
image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11]
// CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x02,0x00]

image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] a16
// CHECK: [0x01,0x9f,0x9c,0xf1,0xf0,0x05,0x02,0x40]
image_bvh64_intersect_ray v[5:8], v[247:255], s[8:11] a16
// CHECK: [0x01,0x9f,0x9c,0xf1,0xf7,0x05,0x02,0x40]

image_bvh64_intersect_ray v[5:8], v[1:16], ttmp[12:15]
image_bvh64_intersect_ray v[5:8], v[1:12], ttmp[12:15]
// CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x1e,0x00]

image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15]
// CHECK: encoding: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]
// CHECK: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]

image_bvh_intersect_ray v[252:255], v[1:16], s[8:11]
image_bvh_intersect_ray v[252:255], v[1:11], s[8:11]
// CHECK: [0x01,0x9f,0x98,0xf1,0x01,0xfc,0x02,0x00]

image_bvh_intersect_ray v[5:8], v[248:255], s[8:11] a16
// CHECK: [0x01,0x9f,0x98,0xf1,0xf8,0x05,0x02,0x40]
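// With a16 the direction vectors are packed as halves while the origin
// stays f32, by my count: 1 (node) + 1 (extent) + 3 (origin) + 3 (packed
// dir + inv_dir) = 8 dwords, hence v[248:255] here, and the 64-bit node
// pointer makes it 9 for the bvh64 form above (v[247:255]).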

image_bvh_intersect_ray v[5:8], v[1:16], ttmp[12:15]
image_bvh_intersect_ray v[5:8], v[1:11], ttmp[12:15]
// CHECK: [0x01,0x9f,0x98,0xf1,0x01,0x05,0x1e,0x00]

image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
// CHECK: encoding: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]
// CHECK: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]

image_msaa_load v[5:6], v[1:4], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
// CHECK: [0x39,0x03,0x00,0xf0,0x01,0x05,0x02,0x00]
6 changes: 3 additions & 3 deletions llvm/test/MC/AMDGPU/gfx1030_new.s
@@ -84,16 +84,16 @@ v_fmac_legacy_f32 v0, |v1|, -v2
v_fmac_legacy_f32 v0, s1, 2.0
// GFX10: encoding: [0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00]

image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
image_bvh_intersect_ray v[4:7], v[9:19], s[4:7]
// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00]

image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40]

image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7]
// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00]

image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16
// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40]

image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
4 changes: 2 additions & 2 deletions llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
@@ -298,8 +298,8 @@ image_sample_d v[64:66], v[32:37], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG
image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x14,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x00,0x14,0x15]

image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
; GFX10: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03]
image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D
; GFX10: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03]
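; For 3D sample_d the address payload is 6 derivative dwords (3 per
; direction) plus 3 coordinates = 9, by my count, so the contiguous form
; now names v[32:40] exactly, mirroring the 9-register NSA list above.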

image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE
; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x1c,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x05,0x00,0x00]
4 changes: 2 additions & 2 deletions llvm/test/MC/AMDGPU/gfx10_unsupported.s
@@ -761,10 +761,10 @@ global_store_d16_hi_b8 v1, v2, s[104:105]
global_store_dword_addtid v1, off offset:16 glc slc dlc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16
image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16
// GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

image_bvh_intersect_ray v[252:255], v[1:16], s[8:11]
image_bvh_intersect_ray v[252:255], v[1:11], s[8:11]
// GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

image_msaa_load v14, [v204,v11,v14,v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY