170 changes: 58 additions & 112 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s

; There aren't any stack objects, but we still enable the
; private_segment_wavefront_offset to get to 16, and the workgroup ID
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@
; GCN-O1-NEXT: Branch Probability Basic Block Placement
; GCN-O1-NEXT: Insert fentry calls
; GCN-O1-NEXT: Insert XRay ops
; GCN-O1-NEXT: GCN Create VOPD Instructions
; GCN-O1-NEXT: SI Memory Legalizer
; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: Machine Natural Loop Construction
Expand Down Expand Up @@ -667,6 +668,7 @@
; GCN-O1-OPTS-NEXT: Branch Probability Basic Block Placement
; GCN-O1-OPTS-NEXT: Insert fentry calls
; GCN-O1-OPTS-NEXT: Insert XRay ops
; GCN-O1-OPTS-NEXT: GCN Create VOPD Instructions
; GCN-O1-OPTS-NEXT: SI Memory Legalizer
; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
Expand Down Expand Up @@ -958,6 +960,7 @@
; GCN-O2-NEXT: Branch Probability Basic Block Placement
; GCN-O2-NEXT: Insert fentry calls
; GCN-O2-NEXT: Insert XRay ops
; GCN-O2-NEXT: GCN Create VOPD Instructions
; GCN-O2-NEXT: SI Memory Legalizer
; GCN-O2-NEXT: MachineDominator Tree Construction
; GCN-O2-NEXT: Machine Natural Loop Construction
Expand Down Expand Up @@ -1262,6 +1265,7 @@
; GCN-O3-NEXT: Branch Probability Basic Block Placement
; GCN-O3-NEXT: Insert fentry calls
; GCN-O3-NEXT: Insert XRay ops
; GCN-O3-NEXT: GCN Create VOPD Instructions
; GCN-O3-NEXT: SI Memory Legalizer
; GCN-O3-NEXT: MachineDominator Tree Construction
; GCN-O3-NEXT: Machine Natural Loop Construction
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s

; FUNC-LABEL: {{^}}ds_ordered_add:
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,PREGFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s

declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
Expand All @@ -38,13 +37,12 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
Expand Down
192 changes: 72 additions & 120 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
;
; GFX11-LABEL: load_1d_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: v_mov_b32_e32 v1, v7
; GFX11-NEXT: v_mov_b32_e32 v2, v8
; GFX11-NEXT: v_mov_b32_e32 v3, v9
; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v6, v4, s[8:9]
Expand Down Expand Up @@ -231,17 +228,14 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1
;
; GFX11-LABEL: load_1d_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: v_mov_b32_e32 v1, v7
; GFX11-NEXT: v_mov_b32_e32 v2, v8
; GFX11-NEXT: v_mov_b32_e32 v3, v9
; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v6, v4, s[8:9]
Expand Down Expand Up @@ -377,18 +371,15 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
;
; GFX11-LABEL: load_2d_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v0, v7
; GFX11-NEXT: v_mov_b32_e32 v1, v8
; GFX11-NEXT: v_mov_b32_e32 v2, v9
; GFX11-NEXT: v_mov_b32_e32 v3, v10
; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[8:9]
Expand Down Expand Up @@ -528,19 +519,15 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
;
; GFX11-LABEL: load_3d_tfe_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_mov_b32_e32 v7, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v0, v8
; GFX11-NEXT: v_mov_b32_e32 v1, v9
; GFX11-NEXT: v_mov_b32_e32 v2, v10
; GFX11-NEXT: v_mov_b32_e32 v3, v11
; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
Expand Down Expand Up @@ -680,19 +667,15 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
;
; GFX11-LABEL: load_cube_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_mov_b32_e32 v7, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v0, v8
; GFX11-NEXT: v_mov_b32_e32 v1, v9
; GFX11-NEXT: v_mov_b32_e32 v2, v10
; GFX11-NEXT: v_mov_b32_e32 v3, v11
; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
Expand Down Expand Up @@ -828,18 +811,15 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp
;
; GFX11-LABEL: load_1darray_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v0, v7
; GFX11-NEXT: v_mov_b32_e32 v1, v8
; GFX11-NEXT: v_mov_b32_e32 v2, v9
; GFX11-NEXT: v_mov_b32_e32 v3, v10
; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[8:9]
Expand Down Expand Up @@ -979,19 +959,15 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
;
; GFX11-LABEL: load_2darray_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_mov_b32_e32 v7, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v0, v8
; GFX11-NEXT: v_mov_b32_e32 v1, v9
; GFX11-NEXT: v_mov_b32_e32 v2, v10
; GFX11-NEXT: v_mov_b32_e32 v3, v11
; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
Expand Down Expand Up @@ -1131,19 +1107,15 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
;
; GFX11-LABEL: load_2dmsaa_both:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_mov_b32_e32 v7, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v0, v8
; GFX11-NEXT: v_mov_b32_e32 v1, v9
; GFX11-NEXT: v_mov_b32_e32 v2, v10
; GFX11-NEXT: v_mov_b32_e32 v3, v11
; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
Expand Down Expand Up @@ -1287,20 +1259,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
;
; GFX11-LABEL: load_2darraymsaa_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v9, 0
; GFX11-NEXT: v_mov_b32_e32 v8, v3
; GFX11-NEXT: v_mov_b32_e32 v7, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
; GFX11-NEXT: v_mov_b32_e32 v12, v9
; GFX11-NEXT: v_mov_b32_e32 v13, v9
; GFX11-NEXT: v_mov_b32_e32 v12, v9
; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v0, v9
; GFX11-NEXT: v_mov_b32_e32 v1, v10
; GFX11-NEXT: v_mov_b32_e32 v2, v11
; GFX11-NEXT: v_mov_b32_e32 v3, v12
; GFX11-NEXT: v_mov_b32_e32 v4, v13
; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13
; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v9, v4, s[8:9]
Expand Down Expand Up @@ -1436,18 +1404,15 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa
;
; GFX11-LABEL: load_mip_1d_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v0, v7
; GFX11-NEXT: v_mov_b32_e32 v1, v8
; GFX11-NEXT: v_mov_b32_e32 v2, v9
; GFX11-NEXT: v_mov_b32_e32 v3, v10
; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[8:9]
Expand Down Expand Up @@ -1587,19 +1552,15 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
;
; GFX11-LABEL: load_mip_2d_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_mov_b32_e32 v7, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: v_mov_b32_e32 v0, v8
; GFX11-NEXT: v_mov_b32_e32 v1, v9
; GFX11-NEXT: v_mov_b32_e32 v2, v10
; GFX11-NEXT: v_mov_b32_e32 v3, v11
; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
Expand Down Expand Up @@ -1996,15 +1957,12 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 a
;
; GFX11-LABEL: load_1d_tfe_V4_dmask3:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0
; GFX11-NEXT: v_mov_b32_e32 v7, v5
; GFX11-NEXT: v_mov_b32_e32 v8, v5
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: v_mov_b32_e32 v1, v6
; GFX11-NEXT: v_mov_b32_e32 v2, v7
; GFX11-NEXT: v_mov_b32_e32 v3, v8
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v2, v7
; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v5, v3, s[8:9]
Expand Down Expand Up @@ -2089,13 +2047,11 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 a
;
; GFX11-LABEL: load_1d_tfe_V4_dmask2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: v_mov_b32_e32 v2, v6
; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v4, v2, s[8:9]
Expand Down Expand Up @@ -2174,11 +2130,9 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 a
;
; GFX11-LABEL: load_1d_tfe_V4_dmask1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v3, v1, s[8:9]
Expand Down Expand Up @@ -2257,11 +2211,9 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 a
;
; GFX11-LABEL: load_1d_tfe_V2_dmask1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v3, v1, s[8:9]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
; GFX9-LABEL: sample_1d:
Expand Down
112 changes: 73 additions & 39 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: v_mov_b32_e32 v3, v5
; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
; GFX11-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -199,15 +198,24 @@ define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsr
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<2 x half>, i32} %tex, 0
Expand Down Expand Up @@ -313,20 +321,33 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc,
; GFX9-NEXT: v_mov_b32_e32 v2, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s12, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<3 x half>, i32} %tex, 0
Expand Down Expand Up @@ -438,20 +459,33 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc,
; GFX9-NEXT: v_mov_b32_e32 v2, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s12, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<4 x half>, i32} %tex, 0
Expand Down
134 changes: 81 additions & 53 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -104,17 +104,14 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s14, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: v_mov_b32_e32 v1, v7
; GFX11-NEXT: v_mov_b32_e32 v2, v8
; GFX11-NEXT: v_mov_b32_e32 v3, v9
; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -344,18 +341,30 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_12:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_1d_tfe_adjust_writemask_12:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s12, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX11-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
Expand Down Expand Up @@ -396,18 +405,30 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_24:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo
; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0
; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_1d_tfe_adjust_writemask_24:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s12, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX11-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
Expand Down Expand Up @@ -546,17 +567,14 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s14, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: v_mov_b32_e32 v1, v7
; GFX11-NEXT: v_mov_b32_e32 v2, v8
; GFX11-NEXT: v_mov_b32_e32 v3, v9
; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -1614,8 +1632,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v9, v11
; GFX11-NEXT: v_mov_b32_e32 v10, v12
; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v9
Expand Down Expand Up @@ -1678,17 +1695,28 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
; GFX6789-NEXT: v_mov_b32_e32 v2, v11
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2_tfe:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: v_mov_b32_e32 v9, 0
; GFX10PLUS-NEXT: v_mov_b32_e32 v10, v9
; GFX10PLUS-NEXT: v_mov_b32_e32 v11, v9
; GFX10PLUS-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v9
; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v10
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v11
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: sample_c_d_o_2darray_V2_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, v9
; GFX10-NEXT: v_mov_b32_e32 v11, v9
; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v9
; GFX10-NEXT: v_mov_b32_e32 v1, v10
; GFX10-NEXT: v_mov_b32_e32 v2, v11
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: sample_c_d_o_2darray_V2_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v9, 0
; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v11
; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
; GFX11-NEXT: ; return to shader part epilog
main_body:
%v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s

define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,8 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15
; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
Expand Down
68 changes: 26 additions & 42 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,16 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f
;
; GFX11-LABEL: image_bvh_intersect_ray_a16:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: s_lshr_b32 s2, s7, 16
; GFX11-NEXT: s_lshr_b32 s3, s5, 16
; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_pack_ll_b32_b16 s4, s6, s8
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: v_mov_b32_e32 v4, s2
; GFX11-NEXT: v_mov_b32_e32 v5, s4
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: v_mov_b32_e32 v7, s1
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT: s_mov_b32 s15, s12
; GFX11-NEXT: s_mov_b32 s14, s11
; GFX11-NEXT: s_mov_b32 s13, s10
Expand Down Expand Up @@ -137,20 +134,17 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr,
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_lshr_b32 s0, s8, 16
; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT: s_lshr_b32 s3, s6, 16
; GFX11-NEXT: v_mov_b32_e32 v7, s1
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0
; GFX11-NEXT: s_pack_ll_b32_b16 s1, s6, s8
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9
; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: v_mov_b32_e32 v2, s5
; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: v_mov_b32_e32 v5, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: s_lshr_b32 s0, s8, 16
; GFX11-NEXT: v_mov_b32_e32 v8, s2
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0
; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT: s_mov_b32 s15, s13
; GFX11-NEXT: s_mov_b32 s14, s12
; GFX11-NEXT: s_mov_b32 s13, s11
Expand Down Expand Up @@ -226,11 +220,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 4.0
; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v7, 1.0
; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
Expand Down Expand Up @@ -323,20 +315,19 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -422,14 +413,10 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000
; GFX11-NEXT: v_mov_b32_e32 v4, 4.0
; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_mov_b32_e32 v7, 1.0
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7
; GFX11-NEXT: v_mov_b32_e32 v10, 0x102
; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
Expand Down Expand Up @@ -513,12 +500,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX11-NEXT: v_mov_b32_e32 v7, 0x102
; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102
; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s

; GFX10PLUS-LABEL: {{^}}dpp8_test:
; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@ define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
Expand All @@ -28,9 +27,8 @@ define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
Expand Down
36 changes: 12 additions & 24 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@ define amdgpu_kernel void @test_get_doorbell(i32 addrspace(1)* %out) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
Expand All @@ -18,9 +17,8 @@ define amdgpu_kernel void @test_get_doorbell(i32 addrspace(1)* %out) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
Expand All @@ -34,9 +32,8 @@ define amdgpu_kernel void @test_get_ddid(i32 addrspace(1)* %out) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
Expand All @@ -45,9 +42,8 @@ define amdgpu_kernel void @test_get_ddid(i32 addrspace(1)* %out) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
Expand All @@ -63,8 +59,7 @@ define amdgpu_kernel void @test_get_tma(i64 addrspace(1)* %out) {
; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand All @@ -80,8 +75,7 @@ define amdgpu_kernel void @test_get_realtime(i64 addrspace(1)* %out) {
; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand All @@ -95,9 +89,8 @@ define amdgpu_kernel void @test_savewave(i32 addrspace(1)* %out) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
Expand All @@ -106,9 +99,8 @@ define amdgpu_kernel void @test_savewave(i32 addrspace(1)* %out) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
Expand All @@ -124,8 +116,7 @@ define amdgpu_kernel void @test_get_tba(i64 addrspace(1)* %out) {
; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand All @@ -139,9 +130,8 @@ define amdgpu_kernel void @test_get_0_i32(i32 addrspace(1)* %out) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
Expand All @@ -150,9 +140,8 @@ define amdgpu_kernel void @test_get_0_i32(i32 addrspace(1)* %out) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
Expand All @@ -168,8 +157,7 @@ define amdgpu_kernel void @test_get_99999_i64(i64 addrspace(1)* %out) {
; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
47 changes: 19 additions & 28 deletions llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,8 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
Expand Down Expand Up @@ -226,9 +225,8 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
Expand All @@ -251,19 +249,18 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -558,23 +555,21 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_addc_u32 s6, 0, s6
; GFX11-NEXT: s_sub_u32 s9, s4, s2
; GFX11-NEXT: s_subb_u32 s10, s6, 0
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10
; GFX11-NEXT: s_cmp_lt_i32 s1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s10
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_lt_i32 s3, 0
; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_add_i32 s1, s8, s7
; GFX11-NEXT: s_mul_i32 s0, s0, s2
; GFX11-NEXT: s_add_i32 s1, s1, s5
; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s5, s4
Expand Down Expand Up @@ -641,9 +636,8 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
Expand Down Expand Up @@ -696,15 +690,12 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
Expand Down
49 changes: 18 additions & 31 deletions llvm/test/CodeGen/AMDGPU/mad_64_32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -70,8 +69,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -108,8 +106,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -146,8 +143,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -254,8 +250,7 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
; GFX11-NEXT: v_mov_b32_e32 v7, v10
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
Expand Down Expand Up @@ -310,8 +305,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -405,8 +399,7 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
Expand Down Expand Up @@ -494,14 +487,12 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v5, 1, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 8589934591
Expand Down Expand Up @@ -550,10 +541,9 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v4, 1, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -631,9 +621,8 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tmp4 = lshr i64 %arg0, 32
%tmp5 = and i64 %arg0, 4294967295
Expand Down Expand Up @@ -697,14 +686,13 @@ define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_i32 s6, s2, s3
; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3
; GFX11-NEXT: s_add_u32 s2, s6, s4
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -935,9 +923,8 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
Expand Down
30 changes: 1 addition & 29 deletions llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s

Expand All @@ -15,13 +15,6 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: mad_i32_vvv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3]
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
Expand Down Expand Up @@ -51,13 +44,6 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, 42
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: mad_i32_vvc:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 42
%cast = bitcast i32 %add to float
Expand All @@ -76,13 +62,6 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, 0x12d687
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: mad_i32_vvi:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 1234567
%cast = bitcast i32 %add to float
Expand Down Expand Up @@ -168,13 +147,6 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, s[0:1]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: mad_i32_vvs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%cast = bitcast i32 %add to float
Expand Down
960 changes: 320 additions & 640 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll

Large diffs are not rendered by default.

40 changes: 14 additions & 26 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -133,11 +131,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -275,8 +271,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -291,8 +286,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -415,11 +409,9 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -429,11 +421,9 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -565,11 +555,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-LABEL: flat_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
Expand All @@ -581,11 +570,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-LABEL: flat_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
Expand Down
960 changes: 320 additions & 640 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll

Large diffs are not rendered by default.

960 changes: 320 additions & 640 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll

Large diffs are not rendered by default.

58 changes: 20 additions & 38 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -82,12 +80,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -172,8 +168,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -189,8 +184,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -265,11 +259,9 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
Expand All @@ -280,11 +272,9 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
Expand Down Expand Up @@ -363,11 +353,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-LABEL: flat_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
Expand All @@ -380,11 +369,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-LABEL: flat_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
Expand Down Expand Up @@ -463,13 +451,11 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -478,12 +464,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -553,8 +537,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
Expand All @@ -568,8 +551,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
Expand Down
948 changes: 316 additions & 632 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll

Large diffs are not rendered by default.

912 changes: 304 additions & 608 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll

Large diffs are not rendered by default.

498 changes: 166 additions & 332 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll

Large diffs are not rendered by default.

24 changes: 8 additions & 16 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -121,23 +121,21 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-WGP-LABEL: global_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -267,8 +265,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-WGP-LABEL: global_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -279,8 +276,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-CU-LABEL: global_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -406,23 +402,21 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-WGP-LABEL: global_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -547,23 +541,21 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-WGP-LABEL: global_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
Expand Down
Loading