diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index f4900fad9f044..a38b6e3263882 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s -define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_add_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 @@ -35,11 +35,11 @@ define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_endpgm entry: %add = add i64 %a, %b - store i64 %add, i64 addrspace(1)* %out + store i64 %add, ptr addrspace(1) %out ret void } -define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GCN-LABEL: v_add_u64: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -50,11 +50,11 @@ define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GCN-NEXT: s_endpgm entry: %add = add i64 %a, %b - store i64 %add, i64 addrspace(1)* %out + store i64 %add, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_sub_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 @@ -87,11 +87,11 @@ define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_endpgm entry: %sub = sub i64 %a, %b - store i64 %sub, i64 addrspace(1)* %out + store i64 %sub, ptr addrspace(1) %out ret void } -define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GCN-LABEL: v_sub_u64: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4 @@ -102,6 +102,6 @@ define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GCN-NEXT: s_endpgm entry: %sub = sub i64 %a, %b - store i64 %sub, i64 addrspace(1)* %out + store i64 %sub, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-divrem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-divrem.mir index faa47312d99ce..245e740ed8100 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-divrem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-divrem.mir @@ -327,8 +327,8 @@ body: | %ptr2:_(p1) = G_IMPLICIT_DEF %ptr3:_(p1) = COPY $vgpr2_vgpr3 %ptr4:_(p1) = COPY $vgpr4_vgpr5 - G_STORE %src1:_(s32), %ptr1:_(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - G_STORE %src2:_(s32), %ptr2:_(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + G_STORE %src1:_(s32), %ptr1:_(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + G_STORE %src2:_(s32), %ptr2:_(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) %div:_(s32) = G_SDIV %src1:_(s32), %src2:_(s32) G_STORE %div:_(s32), %ptr3:_(p1) :: (store (s32), addrspace 1, align 4) %rem:_(s32) = G_SREM %src1:_(s32), %src2:_(s32) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll index ea377531df2ae..b29ae366ca1ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -14,11 +14,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -31,11 +31,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> bb: %fneg.B = fneg <8 x half> %B %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] @@ -48,11 +48,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] @@ -65,11 +65,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> bb: %fabs.C = call <8 x float> 
@llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] @@ -82,11 +82,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] @@ -99,11 +99,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -114,11 +114,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -129,11 +129,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void 
@test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] @@ -144,11 +144,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> bb: %fneg.C = fneg <8 x half> %C %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] @@ -159,11 +159,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -176,11 +176,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -193,11 +193,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: 
test_wmma_f32_16x16x16_bf8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -210,11 +210,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -227,11 +227,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -244,11 +244,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -261,11 +261,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -278,11 +278,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3 
bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -295,11 +295,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -329,11 +329,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha bb: %fneg.B = fneg <16 x half> %B %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -344,11 +344,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, 
<16 x half> %B, <8 x half> %C, i16 %Index) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -359,13 +359,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha bb: %fneg.B = fneg <16 x half> %B %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } ; both neg and abs patterns (wmma matrix C f32 or f16 ) -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -379,11 +379,11 @@ bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %fneg.fabs.C = fneg <8 x float> %fabs.C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -395,11 +395,11 @@ bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %fneg.fabs.C = fneg <8 x half> %fabs.C %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 @@ -417,13 +417,13 @@ bb: %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } ; A or B matrix modifier and 
constant in C -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -436,11 +436,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -451,13 +451,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } ; pack f16 elements with v_perm_b32 since they don't come from same b32 -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_clause 0x1 @@ -485,7 +485,7 @@ bb: %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> %fneg.C_shuffle = fneg <8 x half> %C_shuffle %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll index eafbfb6d1eeb5..e2831afe68e74 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -16,7 +16,7 @@ bb: ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -27,11 +27,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> bb: %fneg.B = fneg <4 x half> %B %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] @@ -42,11 +42,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] @@ -57,11 +57,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] @@ -72,11 +72,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: 
test_wmma_f32_16x16x16_bf16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] @@ -87,11 +87,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -102,11 +102,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -117,11 +117,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] @@ -132,11 +132,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> bb: %fneg.C = fneg <4 x half> %C %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] @@ -147,11 +147,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %res = call <4 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -162,11 +162,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -177,11 +177,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -192,11 +192,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -207,11 +207,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) 
%out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -222,11 +222,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -237,11 +237,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -252,11 +252,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -267,11 +267,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -282,11 +282,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 
x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -297,11 +297,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal bb: %fneg.B = fneg <8 x half> %B %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -327,13 +327,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal bb: %fneg.B = fneg <8 x half> %B %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } ; both neg and abs patterns (wmma matrix C f32 or f16 ) -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -345,11 +345,11 @@ bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %fneg.fabs.C = fneg <4 x float> %fabs.C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + 
store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -361,11 +361,11 @@ bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %fneg.fabs.C = fneg <4 x half> %fabs.C %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 @@ -381,13 +381,13 @@ bb: %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } ; A or B matrix modifier and constant in C -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -398,11 +398,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -413,13 +413,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } ; pack f16 elements with v_perm_b32 since they don't come from same b32 -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half>
%B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] @@ -440,7 +440,7 @@ bb: %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> %fneg.C_shuffle = fneg <4 x half> %C_shuffle %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 2f7190e761102..3c66c83042951 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2574,7 +2574,7 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ret void } -define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) { +define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2637,7 +2637,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out + store <3 x bfloat> %in, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir index af06e12aeaefa..b427b011f5051 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir @@ -15,25 +15,25 @@ --- | - define amdgpu_kernel void @long_branch_dbg_value(float addrspace(1)* nocapture %arg, float %arg1) #1 !dbg !5 { + define amdgpu_kernel void @long_branch_dbg_value(ptr addrspace(1) nocapture %arg, float %arg1) #1 !dbg !5 { bb: - %long_branch_dbg_value.kernarg.segment = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() - %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %long_branch_dbg_value.kernarg.segment, i64 0 - %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to float addrspace(1)* addrspace(4)*, !amdgpu.uniform !2, !amdgpu.noclobber !2 - %arg.load = load float addrspace(1)*, float addrspace(1)* addrspace(4)* %arg.kernarg.offset.cast, align 16, !invariant.load !2 - %arg1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %long_branch_dbg_value.kernarg.segment, i64 8 - %arg1.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg1.kernarg.offset to float addrspace(4)*, !amdgpu.uniform !2, !amdgpu.noclobber !2 - %arg1.load = load float, float addrspace(4)* %arg1.kernarg.offset.cast, align 8, !invariant.load !2 + %long_branch_dbg_value.kernarg.segment = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %arg.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_dbg_value.kernarg.segment, i64 0 + %arg.kernarg.offset.cast = bitcast ptr addrspace(4) %arg.kernarg.offset to ptr addrspace(4), !amdgpu.uniform !2, !amdgpu.noclobber !2 + %arg.load = load ptr addrspace(1), ptr addrspace(4) %arg.kernarg.offset.cast, align 16,
!invariant.load !2 + %arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_dbg_value.kernarg.segment, i64 8 + %arg1.kernarg.offset.cast = bitcast ptr addrspace(4) %arg1.kernarg.offset to ptr addrspace(4), !amdgpu.uniform !2, !amdgpu.noclobber !2 + %arg1.load = load float, ptr addrspace(4) %arg1.kernarg.offset.cast, align 8, !invariant.load !2 %tmp = fmul float %arg1.load, %arg1.load - %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg.load, i64 3 - call void @llvm.dbg.value(metadata float addrspace(1)* %tmp2, metadata !11, metadata !DIExpression()) #5, !dbg !12 - store float %tmp, float addrspace(1)* %tmp2, align 4, !dbg !12 + %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg.load, i64 3 + call void @llvm.dbg.value(metadata ptr addrspace(1) %tmp2, metadata !11, metadata !DIExpression()) #5, !dbg !12 + store float %tmp, ptr addrspace(1) %tmp2, align 4, !dbg !12 %tmp3 = fcmp olt float %tmp, 0x3810000000000000 %tmp3.inv = xor i1 %tmp3, true br i1 %tmp3.inv, label %bb4, label %bb8, !amdgpu.uniform !2 bb4: ; preds = %bb - %tmp5 = load volatile float, float addrspace(1)* undef, align 4 + %tmp5 = load volatile float, ptr addrspace(1) undef, align 4 %tmp6 = fcmp oeq float %tmp5, 0x7FF0000000000000 br i1 %tmp6, label %bb7, label %Flow, !amdgpu.uniform !2 @@ -47,7 +47,7 @@ ret void } - declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2 + declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 declare void @llvm.dbg.value(metadata, metadata, metadata) #0 attributes #0 = { nounwind readnone speculatable willreturn } @@ -103,7 +103,7 @@ body: | renamable $sgpr4_sgpr5 = IMPLICIT_DEF $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5 $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $exec - renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile load (s32) from `float addrspace(1)* undef`, addrspace 1) + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1) renamable $sgpr4 = S_MOV_B32 2139095040 S_WAITCNT 3952 renamable $sgpr4_sgpr5 = nofpexcept V_CMP_NEQ_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 7b0ad8625b45f..e157c69dff366 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; The other use of shuffle0_0 make it profitable to lower into v_perm -define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out1, <4 x i8> addrspace(1)* noalias %in, <4 x i8> addrspace(1)* noalias %in1) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 @@ -1547,14 +1547,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = 
getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid - %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in1, i32 %tid - %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 - %load1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1, align 1 + %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 + %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1 %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x float> %cvt, ptr addrspace(1) %out, align 16 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir index f9ee80e5bdb53..091b29c23d60e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s --- | - define amdgpu_kernel void @single-wave-phase-2b(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17, i32 addrspace(7)* noalias %in18, i32 addrspace(7)* noalias %in19, i32 addrspace(7)* noalias %in20, i32 addrspace(7)* noalias %in21, i32 addrspace(7)* noalias %in22, i32 addrspace(7)* noalias %in23, i32 addrspace(7)* noalias %in24, i32 addrspace(7)* noalias %in25, i32 addrspace(7)* noalias %in26, i32 addrspace(7)* noalias %in27, i32 addrspace(7)* noalias %in28, i32 addrspace(7)* noalias %in29) #0 { ret void } + define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void } !0 = distinct !{!0} !1 = !{!1, !0} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir index 6fed102ed1908..a85478df10eb2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s --- | - define amdgpu_kernel void @single-wave-phase-2c(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17) #0 { ret void } + define amdgpu_kernel void @single-wave-phase-2c(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17) #0 { ret void } !0 = distinct !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll index 06f69d50eed01..f4663e9daccc8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -176,10 +176,10 @@ define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %s ; SI-NOT: v_rsq_f64_e32 ; SI: v_sqrt_f64 ; SI: v_rcp_f64 -define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 { %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) - store double %rcp, double addrspace(1)* %out, align 8 + store double %rcp, ptr addrspace(1) %out, align 8 ret void } @@ -195,10 +195,10 @@ define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)* ; SI: v_fma_f64 ; SI: v_rcp_f64 ; SI: buffer_store_dwordx2 -define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) - store double %rcp, double addrspace(1)* %out, align 8 + store double %rcp, ptr addrspace(1) %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index c7a2cfa32ba25..7b1355425729e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -82,8 +82,8 @@ entry: ; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: - %tmp0 = load <9 x float>, <9 x float> addrspace(1)* 
%in - store <9 x float> %tmp0, <9 x float> addrspace(1)* %out + %tmp0 = load <9 x float>, ptr addrspace(1) %in + store <9 x float> %tmp0, ptr addrspace(1) %out ret void } @@ -101,8 +101,8 @@ entry: ; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: - %tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in - store <10 x float> %tmp0, <10 x float> addrspace(1)* %out + %tmp0 = load <10 x float>, ptr addrspace(1) %in + store <10 x float> %tmp0, ptr addrspace(1) %out ret void } @@ -122,8 +122,8 @@ entry: ; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: - %tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in - store <11 x float> %tmp0, <11 x float> addrspace(1)* %out + %tmp0 = load <11 x float>, ptr addrspace(1) %in + store <11 x float> %tmp0, ptr addrspace(1) %out ret void } @@ -140,8 +140,8 @@ entry: ; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: - %tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in - store <12 x float> %tmp0, <12 x float> addrspace(1)* %out + %tmp0 = load <12 x float>, ptr addrspace(1) %in + store <12 x float> %tmp0, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll index cc7943fd0ba76..a883db1fa61f9 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll @@ -6,13 +6,13 @@ ; Check a constructor that's an alias, and an integer literal. @llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [ - { i32, ptr, ptr } { i32 1, ptr @foo.alias, i8* null }, - { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), i8* null } + { i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null }, + { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null } ] ; Check a constantexpr addrspacecast @llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [ - { i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), i8* null } + { i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null } ] @foo.alias = hidden alias void (), ptr @foo diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir index 8b5c7d75dfc96..18df16988d8e4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir @@ -1,30 +1,30 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s --- | - define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 { + define amdgpu_kernel void @multiple_mem_operands(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 { entry: %scratch0 = alloca [8192 x i32], addrspace(5) %scratch1 = alloca [8192 x i32], addrspace(5) - %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)* - store i32 1, i32 addrspace(5)* %scratchptr01 - %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)* - store i32 2, i32 addrspace(5)* %scratchptr12 + %scratchptr01 = bitcast ptr addrspace(5) 
%scratch0 to ptr addrspace(5) + store i32 1, ptr addrspace(5) %scratchptr01 + %scratchptr12 = bitcast ptr addrspace(5) %scratch1 to ptr addrspace(5) + store i32 2, ptr addrspace(5) %scratchptr12 %cmp = icmp eq i32 %cond, 0 br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 if: ; preds = %entry - %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0 - %if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1 + %if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0 + %if_value = load i32, ptr addrspace(5) %if_ptr, align 4, !nontemporal !1 br label %done, !structurizecfg.uniform !0 else: ; preds = %entry - %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0 - %else_value = load i32, i32 addrspace(5)* %else_ptr, align 4, !nontemporal !1 + %else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0 + %else_value = load i32, ptr addrspace(5) %else_ptr, align 4, !nontemporal !1 br label %done, !structurizecfg.uniform !0 done: ; preds = %else, %if %value = phi i32 [ %if_value, %if ], [ %else_value, %else ] - store i32 %value, i32 addrspace(1)* %out + store i32 %value, ptr addrspace(1) %out ret void } @@ -110,9 +110,9 @@ body: | successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000) liveins: $sgpr0_sgpr1, $sgpr3 - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`) + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 @@ -130,7 +130,7 @@ body: | successors: %bb.3.done(0x80000000) liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 32772, implicit $exec S_BRANCH %bb.3.done @@ -139,7 +139,7 @@ body: | successors: %bb.3.done(0x80000000) liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 4, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir index 46601c928ce73..9cc688dd0c532 100644 
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir @@ -2,30 +2,30 @@ --- | - define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 { + define amdgpu_kernel void @multiple_mem_operands(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 { entry: %scratch0 = alloca [8192 x i32], addrspace(5) %scratch1 = alloca [8192 x i32], addrspace(5) - %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)* - store i32 1, i32 addrspace(5)* %scratchptr01 - %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)* - store i32 2, i32 addrspace(5)* %scratchptr12 + %scratchptr01 = bitcast ptr addrspace(5) %scratch0 to ptr addrspace(5) + store i32 1, ptr addrspace(5) %scratchptr01 + %scratchptr12 = bitcast ptr addrspace(5) %scratch1 to ptr addrspace(5) + store i32 2, ptr addrspace(5) %scratchptr12 %cmp = icmp eq i32 %cond, 0 br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 if: ; preds = %entry - %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0 - %if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1 + %if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0 + %if_value = load i32, ptr addrspace(5) %if_ptr, align 4, !nontemporal !1 br label %done, !structurizecfg.uniform !0 else: ; preds = %entry - %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0 - %else_value = load i32, i32 addrspace(5)* %else_ptr, align 4 + %else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0 + %else_value = load i32, ptr addrspace(5) %else_ptr, align 4 br label %done, !structurizecfg.uniform !0 done: ; preds = %else, %if %value = phi i32 [ %if_value, %if ], [ %else_value, %else ] - store i32 %value, i32 addrspace(1)* %out + store i32 %value, ptr addrspace(1) %out ret void } @@ -90,9 +90,9 @@ body: | successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000) liveins: $sgpr0_sgpr1, $sgpr3 - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`) + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 @@ -110,7 +110,7 @@ body: | successors: %bb.3.done(0x80000000) liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable 
invariant load (s32) from `ptr addrspace(4) undef`) S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 32772, implicit $exec S_BRANCH %bb.3.done @@ -119,7 +119,7 @@ body: | successors: %bb.3.done(0x80000000) liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`) S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 4, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir index f9801a50dfd74..71846f74bbf03 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir @@ -13,8 +13,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2 ... @@ -32,9 +32,9 @@ body: | ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2, implicit %3 ... 
@@ -54,10 +54,10 @@ body: | ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4 ... @@ -78,11 +78,11 @@ body: | ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[FLAT_LOAD_DWORD]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5 ... 
@@ -105,12 +105,12 @@ body: | ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6 ... @@ -126,8 +126,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4) - %2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4) + %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 4) + %2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2 ... @@ -143,8 +143,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `i128* undef`, align 8) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr undef`, align 8) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2 ... 
@@ -160,8 +160,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub1_sub2 ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]] %0:vreg_64_align2 = IMPLICIT_DEF - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 8) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 8) S_NOP 0, implicit %1, implicit %2 ... @@ -176,8 +176,8 @@ body: | ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2 ... @@ -192,8 +192,8 @@ body: | ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2 ... @@ -208,8 +208,8 @@ body: | ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) S_NOP 0, implicit %1, implicit %2 ... 
@@ -224,8 +224,8 @@ body: | ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4) S_NOP 0, implicit %1, implicit %2 ... @@ -243,8 +243,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -264,9 +264,9 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -283,10 +283,10 @@ body: | ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr undef`, align 4) %0:vreg_64_align2 = IMPLICIT_DEF %1:vreg_128 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... 
--- @@ -312,11 +312,11 @@ body: | %3:agpr_32 = IMPLICIT_DEF %4:agpr_32 = IMPLICIT_DEF %5:agpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8) - FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 8) + FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -345,12 +345,12 @@ body: | %4:vgpr_32 = IMPLICIT_DEF %5:vgpr_32 = IMPLICIT_DEF %6:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8) - FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 8) + FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -367,8 +367,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vreg_64_align2 = IMPLICIT_DEF %2:vreg_64_align2 = IMPLICIT_DEF - FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4) - FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4) + FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`, align 4) + FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`, align 4) ... 
--- @@ -385,8 +385,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vreg_96_align2 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i64* undef`, align 16) - FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr undef`, align 16) + FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -403,8 +403,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:agpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -421,8 +421,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... --- @@ -439,8 +439,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 2) + FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 2) ... --- @@ -457,8 +457,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... 
--- @@ -475,6 +475,6 @@ body: | %0:vreg_128_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) - FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4) + FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) + FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4) ... diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir index 3a0c973d12456..5c43cd24a686d 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir @@ -13,8 +13,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 4, basealign 4) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4, basealign 4) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -30,8 +30,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, basealign 8, addrspace 1) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4, basealign 8) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, basealign 8, addrspace 1) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4, basealign 8) S_NOP 0, implicit %1, implicit %2 ... @@ -49,9 +49,9 @@ body: | ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0 ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1) - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 16) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, basealign 16) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 8, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... 
@@ -71,10 +71,10 @@ body: | ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 12, basealign 8, addrspace 1) - %4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 16) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 8, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 12, basealign 8, addrspace 1) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 16) S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4 ... @@ -90,8 +90,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `double* undef`) - %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1) + %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`) + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -107,8 +107,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`) - %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `<3 x i32> addrspace(1)* undef`, addrspace 1) + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`) + %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `ptr addrspace(1) undef`, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -124,8 +124,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) - %2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `<3 x i32>* undef`) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) + %2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr undef`) S_NOP 0, implicit %1, implicit %2 ... 
@@ -144,9 +144,9 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF %1:sreg_64_xexec = IMPLICIT_DEF - %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 4) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, basealign 4) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1) S_NOP 0, implicit %2, implicit %3, implicit %4 ... @@ -165,9 +165,9 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF %1:sreg_64_xexec = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) - %3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4) - %4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 8) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 8) S_NOP 0, implicit %2, implicit %3, implicit %4 ... @@ -184,8 +184,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) - GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`) + GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ... --- @@ -201,8 +201,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`) ... 
--- @@ -218,8 +218,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vreg_64_align2 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) - GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`) + GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, addrspace 1) ... --- @@ -235,8 +235,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vreg_96_align2 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) - GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `<3 x i32> addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`) + GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `ptr addrspace(1) undef`, addrspace 1) ... --- @@ -252,8 +252,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vreg_64_align2 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`) + GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`) ... --- @@ -269,8 +269,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vreg_96_align2 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `<3 x i32>* undef`) + GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr undef`) ... --- @@ -288,8 +288,8 @@ body: | %1:sreg_64_xexec = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) - GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`) + GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ... --- @@ -307,6 +307,6 @@ body: | %1:sreg_64_xexec = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`) ... 
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir index 32d7e4afbaf9d..ffa250f1c75b8 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir @@ -13,8 +13,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -32,9 +32,9 @@ body: | ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2, implicit %3 ... @@ -54,10 +54,10 @@ body: | ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4 ... 
@@ -78,11 +78,11 @@ body: | ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[GLOBAL_LOAD_DWORD]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5 ... @@ -105,12 +105,12 @@ body: | ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2, implicit 
%3, implicit %4, implicit %5, implicit %6 ... @@ -126,8 +126,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) - %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -143,8 +143,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `i128 addrspace(1)* undef`, align 8, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `ptr addrspace(1) undef`, align 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -160,8 +160,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub1_sub2 ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]] %0:vreg_64_align2 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 8, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -176,8 +176,8 @@ body: | ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... 
@@ -192,8 +192,8 @@ body: | ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -208,8 +208,8 @@ body: | ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 3, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -224,8 +224,8 @@ body: | ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -243,8 +243,8 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3 ... 
@@ -264,9 +264,9 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3, implicit %4 ... @@ -288,10 +288,10 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5 ... 
@@ -316,12 +316,12 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 ... @@ -339,8 +339,8 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) - %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3 ... @@ -357,8 +357,8 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vreg_64_align2 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3 ... 
@@ -375,8 +375,8 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] %0:sgpr_128 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3 ... @@ -393,8 +393,8 @@ body: | ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] %0:sreg_64_xexec = IMPLICIT_DEF %1:vreg_64_align2 = IMPLICIT_DEF - %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3 ... @@ -409,8 +409,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub0 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -428,9 +428,9 @@ body: | ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0 ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 16, addrspace 1) - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, align 8, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 16, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, align 8, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... 
@@ -449,8 +449,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -470,9 +470,9 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 4, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, killed %2, 8, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %1, 4, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %2, 8, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -489,10 +489,10 @@ body: | ; GCN-NEXT: GLOBAL_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec :: (store (s128) into `ptr addrspace(1) undef`, align 4, addrspace 1) %0:vreg_64_align2 = IMPLICIT_DEF %1:vreg_128 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... 
--- @@ -518,11 +518,11 @@ body: | %3:agpr_32 = IMPLICIT_DEF %4:agpr_32 = IMPLICIT_DEF %5:agpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, %1, 4, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %2, 8, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 8, addrspace 1) - GLOBAL_STORE_DWORD %0, %3, 12, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %4, 16, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %5, 20, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %1, 4, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %2, 8, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 8, addrspace 1) + GLOBAL_STORE_DWORD %0, %3, 12, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %4, 16, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %5, 20, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -551,12 +551,12 @@ body: | %4:vgpr_32 = IMPLICIT_DEF %5:vgpr_32 = IMPLICIT_DEF %6:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 8, addrspace 1) - GLOBAL_STORE_DWORD %0, %2, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %3, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %4, 16, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %5, 20, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0, %6, 24, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %1, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 8, addrspace 1) + GLOBAL_STORE_DWORD %0, %2, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %3, 12, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %4, 16, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %5, 20, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, %6, 24, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -573,8 +573,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vreg_64_align2 = IMPLICIT_DEF %2:vreg_64_align2 = IMPLICIT_DEF - GLOBAL_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... 
--- @@ -591,8 +591,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vreg_96_align2 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec :: (store (s96) into `i64 addrspace(1)* undef`, align 16, addrspace 1) - GLOBAL_STORE_DWORD %0, killed %2, 16, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec :: (store (s96) into `ptr addrspace(1) undef`, align 16, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %2, 16, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -609,8 +609,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:agpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -627,8 +627,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -645,8 +645,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 2, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 2, addrspace 1) ... --- @@ -663,8 +663,8 @@ body: | %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0, killed %1, 0, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0, killed %1, 0, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... 
--- @@ -681,8 +681,8 @@ body: | %0:vreg_128_align2 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -701,8 +701,8 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -724,9 +724,9 @@ body: | %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF %4:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -751,10 +751,10 @@ body: | %3:vgpr_32 = IMPLICIT_DEF %4:vgpr_32 = IMPLICIT_DEF %5:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... 
--- @@ -785,12 +785,12 @@ body: | %5:vgpr_32 = IMPLICIT_DEF %6:vgpr_32 = IMPLICIT_DEF %7:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -809,8 +809,8 @@ body: | %1:vreg_64_align2 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD %1, %3, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD %1, %3, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... --- @@ -829,8 +829,8 @@ body: | %1:vreg_64_align2 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... 
--- @@ -849,6 +849,6 @@ body: | %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = IMPLICIT_DEF - GLOBAL_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) - GLOBAL_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1) ... diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir index 14420845b3e63..2b3851c348d55 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir @@ -29,10 +29,10 @@ ret void bb2: - %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0 - %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8 - %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16 - %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24 + %tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0 + %tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8 + %tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16 + %tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24 br label %bb1 } @@ -44,10 +44,10 @@ ret void bb2: - %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0 - %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8 - %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16 - %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24 + %tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0 + %tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8 + %tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16 + %tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24 br label %bb1 } @@ -59,10 +59,10 @@ ret void bb2: - %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0 - %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8 - %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16 - %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24 + %tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0 + %tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8 + %tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16 + %tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24 br label %bb1 } --- diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index 5b048e067a99e..fa1ca66ef4156 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -508,7 +508,7 @@ define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 { ; GFX12-NEXT: s_endpgm %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) %div2 = fmul float %fmed3, 2.0 - store float %div2, float addrspace(1)* undef + store float %div2, ptr addrspace(1) undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf-unsupported.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf-unsupported.ll index 
51c03365bfeee..34e81e36aefd8 100644 --- a/llvm/test/CodeGen/AMDGPU/opencl-printf-unsupported.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf-unsupported.ll @@ -54,8 +54,8 @@ define amdgpu_kernel void @poison_interposable_initializer_gv(i32 %n) { define amdgpu_kernel void @not_constant_gv(i32 %n) { entry: %str = alloca [9 x i8], align 1, addrspace(5) - %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0 - %call1 = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @not.constant, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n) + %arraydecay = getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0 + %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) @not.constant, ptr addrspace(5) %arraydecay, i32 %n) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll index a1f0a5da125df..ee5f82f538ef9 100644 --- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll @@ -752,8 +752,8 @@ define amdgpu_kernel void @test_kernel_addrspacecasted_format_str(i32 %n) { ; entry: %str = alloca [9 x i8], align 1, addrspace(5) - %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0 - %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) addrspacecast (i8 addrspace(1)* getelementptr inbounds ([6 x i8], ptr addrspace(1) @str.as1, i32 0, i32 0) to ptr addrspace(4)), ptr addrspace(5) %arraydecay, i32 %n) + %arraydecay = getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0 + %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) addrspacecast (ptr addrspace(1) @str.as1 to ptr addrspace(4)), ptr addrspace(5) %arraydecay, i32 %n) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll index 05ee10598b21a..bd9234d864b3d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -133,7 +133,7 @@ define amdgpu_vs void @promote_memmove_aggr() #0 { store float 1.0, ptr addrspace(5) %foo1 %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3 store float 2.0, ptr addrspace(5) %foo2 - call void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false) + call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false) %foo3 = load float, ptr addrspace(5) %f1 store float %foo3, ptr addrspace(1) @pv ret void @@ -160,7 +160,7 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 { %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 store float 3.0, ptr addrspace(5) %foo5 - call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false) + call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false) %foo6 = load float, ptr addrspace(5) %f1 store float %foo6, ptr addrspace(1) @pv ret void @@ -177,7 +177,7 @@ define amdgpu_vs void @promote_memcpy_identity_aggr() #0 { store float 1.0, ptr addrspace(5) %foo1 %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3 store float 2.0, ptr addrspace(5) %foo2 - call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false) + call void 
@llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false) %foo3 = load float, ptr addrspace(5) %f1 store float %foo3, ptr addrspace(1) @pv ret void @@ -229,7 +229,7 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 { %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 store float 3.0, ptr addrspace(5) %foo5 - call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false) + call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false) %foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4 %foo7 = load float, ptr addrspace(5) %foo6 @@ -266,7 +266,7 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 store float 3.0, ptr addrspace(5) %foo5 - call void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false) + call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false) ret void } @@ -289,16 +289,16 @@ define amdgpu_vs void @promote_memcpy_inline_aggr() #0 { %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 store float 3.0, ptr addrspace(5) %foo5 - call void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false) + call void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false) %foo6 = load float, ptr addrspace(5) %f1 store float %foo6, ptr addrspace(1) @pv ret void } -declare void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) -declare void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) -declare void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) -declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) +declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) +declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) +declare void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) +declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) @tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> } @frag_color = external addrspace(1) global <4 x float> diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index f8c7be8e414ca..c0d199920bd94 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -11,15 +11,15 @@ @sched_dbg_value_crash.tmp6 = internal unnamed_addr addrspace(3) global [256 x [16 x i8]] undef, align 16 - define amdgpu_kernel void @sched_dbg_value_crash(i8 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture readonly %arg1, %struct.widget.0 addrspace(1)* nocapture readonly %arg2, %struct.baz addrspace(1)* nocapture readonly 
%arg3, %struct.snork addrspace(1)* nocapture %arg4) local_unnamed_addr #2 { + define amdgpu_kernel void @sched_dbg_value_crash(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture readonly %arg2, ptr addrspace(1) nocapture readonly %arg3, ptr addrspace(1) nocapture %arg4) local_unnamed_addr #2 { bb: - %0 = getelementptr i32, i32 addrspace(1)* %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %0 = getelementptr i32, ptr addrspace(1) %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3 %tmp5 = alloca %struct.wombat, align 16, addrspace(5) - %1 = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() - %2 = bitcast i8 addrspace(4)* %1 to i32 addrspace(4)* - %3 = getelementptr inbounds i32, i32 addrspace(4)* %2, i64 1 - %4 = bitcast i32 addrspace(4)* %3 to <2 x i32> addrspace(4)*, !amdgpu.uniform !3, !amdgpu.noclobber !3 - %5 = load <2 x i32>, <2 x i32> addrspace(4)* %4, align 4, !invariant.load !3 + %1 = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %3 = getelementptr inbounds i32, ptr addrspace(4) %2, i64 1 + %4 = bitcast ptr addrspace(4) %3 to ptr addrspace(4), !amdgpu.uniform !3, !amdgpu.noclobber !3 + %5 = load <2 x i32>, ptr addrspace(4) %4, align 4, !invariant.load !3 %6 = extractelement <2 x i32> %5, i32 0 %7 = extractelement <2 x i32> %5, i32 1 %8 = lshr i32 %6, 16 @@ -31,69 +31,69 @@ %14 = mul nuw nsw i32 %10, %7 %15 = add i32 %13, %14 %16 = add i32 %15, %11 - %17 = getelementptr inbounds [256 x [16 x i8]], [256 x [16 x i8]] addrspace(3)* @sched_dbg_value_crash.tmp6, i32 0, i32 %16 - %tmp7 = load i64, i64 addrspace(4)* null, align 536870912 + %17 = getelementptr inbounds [256 x [16 x i8]], ptr addrspace(3) @sched_dbg_value_crash.tmp6, i32 0, i32 %16 + %tmp7 = load i64, ptr addrspace(4) null, align 536870912 %tmp8 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !4 %tmp9 = zext i32 %tmp8 to i64 %tmp10 = add i64 %tmp7, %tmp9 %tmp11 = shl i64 %tmp10, 32 %tmp12 = ashr exact i64 %tmp11, 32 - %tmp13 = getelementptr inbounds %struct.widget.0, %struct.widget.0 addrspace(1)* %arg2, i64 %tmp12, i32 1 - %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 - %tmp15 = getelementptr inbounds %struct.baz, %struct.baz addrspace(1)* %arg3, i64 %tmp12, i32 1 - %tmp16 = load <4 x float>, <4 x float> addrspace(1)* %tmp15, align 16 + %tmp13 = getelementptr inbounds %struct.widget.0, ptr addrspace(1) %arg2, i64 %tmp12, i32 1 + %tmp14 = load i32, ptr addrspace(1) %tmp13, align 4 + %tmp15 = getelementptr inbounds %struct.baz, ptr addrspace(1) %arg3, i64 %tmp12, i32 1 + %tmp16 = load <4 x float>, ptr addrspace(1) %tmp15, align 16 %tmp17 = sext i32 %tmp14 to i64 - %tmp18 = load i32, i32 addrspace(1)* %0, align 4 + %tmp18 = load i32, ptr addrspace(1) %0, align 4 %tmp19 = zext i32 %tmp18 to i64 %tmp20 = shl nuw nsw i64 %tmp19, 2 - %tmp21 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp20 - %tmp22 = bitcast i8 addrspace(1)* %tmp21 to %struct.wombat.1 addrspace(1)* - %tmp23 = bitcast %struct.wombat addrspace(5)* %tmp5 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 144, i8 addrspace(5)* nonnull %tmp23) #3 - %tmp24 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(5)* %tmp5, i32 0, i32 6 - %tmp25 = getelementptr i32, i32 addrspace(1)* %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3 - %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4 + %tmp21 = getelementptr 
inbounds i8, ptr addrspace(1) %arg, i64 %tmp20 + %tmp22 = bitcast ptr addrspace(1) %tmp21 to ptr addrspace(1) + %tmp23 = bitcast ptr addrspace(5) %tmp5 to ptr addrspace(5) + call void @llvm.lifetime.start.p5(i64 144, ptr addrspace(5) nonnull %tmp23) #3 + %tmp24 = getelementptr inbounds %struct.wombat, ptr addrspace(5) %tmp5, i32 0, i32 6 + %tmp25 = getelementptr i32, ptr addrspace(1) %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %tmp26 = load i32, ptr addrspace(1) %tmp25, align 4 %tmp27 = zext i32 %tmp26 to i64 %tmp28 = shl nuw nsw i64 %tmp27, 2 - %tmp29 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp28 - %tmp30 = bitcast i8 addrspace(1)* %tmp29 to <2 x float> addrspace(1)* - %tmp31 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 2, i64 0 - %18 = bitcast i32 addrspace(1)* %tmp31 to <3 x i32> addrspace(1)* - %19 = load <3 x i32>, <3 x i32> addrspace(1)* %18, align 4 + %tmp29 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp28 + %tmp30 = bitcast ptr addrspace(1) %tmp29 to ptr addrspace(1) + %tmp31 = getelementptr inbounds %struct.wombat.1, ptr addrspace(1) %tmp22, i64 %tmp17, i32 2, i64 0 + %18 = bitcast ptr addrspace(1) %tmp31 to ptr addrspace(1) + %19 = load <3 x i32>, ptr addrspace(1) %18, align 4 %tmp325 = extractelement <3 x i32> %19, i32 0 %tmp386 = extractelement <3 x i32> %19, i32 1 %tmp447 = extractelement <3 x i32> %19, i32 2 %tmp33 = sext i32 %tmp325 to i64 - %tmp34 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp33 - %tmp35 = load <2 x float>, <2 x float> addrspace(1)* %tmp34, align 8 + %tmp34 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp33 + %tmp35 = load <2 x float>, ptr addrspace(1) %tmp34, align 8 %tmp36 = extractelement <2 x float> %tmp35, i32 1 %tmp39 = sext i32 %tmp386 to i64 - %tmp40 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp39 - %tmp41 = load <2 x float>, <2 x float> addrspace(1)* %tmp40, align 8 + %tmp40 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp39 + %tmp41 = load <2 x float>, ptr addrspace(1) %tmp40, align 8 %tmp42 = extractelement <2 x float> %tmp41, i32 1 %tmp45 = sext i32 %tmp447 to i64 - %tmp46 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp45 - %tmp47 = load <2 x float>, <2 x float> addrspace(1)* %tmp46, align 8 + %tmp46 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp45 + %tmp47 = load <2 x float>, ptr addrspace(1) %tmp46, align 8 %tmp48 = extractelement <2 x float> %tmp47, i32 1 - %tmp49 = getelementptr i32, i32 addrspace(1)* %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3 - %tmp50 = load i32, i32 addrspace(1)* %tmp49, align 4 + %tmp49 = getelementptr i32, ptr addrspace(1) %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %tmp50 = load i32, ptr addrspace(1) %tmp49, align 4 %tmp51 = zext i32 %tmp50 to i64 %tmp52 = shl nuw nsw i64 %tmp51, 2 - %tmp53 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp52 - %tmp54 = bitcast i8 addrspace(1)* %tmp53 to <4 x float> addrspace(1)* - %tmp55 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 0, i64 0 - %20 = bitcast i32 addrspace(1)* %tmp55 to <2 x i32> addrspace(1)* - %21 = load <2 x i32>, <2 x i32> addrspace(1)* %20, align 4 + %tmp53 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp52 + %tmp54 = bitcast ptr addrspace(1) %tmp53 to ptr addrspace(1) + %tmp55 = getelementptr inbounds 
%struct.wombat.1, ptr addrspace(1) %tmp22, i64 %tmp17, i32 0, i64 0 + %20 = bitcast ptr addrspace(1) %tmp55 to ptr addrspace(1) + %21 = load <2 x i32>, ptr addrspace(1) %20, align 4 %tmp568 = extractelement <2 x i32> %21, i32 0 %tmp639 = extractelement <2 x i32> %21, i32 1 %tmp57 = sext i32 %tmp568 to i64 - %tmp58 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp57 - %tmp59 = load <4 x float>, <4 x float> addrspace(1)* %tmp58, align 16 + %tmp58 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp54, i64 %tmp57 + %tmp59 = load <4 x float>, ptr addrspace(1) %tmp58, align 16 %tmp60 = extractelement <4 x float> %tmp59, i32 0 %tmp61 = extractelement <4 x float> %tmp59, i32 1 %tmp64 = sext i32 %tmp639 to i64 - %tmp65 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp64 - %tmp66 = load <4 x float>, <4 x float> addrspace(1)* %tmp65, align 16 + %tmp65 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp54, i64 %tmp64 + %tmp66 = load <4 x float>, ptr addrspace(1) %tmp65, align 16 %tmp67 = extractelement <4 x float> %tmp16, i64 0 %tmp69 = fsub fast float -0.000000e+00, %tmp67 %tmp70 = fmul float %tmp67, 0.000000e+00 @@ -103,7 +103,7 @@ %tmp74 = insertelement <4 x float> undef, float %tmp69, i32 0 %tmp75 = insertelement <4 x float> %tmp74, float %tmp71, i32 1 %tmp76 = insertelement <4 x float> %tmp75, float %tmp73, i32 2 - store <4 x float> %tmp76, <4 x float> addrspace(5)* %tmp24, align 16 + store <4 x float> %tmp76, ptr addrspace(5) %tmp24, align 16 %tmp77 = fsub float undef, %tmp60 %tmp78 = fsub float undef, %tmp61 %tmp79 = extractelement <4 x float> %tmp66, i32 2 @@ -128,27 +128,27 @@ %fadd = fadd <2 x float> %fmul, undef %extractelement = extractelement <2 x float> %fadd, i64 1 %tmp96 = fsub float %extractelement, %tmp95 - %tmp97 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(5)* %tmp5, i32 0, i32 8, i32 1 - call void @func(float %tmp96, i64 0, i16 addrspace(5)* nonnull %tmp97) #3 - %tmp984 = bitcast [16 x i8] addrspace(3)* %17 to i8 addrspace(3)* - %tmp99 = getelementptr inbounds %struct.snork, %struct.snork addrspace(1)* %arg4, i64 %tmp12, i32 8, i32 1, i64 0 - call void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* %tmp99, i8 addrspace(3)* %tmp984, i64 16, i32 16, i1 false) - call void @llvm.lifetime.end.p5i8(i64 144, i8 addrspace(5)* nonnull %tmp23) #3 + %tmp97 = getelementptr inbounds %struct.wombat, ptr addrspace(5) %tmp5, i32 0, i32 8, i32 1 + call void @func(float %tmp96, i64 0, ptr addrspace(5) nonnull %tmp97) #3 + %tmp984 = bitcast ptr addrspace(3) %17 to ptr addrspace(3) + %tmp99 = getelementptr inbounds %struct.snork, ptr addrspace(1) %arg4, i64 %tmp12, i32 8, i32 1, i64 0 + call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) %tmp99, ptr addrspace(3) %tmp984, i64 16, i32 16, i1 false) + call void @llvm.lifetime.end.p5(i64 144, ptr addrspace(5) nonnull %tmp23) #3 ret void } - declare void @func(float, i64, i16 addrspace(5)*) - declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0 + declare void @func(float, i64, ptr addrspace(5)) + declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0 declare float @llvm.fmuladd.f32(float, float, float) #1 - declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0 + declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 declare void @llvm.dbg.value(metadata, metadata,
metadata) #1 - declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1 declare i32 @llvm.amdgcn.workitem.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.z() #1 - declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i32, i1) #0 - declare void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i64, i32, i1) #0 + declare void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i32, i1) #0 + declare void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i32, i1) #0 attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone speculatable } @@ -203,9 +203,9 @@ body: | %2:vgpr_32 = COPY $vgpr2 %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 - %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) - %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) - %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) + %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) + %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) + %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 24, 0 %9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 32, 0 %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir index 12eb3241f77e2..63ab75b140fdc 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -26,7 +26,7 @@ source_filename = "sdwa-scalar-ops.opt.ll" target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) { + define amdgpu_kernel void @sdwa_imm_operand(ptr addrspace(1) nocapture %arg) { bb: br label %bb2 @@ -35,29 +35,29 @@ bb2: ; preds = %bb2, %bb %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ] - %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)* - %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv - %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)* - %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4 + %bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1) + %uglygep4 = getelementptr i8, ptr addrspace(1) %bc, i64 %lsr.iv + %uglygep45 = bitcast ptr addrspace(1) %uglygep4 to ptr addrspace(1) + %tmp5 = load i32, ptr addrspace(1) %uglygep45, align 4 %tmp6 = lshr i32 %tmp5, 8 %tmp7 = and i32 %tmp6, 255 %tmp8 = zext i32 %tmp7 to i64 - %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8 - store i32 1, i32 addrspace(1)* %tmp9, align 4 - %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1 - %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4 + %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 
%tmp8 + store i32 1, ptr addrspace(1) %tmp9, align 4 + %scevgep = getelementptr i32, ptr addrspace(1) %uglygep45, i64 1 + %tmp13 = load i32, ptr addrspace(1) %scevgep, align 4 %tmp14 = lshr i32 %tmp13, 8 %tmp15 = and i32 %tmp14, 255 %tmp16 = zext i32 %tmp15 to i64 - %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16 - store i32 1, i32 addrspace(1)* %tmp17, align 4 + %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16 + store i32 1, ptr addrspace(1) %tmp17, align 4 %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8 %tmp1 = trunc i64 %lsr.iv.next to i32 %tmp19 = icmp eq i32 %tmp1, 4096 br i1 %tmp19, label %bb1, label %bb2 } - define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) { + define amdgpu_kernel void @sdwa_sgpr_operand(ptr addrspace(1) nocapture %arg) { bb: br label %bb2 @@ -66,22 +66,22 @@ bb2: ; preds = %bb2, %bb %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ] - %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)* - %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv - %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)* - %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4 + %bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1) + %uglygep4 = getelementptr i8, ptr addrspace(1) %bc, i64 %lsr.iv + %uglygep45 = bitcast ptr addrspace(1) %uglygep4 to ptr addrspace(1) + %tmp5 = load i32, ptr addrspace(1) %uglygep45, align 4 %tmp6 = lshr i32 %tmp5, 8 %tmp7 = and i32 %tmp6, 255 %tmp8 = zext i32 %tmp7 to i64 - %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8 - store i32 1, i32 addrspace(1)* %tmp9, align 4 - %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1 - %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4 + %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp8 + store i32 1, ptr addrspace(1) %tmp9, align 4 + %scevgep = getelementptr i32, ptr addrspace(1) %uglygep45, i64 1 + %tmp13 = load i32, ptr addrspace(1) %scevgep, align 4 %tmp14 = lshr i32 %tmp13, 8 %tmp15 = and i32 %tmp14, 255 %tmp16 = zext i32 %tmp15 to i64 - %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16 - store i32 1, i32 addrspace(1)* %tmp17, align 4 + %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16 + store i32 1, ptr addrspace(1) %tmp17, align 4 %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8 %tmp1 = trunc i64 %lsr.iv.next to i32 %tmp19 = icmp eq i32 %tmp1, 4096 @@ -203,7 +203,7 @@ body: | liveins: $sgpr4_sgpr5 %4 = COPY $sgpr4_sgpr5 - %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) + %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) %8 = S_MOV_B64 0 %7 = COPY %9 %30 = V_MOV_B32_e32 1, implicit $exec @@ -365,7 +365,7 @@ body: | liveins: $sgpr4_sgpr5 %4 = COPY $sgpr4_sgpr5 - %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`) + %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`) %8 = S_MOV_B64 0 %7 = COPY %9 %30 = V_MOV_B32_e32 1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll index bbb97902905c7..5f662ac088a35 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll +++ 
b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -14,11 +14,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -31,11 +31,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> bb: %fneg.B = fneg <8 x half> %B %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] @@ -48,11 +48,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] @@ -65,11 +65,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void 
@test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] @@ -82,11 +82,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] @@ -99,11 +99,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -114,11 +114,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -129,11 +129,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: 
v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] @@ -144,11 +144,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> bb: %fneg.C = fneg <8 x half> %C %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] @@ -159,11 +159,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -176,11 +176,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -193,11 +193,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -210,11 +210,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -227,11 +227,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -244,11 +244,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -261,11 +261,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] @@ -278,11 +278,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3 bb: %fneg.C = fneg <8 x float> %C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void 
@test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] @@ -295,11 +295,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3 bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -329,11 +329,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha bb: %fneg.B = fneg <16 x half> %B %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -344,11 +344,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha bb: %fneg.A = fneg <8 x half> %A %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* 
%out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -359,13 +359,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha bb: %fneg.B = fneg <16 x half> %B %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } ; both neg and abs patterns (wmma matrix C f32 or f16 ) -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -379,11 +379,11 @@ bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) %fneg.fabs.C = fneg <8 x float> %fabs.C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -395,11 +395,11 @@ bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) %fneg.fabs.C = fneg <8 x half> %fabs.C %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 @@ -417,13 +417,13 @@ bb: %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } ; A or B matrix modifier and constant in C -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; 
GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -436,11 +436,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, bb: %fneg.A = fneg <8 x half> %A %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) - store <8 x float> %res, <8 x float> addrspace(1)* %out + store <8 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -451,13 +451,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, bb: %fneg.B = fneg <8 x half> %B %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } ; pack f16 elements with v_perm_b32 since they don't come from same b32 -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_clause 0x1 @@ -480,7 +480,7 @@ bb: %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> %fneg.C_shuffle = fneg <8 x half> %C_shuffle %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) - store <8 x half> %res, <8 x half> addrspace(1)* %out + store <8 x half> %res, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll index a62c3c0643362..1e82e74d92c4e 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -16,7 +16,7 @@ bb: ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL:
test_wmma_f32_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -27,11 +27,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> bb: %fneg.B = fneg <4 x half> %B %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] @@ -42,11 +42,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] @@ -57,11 +57,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] @@ -72,11 +72,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] @@ -87,11 +87,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -102,11 +102,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -117,11 +117,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] @@ -132,11 +132,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> bb: %fneg.C = fneg <4 x half> %C %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] @@ -147,11 +147,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* 
%out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -162,11 +162,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -177,11 +177,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -192,11 +192,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -207,11 +207,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -222,11 +222,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -237,11 +237,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] @@ -252,11 +252,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x bb: %fneg.C = fneg <4 x float> %C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] @@ -267,11 +267,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -282,11 +282,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) 
{ +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -297,11 +297,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal bb: %fneg.B = fneg <8 x half> %B %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -327,13 +327,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal bb: %fneg.B = fneg <8 x half> %B %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } ; both neg and abs patterns (wmma matrix C f32 or f16 ) -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -345,11 +345,11 @@ bb: %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) %fneg.fabs.C = fneg <4 x float> %fabs.C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; 
GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] @@ -361,11 +361,11 @@ bb: %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) %fneg.fabs.C = fneg <4 x half> %fabs.C %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 @@ -381,13 +381,13 @@ bb: %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } ; A or B matrix modifier and constant in C -define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] @@ -398,11 +398,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) - store <4 x float> %res, <4 x float> addrspace(1)* %out + store <4 x float> %res, ptr addrspace(1) %out ret void } -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] @@ -413,13 +413,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, bb: %fneg.B = fneg <4 x half> %B %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void } ; pack f16 elements with v_perm_b32 since they don't come from same b32 -define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) { +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) { ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5] @@ -437,7 +437,7 @@ bb:
%C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> %fneg.C_shuffle = fneg <4 x half> %C_shuffle %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0) - store <4 x half> %res, <4 x half> addrspace(1)* %out + store <4 x half> %res, ptr addrspace(1) %out ret void }
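; Illustrative sketch, not part of the patch above: the mechanical typed-pointer
; to opaque-pointer rewrite these tests underwent, in miniature. The value names
; (%arg, %off, %p, %v) are hypothetical. With typed pointers, reinterpreting the
; pointee type required explicit bitcasts:
;
;   %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
;   %gep = getelementptr i8, i8 addrspace(1)* %bc, i64 %off
;   %p = bitcast i8 addrspace(1)* %gep to i32 addrspace(1)*
;   %v = load i32, i32 addrspace(1)* %p, align 4
;
; Under opaque pointers the element type is carried only by the access, so the
; same sequence becomes the following; a mechanical conversion keeps the casts
; as no-op ptr-to-ptr bitcasts, which is why sdwa_sgpr_operand above still
; contains a `bitcast ptr addrspace(1) ... to ptr addrspace(1)`:
;
;   %bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1)
;   %gep = getelementptr i8, ptr addrspace(1) %bc, i64 %off
;   %p = bitcast ptr addrspace(1) %gep to ptr addrspace(1)
;   %v = load i32, ptr addrspace(1) %p, align 4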