96 changes: 42 additions & 54 deletions llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s

define amdgpu_kernel void @divergent_or3_b32(<3 x i32> addrspace(1)* %arg) {
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -16,20 +16,19 @@ define amdgpu_kernel void @divergent_or3_b32(<3 x i32> addrspace(1)* %arg) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i1 = zext i32 %i to i64
%i2 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 %i1
%i3 = load <3 x i32>, <3 x i32> addrspace(1)* %i2, align 16
%i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
%i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
%i4 = extractelement <3 x i32> %i3, i64 0
%i5 = extractelement <3 x i32> %i3, i64 1
%i6 = extractelement <3 x i32> %i3, i64 2
%i7 = or i32 %i5, %i4
%i8 = or i32 %i7, %i6
%i9 = xor i32 %i8, -1
%i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %i2, i64 0, i64 0
store i32 %i9, i32 addrspace(1)* %i10, align 16
store i32 %i9, ptr addrspace(1) %i2, align 16
ret void
}

define amdgpu_kernel void @divergent_or3_b64(<3 x i64> addrspace(1)* %arg) {
define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -47,20 +46,19 @@ define amdgpu_kernel void @divergent_or3_b64(<3 x i64> addrspace(1)* %arg) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i1 = zext i32 %i to i64
%i2 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 %i1
%i3 = load <3 x i64>, <3 x i64> addrspace(1)* %i2, align 32
%i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
%i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = extractelement <3 x i64> %i3, i64 2
%i7 = or i64 %i5, %i4
%i8 = or i64 %i7, %i6
%i9 = xor i64 %i8, -1
%i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %i2, i64 0, i64 0
store i64 %i9, i64 addrspace(1)* %i10, align 32
store i64 %i9, ptr addrspace(1) %i2, align 32
ret void
}

define amdgpu_kernel void @divergent_and3_b32(<3 x i32> addrspace(1)* %arg) {
define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -76,20 +74,19 @@ define amdgpu_kernel void @divergent_and3_b32(<3 x i32> addrspace(1)* %arg) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i1 = zext i32 %i to i64
%i2 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 %i1
%i3 = load <3 x i32>, <3 x i32> addrspace(1)* %i2, align 16
%i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
%i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
%i4 = extractelement <3 x i32> %i3, i64 0
%i5 = extractelement <3 x i32> %i3, i64 1
%i6 = extractelement <3 x i32> %i3, i64 2
%i7 = and i32 %i5, %i4
%i8 = and i32 %i7, %i6
%i9 = xor i32 %i8, -1
%i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %i2, i64 0, i64 0
store i32 %i9, i32 addrspace(1)* %i10, align 16
store i32 %i9, ptr addrspace(1) %i2, align 16
ret void
}

define amdgpu_kernel void @divergent_and3_b64(<3 x i64> addrspace(1)* %arg) {
define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -110,20 +107,19 @@ define amdgpu_kernel void @divergent_and3_b64(<3 x i64> addrspace(1)* %arg) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i1 = zext i32 %i to i64
%i2 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 %i1
%i3 = load <3 x i64>, <3 x i64> addrspace(1)* %i2, align 32
%i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
%i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = extractelement <3 x i64> %i3, i64 2
%i7 = and i64 %i5, %i4
%i8 = and i64 %i7, %i6
%i9 = xor i64 %i8, -1
%i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %i2, i64 0, i64 0
store i64 %i9, i64 addrspace(1)* %i10, align 32
store i64 %i9, ptr addrspace(1) %i2, align 32
ret void
}

define amdgpu_kernel void @divergent_xor3_b32(<3 x i32> addrspace(1)* %arg) {
define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -138,20 +134,19 @@ define amdgpu_kernel void @divergent_xor3_b32(<3 x i32> addrspace(1)* %arg) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i1 = zext i32 %i to i64
%i2 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 %i1
%i3 = load <3 x i32>, <3 x i32> addrspace(1)* %i2, align 16
%i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
%i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
%i4 = extractelement <3 x i32> %i3, i64 0
%i5 = extractelement <3 x i32> %i3, i64 1
%i6 = extractelement <3 x i32> %i3, i64 2
%i7 = xor i32 %i5, %i4
%i8 = xor i32 %i7, %i6
%i9 = xor i32 %i8, -1
%i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %i2, i64 0, i64 0
store i32 %i9, i32 addrspace(1)* %i10, align 16
store i32 %i9, ptr addrspace(1) %i2, align 16
ret void
}

define amdgpu_kernel void @divergent_xor3_b64(<3 x i64> addrspace(1)* %arg) {
define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -170,20 +165,19 @@ define amdgpu_kernel void @divergent_xor3_b64(<3 x i64> addrspace(1)* %arg) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i1 = zext i32 %i to i64
%i2 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 %i1
%i3 = load <3 x i64>, <3 x i64> addrspace(1)* %i2, align 32
%i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
%i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = extractelement <3 x i64> %i3, i64 2
%i7 = xor i64 %i5, %i4
%i8 = xor i64 %i7, %i6
%i9 = xor i64 %i8, -1
%i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %i2, i64 0, i64 0
store i64 %i9, i64 addrspace(1)* %i10, align 32
store i64 %i9, ptr addrspace(1) %i2, align 32
ret void
}

define amdgpu_kernel void @uniform_or3_b32(<3 x i32> addrspace(1)* %arg) {
define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b32:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -197,19 +191,18 @@ define amdgpu_kernel void @uniform_or3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
bb:
%i3 = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
%i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
%i4 = extractelement <3 x i32> %i3, i64 0
%i5 = extractelement <3 x i32> %i3, i64 1
%i6 = extractelement <3 x i32> %i3, i64 2
%i7 = or i32 %i5, %i4
%i8 = or i32 %i7, %i6
%i9 = xor i32 %i8, -1
%i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 0, i64 0
store i32 %i9, i32 addrspace(1)* %i10, align 16
store i32 %i9, ptr addrspace(1) %arg, align 16
ret void
}

define amdgpu_kernel void @uniform_or3_b64(<3 x i64> addrspace(1)* %arg) {
define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -225,19 +218,18 @@ define amdgpu_kernel void @uniform_or3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT: s_endpgm
bb:
%i3 = load <3 x i64>, <3 x i64> addrspace(1)* %arg, align 32
%i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = extractelement <3 x i64> %i3, i64 2
%i7 = or i64 %i5, %i4
%i8 = or i64 %i7, %i6
%i9 = xor i64 %i8, -1
%i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 0, i64 0
store i64 %i9, i64 addrspace(1)* %i10, align 32
store i64 %i9, ptr addrspace(1) %arg, align 32
ret void
}

define amdgpu_kernel void @uniform_and3_b32(<3 x i32> addrspace(1)* %arg) {
define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b32:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -251,19 +243,18 @@ define amdgpu_kernel void @uniform_and3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
bb:
%i3 = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
%i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
%i4 = extractelement <3 x i32> %i3, i64 0
%i5 = extractelement <3 x i32> %i3, i64 1
%i6 = extractelement <3 x i32> %i3, i64 2
%i7 = and i32 %i5, %i4
%i8 = and i32 %i7, %i6
%i9 = xor i32 %i8, -1
%i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 0, i64 0
store i32 %i9, i32 addrspace(1)* %i10, align 16
store i32 %i9, ptr addrspace(1) %arg, align 16
ret void
}

define amdgpu_kernel void @uniform_and3_b64(<3 x i64> addrspace(1)* %arg) {
define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -279,19 +270,18 @@ define amdgpu_kernel void @uniform_and3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT: s_endpgm
bb:
%i3 = load <3 x i64>, <3 x i64> addrspace(1)* %arg, align 32
%i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = extractelement <3 x i64> %i3, i64 2
%i7 = and i64 %i5, %i4
%i8 = and i64 %i7, %i6
%i9 = xor i64 %i8, -1
%i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 0, i64 0
store i64 %i9, i64 addrspace(1)* %i10, align 32
store i64 %i9, ptr addrspace(1) %arg, align 32
ret void
}

define amdgpu_kernel void @uniform_xor3_b32(<3 x i32> addrspace(1)* %arg) {
define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b32:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -305,19 +295,18 @@ define amdgpu_kernel void @uniform_xor3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
bb:
%i3 = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
%i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
%i4 = extractelement <3 x i32> %i3, i64 0
%i5 = extractelement <3 x i32> %i3, i64 1
%i6 = extractelement <3 x i32> %i3, i64 2
%i7 = xor i32 %i5, %i4
%i8 = xor i32 %i7, %i6
%i9 = xor i32 %i8, -1
%i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 0, i64 0
store i32 %i9, i32 addrspace(1)* %i10, align 16
store i32 %i9, ptr addrspace(1) %arg, align 16
ret void
}

define amdgpu_kernel void @uniform_xor3_b64(<3 x i64> addrspace(1)* %arg) {
define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -333,15 +322,14 @@ define amdgpu_kernel void @uniform_xor3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT: s_endpgm
bb:
%i3 = load <3 x i64>, <3 x i64> addrspace(1)* %arg, align 32
%i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = extractelement <3 x i64> %i3, i64 2
%i7 = xor i64 %i5, %i4
%i8 = xor i64 %i7, %i6
%i9 = xor i64 %i8, -1
%i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 0, i64 0
store i64 %i9, i64 addrspace(1)* %i10, align 32
store i64 %i9, ptr addrspace(1) %arg, align 32
ret void
}

26 changes: 13 additions & 13 deletions llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -37,20 +37,20 @@ define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
; GCN-LABEL: lshl_add_u64_s2v:
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
%a = load i64, i64* undef
%a = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, i64* undef
store i64 %add, ptr undef
ret void
}

define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
; GCN-LABEL: lshl_add_u64_v2s:
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
%v = load i64, i64* undef
%v = load i64, ptr undef
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, i64* undef
store i64 %add, ptr undef
ret void
}

@@ -61,7 +61,7 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
; GCN: s_addc_u32
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, i64* undef
store i64 %add, ptr undef
ret void
}

@@ -75,18 +75,18 @@ define i64 @add_u64_vv(i64 %v, i64 %a) {
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
%a = load i64, i64* undef
%a = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, i64* undef
store i64 %add, ptr undef
ret void
}

define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
%v = load i64, i64* undef
%v = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, i64* undef
store i64 %add, ptr undef
ret void
}

@@ -95,14 +95,14 @@ define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN: s_add_u32
; GCN: s_addc_u32 s1, s1, s3
%add = add i64 %v, %a
store i64 %add, i64* undef
store i64 %add, ptr undef
ret void
}

define i32 @lshl_add_u64_gep(i32 *%p, i64 %a) {
define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
; GCN-LABEL: lshl_add_u64_gep:
; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
%gep = getelementptr inbounds i32, i32* %p, i64 %a
%v = load i32, i32* %gep
%gep = getelementptr inbounds i32, ptr %p, i64 %a
%v = load i32, ptr %gep
ret i32 %v
}
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) {
; GCN-LABEL: zext_shl64_to_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
Expand All @@ -17,11 +17,11 @@ define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i3
%and = and i32 %x, 1073741823
%ext = zext i32 %and to i64
%shl = shl i64 %ext, 2
store i64 %shl, i64 addrspace(1)* %out, align 4
store i64 %shl, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) {
; GCN-LABEL: sext_shl64_to_32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
Expand All @@ -38,11 +38,11 @@ define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i3
%and = and i32 %x, 536870911
%ext = sext i32 %and to i64
%shl = shl i64 %ext, 2
store i64 %shl, i64 addrspace(1)* %out, align 4
store i64 %shl, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) {
; GCN-LABEL: zext_shl64_overflow:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
Expand All @@ -60,11 +60,11 @@ define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out,
%and = and i32 %x, 2147483647
%ext = zext i32 %and to i64
%shl = shl i64 %ext, 2
store i64 %shl, i64 addrspace(1)* %out, align 4
store i64 %shl, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) {
; GCN-LABEL: sext_shl64_overflow:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
Expand All @@ -82,11 +82,11 @@ define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out,
%and = and i32 %x, 2147483647
%ext = sext i32 %and to i64
%shl = shl i64 %ext, 2
store i64 %shl, i64 addrspace(1)* %out, align 4
store i64 %shl, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: mulu24_shl64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
Expand All @@ -104,12 +104,12 @@ bb:
%tmp1 = and i32 %tmp, 6
%mulconv = mul nuw nsw i32 %tmp1, 7
%tmp2 = zext i32 %mulconv to i64
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp2
store i32 0, i32 addrspace(1)* %tmp3, align 4
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp2
store i32 0, ptr addrspace(1) %tmp3, align 4
ret void
}

define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) {
; GCN-LABEL: muli24_shl64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
Expand All @@ -132,14 +132,14 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 ad
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = sext i32 %tmp to i64
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp2
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp2
%tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
%tmp5 = or i32 %tmp4, -8388608
%tmp6 = mul nsw i32 %tmp5, -7
%tmp7 = zext i32 %tmp6 to i64
%tmp8 = shl nuw nsw i64 %tmp7, 3
%tmp9 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp2
store i64 %tmp8, i64 addrspace(1)* %tmp9, align 8
%tmp9 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp2
store i64 %tmp8, ptr addrspace(1) %tmp9, align 8
ret void
}

82 changes: 41 additions & 41 deletions llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -5,7 +5,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_lshr_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -71,11 +71,11 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = lshr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
store <2 x i16> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -152,17 +152,17 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
%a = load <2 x i16>, ptr addrspace(1) %in.gep
%b = load <2 x i16>, ptr addrspace(1) %b_ptr
%result = lshr <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_v_s_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -246,15 +246,15 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = lshr <2 x i16> %vgpr, %sgpr
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_s_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -338,15 +338,15 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = lshr <2 x i16> %sgpr, %vgpr
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -422,15 +422,15 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -502,15 +502,15 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -600,17 +600,17 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
%a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
%b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
%a = load <4 x i16>, ptr addrspace(1) %in.gep
%b = load <4 x i16>, ptr addrspace(1) %b_ptr
%result = lshr <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
store <4 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -690,11 +690,11 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
%result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
store <4 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/nor.ll
@@ -6,11 +6,11 @@
; GCN-LABEL: {{^}}scalar_nor_i32_one_use
; GCN: s_nor_b32
define amdgpu_kernel void @scalar_nor_i32_one_use(
i32 addrspace(1)* %r0, i32 %a, i32 %b) {
ptr addrspace(1) %r0, i32 %a, i32 %b) {
entry:
%or = or i32 %a, %b
%r0.val = xor i32 %or, -1
store i32 %r0.val, i32 addrspace(1)* %r0
store i32 %r0.val, ptr addrspace(1) %r0
ret void
}

@@ -20,24 +20,24 @@ entry:
; GCN: s_not_b32
; GCN: s_add_i32
define amdgpu_kernel void @scalar_nor_i32_mul_use(
i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
ptr addrspace(1) %r0, ptr addrspace(1) %r1, i32 %a, i32 %b) {
entry:
%or = or i32 %a, %b
%r0.val = xor i32 %or, -1
%r1.val = add i32 %or, %a
store i32 %r0.val, i32 addrspace(1)* %r0
store i32 %r1.val, i32 addrspace(1)* %r1
store i32 %r0.val, ptr addrspace(1) %r0
store i32 %r1.val, ptr addrspace(1) %r1
ret void
}

; GCN-LABEL: {{^}}scalar_nor_i64_one_use
; GCN: s_nor_b64
define amdgpu_kernel void @scalar_nor_i64_one_use(
i64 addrspace(1)* %r0, i64 %a, i64 %b) {
ptr addrspace(1) %r0, i64 %a, i64 %b) {
entry:
%or = or i64 %a, %b
%r0.val = xor i64 %or, -1
store i64 %r0.val, i64 addrspace(1)* %r0
store i64 %r0.val, ptr addrspace(1) %r0
ret void
}

@@ -48,13 +48,13 @@ entry:
; GCN: s_add_u32
; GCN: s_addc_u32
define amdgpu_kernel void @scalar_nor_i64_mul_use(
i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
ptr addrspace(1) %r0, ptr addrspace(1) %r1, i64 %a, i64 %b) {
entry:
%or = or i64 %a, %b
%r0.val = xor i64 %or, -1
%r1.val = add i64 %or, %a
store i64 %r0.val, i64 addrspace(1)* %r0
store i64 %r1.val, i64 addrspace(1)* %r1
store i64 %r0.val, ptr addrspace(1) %r0
store i64 %r1.val, ptr addrspace(1) %r1
ret void
}

128 changes: 64 additions & 64 deletions llvm/test/CodeGen/AMDGPU/or.ll
@@ -9,12 +9,12 @@

; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
%a = load <2 x i32>, ptr addrspace(1) %in
%b = load <2 x i32>, ptr addrspace(1) %b_ptr
%result = or <2 x i32> %a, %b
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
store <2 x i32> %result, ptr addrspace(1) %out
ret void
}

@@ -28,37 +28,37 @@ define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addr
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
%a = load <4 x i32>, ptr addrspace(1) %in
%b = load <4 x i32>, ptr addrspace(1) %b_ptr
%result = or <4 x i32> %a, %b
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
store <4 x i32> %result, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}scalar_or_i32:
; SI: s_or_b32
define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
%or = or i32 %a, %b
store i32 %or, i32 addrspace(1)* %out
store i32 %or, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}vector_or_i32:
; SI: v_or_b32_e32 v{{[0-9]}}
define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
%loada = load i32, i32 addrspace(1)* %a
define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) {
%loada = load i32, ptr addrspace(1) %a
%or = or i32 %loada, %b
store i32 %or, i32 addrspace(1)* %out
store i32 %or, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}scalar_or_literal_i32:
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) {
%or = or i32 %a, 99999
store i32 %or, i32 addrspace(1)* %out, align 4
store i32 %or, ptr addrspace(1) %out, align 4
ret void
}

@@ -68,9 +68,9 @@ define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a)
; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = or i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

@@ -82,12 +82,12 @@ define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i3

; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
%or = or i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out

%foo = add i64 %b, 4261135838621753
store volatile i64 %foo, i64 addrspace(1)* undef
store volatile i64 %foo, ptr addrspace(1) undef
ret void
}

@@ -101,21 +101,21 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou
; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
; SI-NOT: or_b32
; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = or i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}scalar_or_inline_imm_multi_use_i64:
; SI-NOT: or_b32
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63
; SI-NOT: or_b32
define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
%or = or i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
%foo = add i64 %b, 63
store volatile i64 %foo, i64 addrspace(1)* undef
store volatile i64 %foo, ptr addrspace(1) undef
ret void
}

@@ -125,27 +125,27 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)*
; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
; SI: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]]
define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = or i64 %a, -8
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}vector_or_literal_i32:
; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
%loada = load i32, i32 addrspace(1)* %a, align 4
define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i32, ptr addrspace(1) %a, align 4
%or = or i32 %loada, 65535
store i32 %or, i32 addrspace(1)* %out, align 4
store i32 %or, ptr addrspace(1) %out, align 4
ret void
}

; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
%loada = load i32, i32 addrspace(1)* %a, align 4
define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i32, ptr addrspace(1) %a, align 4
%or = or i32 %loada, 4
store i32 %or, i32 addrspace(1)* %out, align 4
store i32 %or, ptr addrspace(1) %out, align 4
ret void
}

@@ -154,30 +154,30 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out
; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z

; SI: s_or_b64
define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
%or = or i64 %a, %b
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}vector_or_i64:
; SI: v_or_b32_e32 v{{[0-9]}}
; SI: v_or_b32_e32 v{{[0-9]}}
define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%loadb = load i64, i64 addrspace(1)* %b, align 8
define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%loadb = load i64, ptr addrspace(1) %b, align 8
%or = or i64 %loada, %loadb
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}scalar_vector_or_i64:
; SI: v_or_b32_e32 v{{[0-9]}}
; SI: v_or_b32_e32 v{{[0-9]}}
define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
%loada = load i64, i64 addrspace(1)* %a
define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) {
%loada = load i64, ptr addrspace(1) %a
%or = or i64 %loada, %b
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

@@ -186,10 +186,10 @@ define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addr
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
; SI: s_endpgm
define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, 22470723082367
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

@@ -200,10 +200,10 @@ define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 add
; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0
; SI: buffer_store_dwordx2 v[[[LO_RESULT]]:[[HI_VREG]]]
; SI: s_endpgm
define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, 8
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

@@ -213,10 +213,10 @@ define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspa
; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}}
; SI: buffer_store_dwordx2 v[[[RES_LO]]:[[RES_HI]]]
; SI: s_endpgm
define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, -8
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

@@ -226,10 +226,10 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out,
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]]
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = or i64 %loada, -200
store i64 %or, i64 addrspace(1)* %out
store i64 %or, ptr addrspace(1) %out
ret void
}

@@ -239,25 +239,25 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64
; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
%add = or i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8
store i32 %trunc, ptr addrspace(1) %out, align 8
ret void
}

; FUNC-LABEL: {{^}}or_i1:
; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}

; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc
define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
%a = load float, float addrspace(1)* %in0
%b = load float, float addrspace(1)* %in1
define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
%bcmp = fcmp oge float %b, 0.000000e+00
%or = or i1 %acmp, %bcmp
%result = zext i1 %or to i32
store i32 %result, i32 addrspace(1)* %out
store i32 %result, ptr addrspace(1) %out
ret void
}

@@ -267,10 +267,10 @@ define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in
; SI: s_cmp_eq_u32
; SI: s_cselect_b64 [[C2:[^,]+]], -1, 0
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], [[C1]], [[C2]]
define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
%or = or i1 %cmp0, %cmp1
store i1 %or, i1 addrspace(1)* %out
store i1 %or, ptr addrspace(1) %out
ret void
}
110 changes: 55 additions & 55 deletions llvm/test/CodeGen/AMDGPU/permute.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsh8_or_and:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -19,16 +19,16 @@ define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %ar
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = shl i32 %tmp, 8
%tmp3 = and i32 %arg1, 255
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
store i32 %tmp4, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @lsr24_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsr24_or_and:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -46,16 +46,16 @@ define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %a
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = lshr i32 %tmp, 24
%tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
store i32 %tmp4, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @and_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: and_or_lsr24:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -74,17 +74,17 @@ define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %a
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
%tmp3 = lshr i32 %arg1, 24
%tmp4 = or i32 %tmp2, %tmp3
%tmp5 = xor i32 %tmp4, -2147483648
store i32 %tmp5, i32 addrspace(1)* %gep, align 4
store i32 %tmp5, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @and_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: and_or_and:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -102,16 +102,16 @@ define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = and i32 %tmp, -16711936
%tmp3 = and i32 %arg1, 16711935
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
store i32 %tmp4, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsh8_or_lsr24:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -128,16 +128,16 @@ define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = shl i32 %tmp, 8
%tmp3 = lshr i32 %arg1, 24
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
store i32 %tmp4, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @lsh16_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsh16_or_lsr24:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -155,16 +155,16 @@ define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = shl i32 %tmp, 16
%tmp3 = lshr i32 %arg1, 24
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
store i32 %tmp4, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @and_xor_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: and_xor_and:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -182,17 +182,17 @@ define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %ar
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = and i32 %tmp, -16776961
%tmp3 = and i32 %arg1, 16776960
%tmp4 = xor i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
store i32 %tmp4, ptr addrspace(1) %gep, align 4
ret void
}

; FIXME: this should have been "v_perm_b32" with a 0xffff0500 mask.
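; A hypothetical sketch of the fused pattern the FIXME above asks for
; (illustrative only, not output of update_llc_test_checks.py; the operand
; placeholders are assumptions):
;   v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
;   v_perm_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]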
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @and_or_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: and_or_or_and:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -212,17 +212,17 @@ define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%and = and i32 %tmp, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, -65536
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
store i32 %tmp3, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @and_or_and_shl(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: and_or_and_shl:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -240,17 +240,17 @@ define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = shl i32 %tmp, 16
%tmp3 = and i32 %arg1, 65535
%tmp4 = or i32 %tmp2, %tmp3
%and = and i32 %tmp4, 4278190335
store i32 %and, i32 addrspace(1)* %gep, align 4
store i32 %and, ptr addrspace(1) %gep, align 4
ret void
}

define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @or_and_or(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: or_and_or:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -268,17 +268,17 @@ define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%or1 = or i32 %tmp, 16776960 ; 0x00ffff00
%or2 = or i32 %arg1, 4278190335 ; 0xff0000ff
%and = and i32 %or1, %or2
store i32 %and, i32 addrspace(1)* %gep, align 4
store i32 %and, ptr addrspace(1) %gep, align 4
ret void
}

; FIXME: this should have been "v_perm_b32" with a 0xffff0500 mask.
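; As above, a hypothetical sketch of the missed v_perm_b32 form (illustrative
; only; not autogenerated, and the operand placeholders are assumptions):
;   v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
;   v_perm_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]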
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: known_ffff0500:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -304,21 +304,21 @@ define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%load = load i32, ptr addrspace(1) %gep, align 4
%mask1 = or i32 %arg1, 32768 ; 0x8000
%mask2 = or i32 %load, 4
%and = and i32 %mask2, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
store i32 %tmp3, ptr addrspace(1) %gep, align 4
%v = and i32 %tmp3, 4294934532 ; 0xffff8004
store i32 %v, i32 addrspace(1)* %arg, align 4
store i32 %v, ptr addrspace(1) %arg, align 4
ret void
}

define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @known_050c0c00(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: known_050c0c00:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -341,20 +341,20 @@ define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%tmp = load i32, ptr addrspace(1) %gep, align 4
%tmp2 = shl i32 %tmp, 16
%mask = or i32 %arg1, 4
%tmp3 = and i32 %mask, 65535
%tmp4 = or i32 %tmp2, %tmp3
%and = and i32 %tmp4, 4278190335
store i32 %and, i32 addrspace(1)* %gep, align 4
store i32 %and, ptr addrspace(1) %gep, align 4
%v = and i32 %and, 16776964
store i32 %v, i32 addrspace(1)* %arg, align 4
store i32 %v, ptr addrspace(1) %arg, align 4
ret void
}

define amdgpu_kernel void @known_ffff8004(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
define amdgpu_kernel void @known_ffff8004(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: known_ffff8004:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -378,17 +378,17 @@ define amdgpu_kernel void @known_ffff8004(i32 addrspace(1)* nocapture %arg, i32
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep, align 4
%gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
%load = load i32, ptr addrspace(1) %gep, align 4
%mask1 = or i32 %arg1, 4
%mask2 = or i32 %load, 32768 ; 0x8000
%and = and i32 %mask1, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %mask2, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
store i32 %tmp3, ptr addrspace(1) %gep, align 4
%v = and i32 %tmp3, 4294934532 ; 0xffff8004
store i32 %v, i32 addrspace(1)* %arg, align 4
store i32 %v, ptr addrspace(1) %arg, align 4
ret void
}

50 changes: 25 additions & 25 deletions llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -2,7 +2,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Extract the high bit of the 1st quarter
define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
define amdgpu_kernel void @v_uextract_bit_31_i128(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_i128:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -24,17 +24,17 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
%in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i128, ptr addrspace(1) %out, i32 %id.x
%ld.64 = load i128, ptr addrspace(1) %in.gep
%srl = lshr i128 %ld.64, 31
%bit = and i128 %srl, 1
store i128 %bit, i128 addrspace(1)* %out.gep
store i128 %bit, ptr addrspace(1) %out.gep
ret void
}
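As the file name hints, the point of this lshr-plus-and-with-1 idiom is that only one dword of the i128 matters: bit 31 lives in dword 0, so the whole extract reduces to a 31-bit logical shift of that dword (an unsigned bitfield extract in the general case), with the other three result dwords known to be zero.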

; Extract the high bit of the 2nd quarter
define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
define amdgpu_kernel void @v_uextract_bit_63_i128(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_63_i128:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -55,17 +55,17 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
%in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i128, ptr addrspace(1) %out, i32 %id.x
%ld.64 = load i128, ptr addrspace(1) %in.gep
%srl = lshr i128 %ld.64, 63
%bit = and i128 %srl, 1
store i128 %bit, i128 addrspace(1)* %out.gep
store i128 %bit, ptr addrspace(1) %out.gep
ret void
}

; Extract the high bit of the 3rd quarter
define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
define amdgpu_kernel void @v_uextract_bit_95_i128(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_95_i128:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -88,17 +88,17 @@ define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
%in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i128, ptr addrspace(1) %out, i32 %id.x
%ld.64 = load i128, ptr addrspace(1) %in.gep
%srl = lshr i128 %ld.64, 95
%bit = and i128 %srl, 1
store i128 %bit, i128 addrspace(1)* %out.gep
store i128 %bit, ptr addrspace(1) %out.gep
ret void
}

; Extract the high bit of the 4th quarter
define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
define amdgpu_kernel void @v_uextract_bit_127_i128(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_127_i128:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -119,17 +119,17 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
%in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i128, ptr addrspace(1) %out, i32 %id.x
%ld.64 = load i128, ptr addrspace(1) %in.gep
%srl = lshr i128 %ld.64, 127
%bit = and i128 %srl, 1
store i128 %bit, i128 addrspace(1)* %out.gep
store i128 %bit, ptr addrspace(1) %out.gep
ret void
}

; Spans more than 2 dword boundaries
define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
define amdgpu_kernel void @v_uextract_bit_34_100_i128(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_34_100_i128:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -150,12 +150,12 @@ define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
%in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i128, ptr addrspace(1) %out, i32 %id.x
%ld.64 = load i128, ptr addrspace(1) %in.gep
%srl = lshr i128 %ld.64, 34
%bit = and i128 %srl, 73786976294838206463
store i128 %bit, i128 addrspace(1)* %out.gep
store i128 %bit, ptr addrspace(1) %out.gep
ret void
}
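The mask is 2^66 - 1 = 73786976294838206463, so the lshr/and pair keeps bits 34 through 99 of the loaded value: a 66-bit field that crosses the dword boundaries at bits 64 and 96, which is what the comment above means by spanning more than two of them.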

228 changes: 114 additions & 114 deletions llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -209,7 +209,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = shl i128 %lhs, %rhs
store i128 %shift, i128 addrspace(1)* null
store i128 %shift, ptr addrspace(1) null
ret void
}
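The store to `ptr addrspace(1) null` is only a sink for the result: unlike address space 0, a null pointer in a non-zero address space is not assumed to be invalid by default, so the store is not deleted as undefined behavior and the shift stays live through codegen. The other s_* tests in this file use the same trick.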

@@ -242,7 +242,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = lshr i128 %lhs, %rhs
store i128 %shift, i128 addrspace(1)* null
store i128 %shift, ptr addrspace(1) null
ret void
}

@@ -276,7 +276,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = ashr i128 %lhs, %rhs
store i128 %shift, i128 addrspace(1)* null
store i128 %shift, ptr addrspace(1) null
ret void
}

@@ -497,7 +497,7 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = shl <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
store <2 x i128> %shift, ptr addrspace(1) null
ret void
}

@@ -569,7 +569,7 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = lshr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
store <2 x i128> %shift, ptr addrspace(1) null
ret void
}

@@ -643,7 +643,7 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%shift = ashr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
store <2 x i128> %shift, ptr addrspace(1) null
ret void
}

152 changes: 76 additions & 76 deletions llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
@@ -9,14 +9,14 @@
; CHECK-NOT: v_lshl
; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADD]]
; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]:
define amdgpu_kernel void @add_const_offset(i32 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @add_const_offset(ptr addrspace(1) nocapture %arg) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = add i32 %id, 200
%shl = shl i32 %add, 2
%ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl
%val = load i32, i32 addrspace(1)* %ptr, align 4
store i32 %val, i32 addrspace(1)* %arg, align 4
%ptr = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %shl
%val = load i32, ptr addrspace(1) %ptr, align 4
store i32 %val, ptr addrspace(1) %arg, align 4
ret void
}

@@ -26,14 +26,14 @@ bb:
; CHECK-NOT: v_lshl
; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]]
; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]:
define amdgpu_kernel void @or_const_offset(i32 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @or_const_offset(ptr addrspace(1) nocapture %arg) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = or i32 %id, 256
%shl = shl i32 %add, 2
%ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl
%val = load i32, i32 addrspace(1)* %ptr, align 4
store i32 %val, i32 addrspace(1)* %arg, align 4
%ptr = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %shl
%val = load i32, ptr addrspace(1) %ptr, align 4
store i32 %val, ptr addrspace(1) %arg, align 4
ret void
}
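Both tests exercise the same reassociation, with the constant pulled through the shift so that only the scaled constant lands in the address. Worked out for the cases above: (id + 200) << 2 = (id << 2) + 800, and since a constant distributes through the shift the same way for or, (id | 256) << 2 = (id << 2) | 1024.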

222 changes: 111 additions & 111 deletions llvm/test/CodeGen/AMDGPU/shl.ll

Large diffs are not rendered by default.

82 changes: 41 additions & 41 deletions llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -5,7 +5,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -82,11 +82,11 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = shl <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
store <2 x i16> %result, ptr addrspace(1) %out
ret void
}
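Going by the test names, the rest of this file walks the operand-class matrix for packed 16-bit shifts: the s_/v_ prefixes and _s_/_v_/_imm_ infixes cover scalar and vector values fed by vector, scalar, and immediate shift amounts, plus <4 x i16> widenings of the same patterns.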

define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -163,17 +163,17 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
%a = load <2 x i16>, ptr addrspace(1) %in.gep
%b = load <2 x i16>, ptr addrspace(1) %b_ptr
%result = shl <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -257,15 +257,15 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = shl <2 x i16> %vgpr, %sgpr
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -349,15 +349,15 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = shl <2 x i16> %sgpr, %vgpr
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -434,15 +434,15 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = shl <2 x i16> <i16 8, i16 8>, %vgpr
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -515,15 +515,15 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
%result = shl <2 x i16> %vgpr, <i16 8, i16 8>
store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
store <2 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -613,17 +613,17 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
%a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
%b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
%a = load <4 x i16>, ptr addrspace(1) %in.gep
%b = load <4 x i16>, ptr addrspace(1) %b_ptr
%result = shl <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
store <4 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -709,11 +709,11 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
%vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
%vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
%result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
store <4 x i16> %result, ptr addrspace(1) %out.gep
ret void
}

34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -9,13 +9,13 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @shl_2_add_9_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%ptr = getelementptr i32, ptr addrspace(1) %in, i32 %tid.x
%val = load i32, ptr addrspace(1) %ptr, align 4
%add = add i32 %val, 9
%result = shl i32 %add, 2
store i32 %result, i32 addrspace(1)* %out, align 4
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
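The 36 in the CHECK line is the add pulled through the shift: (x + 9) << 2 = (x << 2) + 36. The 999 variant further down follows the same arithmetic, (x + 999) << 2 = (x << 2) + 3996, which is the 0xf9c its CHECK line expects.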

@@ -25,14 +25,14 @@ define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace
; SI-DAG: buffer_store_dword [[ADDREG]]
; SI-DAG: buffer_store_dword [[SHLREG]]
; SI: s_endpgm
define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%ptr = getelementptr i32, ptr addrspace(1) %in, i32 %tid.x
%val = load i32, ptr addrspace(1) %ptr, align 4
%add = add i32 %val, 9
%result = shl i32 %add, 2
store i32 %result, i32 addrspace(1)* %out0, align 4
store i32 %add, i32 addrspace(1)* %out1, align 4
store i32 %result, ptr addrspace(1) %out0, align 4
store i32 %add, ptr addrspace(1) %out1, align 4
ret void
}

@@ -43,13 +43,13 @@ define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i
; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @shl_2_add_999_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%ptr = getelementptr i32, ptr addrspace(1) %in, i32 %tid.x
%val = load i32, ptr addrspace(1) %ptr, align 4
%shl = add i32 %val, 999
%result = shl i32 %shl, 2
store i32 %result, i32 addrspace(1)* %out, align 4
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}

@@ -60,11 +60,11 @@ define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspa
; SI: s_addk_i32 [[RESULT]], 0x3d8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
; SI: buffer_store_dword [[VRESULT]]
define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 {
define amdgpu_kernel void @test_add_shl_add_constant(ptr addrspace(1) %out, [8 x i32], i32 %x, i32 %y) #0 {
%add.0 = add i32 %x, 123
%shl = shl i32 %add.0, 3
%add.1 = add i32 %shl, %y
store i32 %add.1, i32 addrspace(1)* %out, align 4
store i32 %add.1, ptr addrspace(1) %out, align 4
ret void
}

@@ -76,11 +76,11 @@ define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
; SI: buffer_store_dword [[VRESULT]]

define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 {
define amdgpu_kernel void @test_add_shl_add_constant_inv(ptr addrspace(1) %out, [8 x i32], i32 %x, i32 %y) #0 {
%add.0 = add i32 %x, 123
%shl = shl i32 %add.0, 3
%add.1 = add i32 %y, %shl
store i32 %add.1, i32 addrspace(1)* %out, align 4
store i32 %add.1, ptr addrspace(1) %out, align 4
ret void
}

244 changes: 122 additions & 122 deletions llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -7,16 +7,16 @@
; GCN: v_add_co_ci_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc_lo, 0, v5, vcc_lo
; GCN: global_atomic_csub v{{[0-9]+}}, v[[[LO]]:[[HI]]], [[K]], off offset:512 glc
; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]]
define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
%cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to i32 addrspace(1)*
%val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %castback, i32 43)
store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
%castback = inttoptr i64 %shl to ptr addrspace(1)
%val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %castback, i32 43)
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret i32 %val
}
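The offset:512 in the CHECK line falls out of the pointer arithmetic: element 32 of an i32 array is byte offset 128, and shifting the pointer-as-integer left by 2 scales that to 128 << 2 = 512, which selection folds into the instruction's immediate offset while the unshifted %cast stays live for the volatile store. The shl_add_ptr_global.ll tests below rely on the same arithmetic.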

declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #0

attributes #0 = { argmemonly nounwind }
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -7,13 +7,13 @@
; GCN-DAG: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
; GCN-DAG: global_atomic_and v[[[LO]]:[[HI]]], [[THREE]], off offset:512
; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]]
define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
%cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to i32 addrspace(1)*
%val = atomicrmw and i32 addrspace(1)* %castback, i32 3 seq_cst
store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
%castback = inttoptr i64 %shl to ptr addrspace(1)
%val = atomicrmw and ptr addrspace(1) %castback, i32 3 seq_cst
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret void
}

@@ -24,17 +24,17 @@ define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
; GCN-DAG: global_atomic_add_f32 v[[[LO]]:[[HI]]], [[K]], off offset:512
; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]]
define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
%cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to float addrspace(1)*
call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %castback, float 100.0)
store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
%castback = inttoptr i64 %shl to ptr addrspace(1)
call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %castback, float 100.0)
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret void
}

declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #1
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #1

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }
102 changes: 51 additions & 51 deletions llvm/test/CodeGen/AMDGPU/sra.ll
@@ -5,7 +5,7 @@

declare i32 @llvm.amdgcn.workitem.id.x() #0

define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -61,15 +61,15 @@ define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
; EG-NEXT: ASHR T0.X, T0.X, T0.Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
%a = load <2 x i32>, ptr addrspace(1) %in
%b = load <2 x i32>, ptr addrspace(1) %b_ptr
%result = ashr <2 x i32> %a, %b
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
store <2 x i32> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -134,17 +134,17 @@ define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
; EG-NEXT: ASHR T0.X, T0.X, T1.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
%a = load <4 x i32>, ptr addrspace(1) %in
%b = load <4 x i32>, ptr addrspace(1) %b_ptr
%result = ashr <4 x i32> %a, %b
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
store <4 x i32> %result, ptr addrspace(1) %out
ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -230,17 +230,17 @@ define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ad
; EG-NEXT: OR_INT T6.X, PS, PV.W,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in
%b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
%b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in, i16 1
%a = load <2 x i16>, ptr addrspace(1) %in
%b = load <2 x i16>, ptr addrspace(1) %b_ptr
%result = ashr <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
store <2 x i16> %result, ptr addrspace(1) %out
ret void
}
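The FIXME reflects divergence-driven selection: results of vector-memory loads live in VGPRs, and the shift inherits that, so it is selected as VALU ops even though every lane computes the same value. Only operands proven uniform, for example ones loaded through scalar memory when -amdgpu-scalarize-global-loads is in effect, can use the scalar shift forms.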

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -393,15 +393,15 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ad
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T7.X, PV.Y,
; EG-NEXT: MOV * T10.X, T6.X,
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1)* %in
%b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
%b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1
%a = load <4 x i16>, ptr addrspace(1) %in
%b = load <4 x i16>, ptr addrspace(1) %b_ptr
%result = ashr <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
store <4 x i16> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
@@ -445,11 +445,11 @@ define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
entry:
%in.ext = sext i32 %in to i64
%ashr = ashr i64 %in.ext, 8
store i64 %ashr, i64 addrspace(1)* %out
store i64 %ashr, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_i64_2:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -511,15 +511,15 @@ define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)*
; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT: CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
%a = load i64, i64 addrspace(1)* %in
%b = load i64, i64 addrspace(1)* %b_ptr
%b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
%a = load i64, ptr addrspace(1) %in
%b = load i64, ptr addrspace(1) %b_ptr
%result = ashr i64 %a, %b
store i64 %result, i64 addrspace(1)* %out
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -594,16 +594,16 @@ define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> ad
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
%a = load <2 x i64>, <2 x i64> addrspace(1)* %in
%b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
%a = load <2 x i64>, ptr addrspace(1) %in
%b = load <2 x i64>, ptr addrspace(1) %b_ptr
%result = ashr <2 x i64> %a, %b
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
store <2 x i64> %result, ptr addrspace(1) %out
ret void
}

; FIXME: Broken on r600
define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -712,15 +712,15 @@ define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> ad
; EG-NEXT: LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT: CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
%a = load <4 x i64>, <4 x i64> addrspace(1)* %in
%b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
%b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
%a = load <4 x i64>, ptr addrspace(1) %in
%b = load <4 x i64>, ptr addrspace(1) %b_ptr
%result = ashr <4 x i64> %a, %b
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
store <4 x i64> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[0:1], 0x14
@@ -770,11 +770,11 @@ define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = ashr i64 %a, 32
%add = add i64 %result, %b
store i64 %add, i64 addrspace(1)* %out
store i64 %add, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_ashr_32_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -831,15 +831,15 @@ define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1
; EG-NEXT: ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep.in
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
%a = load i64, ptr addrspace(1) %gep.in
%result = ashr i64 %a, 32
store i64 %result, i64 addrspace(1)* %gep.out
store i64 %result, ptr addrspace(1) %gep.out
ret void
}
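No 64-bit shifter is needed for this case: for ashr i64 by 32, the low half of the result is just the high dword of the input, and the new high half is that same dword shifted right arithmetically by 31 to replicate the sign, which matches the lone ASHR-by-31 in the EG output above. The by-63 tests below go one step further and broadcast the sign bit into both halves.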

define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[0:1], 0x14
@@ -889,11 +889,11 @@ define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = ashr i64 %a, 63
%add = add i64 %result, %b
store i64 %add, i64 addrspace(1)* %out
store i64 %add, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_ashr_63_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -954,11 +954,11 @@ define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep.in
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
%a = load i64, ptr addrspace(1) %gep.in
%result = ashr i64 %a, 63
store i64 %result, i64 addrspace(1)* %gep.out
store i64 %result, ptr addrspace(1) %gep.out
ret void
}
