87 changes: 63 additions & 24 deletions llvm/test/CodeGen/AArch64/scmp.ll
Original file line number Diff line number Diff line change
@@ -1,26 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: scmp.8.8:
; CHECK: // %bb.0:
; CHECK-NEXT: sxtb w8, w0
; CHECK-NEXT: cmp w8, w1, sxtb
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: csinv w0, w8, wzr, ge
; CHECK-NEXT: ret
; CHECK-SD-LABEL: scmp.8.8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sxtb w8, w0
; CHECK-SD-NEXT: cmp w8, w1, sxtb
; CHECK-SD-NEXT: cset w8, gt
; CHECK-SD-NEXT: csinv w0, w8, wzr, ge
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: scmp.8.8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sxtb w8, w0
; CHECK-GI-NEXT: sxtb w9, w1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: cset w8, gt
; CHECK-GI-NEXT: csinv w0, w8, wzr, ge
; CHECK-GI-NEXT: ret
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
ret i8 %1
}

define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
; CHECK-LABEL: scmp.8.16:
; CHECK: // %bb.0:
; CHECK-NEXT: sxth w8, w0
; CHECK-NEXT: cmp w8, w1, sxth
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: csinv w0, w8, wzr, ge
; CHECK-NEXT: ret
; CHECK-SD-LABEL: scmp.8.16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sxth w8, w0
; CHECK-SD-NEXT: cmp w8, w1, sxth
; CHECK-SD-NEXT: cset w8, gt
; CHECK-SD-NEXT: csinv w0, w8, wzr, ge
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: scmp.8.16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sxth w8, w0
; CHECK-GI-NEXT: sxth w9, w1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: cset w8, gt
; CHECK-GI-NEXT: csinv w0, w8, wzr, ge
; CHECK-GI-NEXT: ret
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
ret i8 %1
}
Expand Down Expand Up @@ -48,15 +67,35 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
}

define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
; CHECK-LABEL: scmp.8.128:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x2, x0
; CHECK-NEXT: sbcs xzr, x3, x1
; CHECK-NEXT: cset w8, lt
; CHECK-NEXT: cmp x0, x2
; CHECK-NEXT: sbcs xzr, x1, x3
; CHECK-NEXT: csinv w0, w8, wzr, ge
; CHECK-NEXT: ret
; CHECK-SD-LABEL: scmp.8.128:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmp x2, x0
; CHECK-SD-NEXT: sbcs xzr, x3, x1
; CHECK-SD-NEXT: cset w8, lt
; CHECK-SD-NEXT: cmp x0, x2
; CHECK-SD-NEXT: sbcs xzr, x1, x3
; CHECK-SD-NEXT: csinv w0, w8, wzr, ge
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: scmp.8.128:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: cset w8, gt
; CHECK-GI-NEXT: cmp x0, x2
; CHECK-GI-NEXT: cset w9, hi
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: csel w8, w9, w8, eq
; CHECK-GI-NEXT: tst w8, #0x1
; CHECK-GI-NEXT: cset w8, ne
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: cset w9, lt
; CHECK-GI-NEXT: cmp x0, x2
; CHECK-GI-NEXT: cset w10, lo
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: csel w9, w10, w9, eq
; CHECK-GI-NEXT: tst w9, #0x1
; CHECK-GI-NEXT: csinv w0, w8, wzr, eq
; CHECK-GI-NEXT: ret
%1 = call i8 @llvm.scmp(i128 %x, i128 %y)
ret i8 %1
}
Expand Down
134 changes: 98 additions & 36 deletions llvm/test/CodeGen/AArch64/ucmp.ll
Original file line number Diff line number Diff line change
@@ -1,26 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: ucmp.8.8:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xff
; CHECK-NEXT: cmp w8, w1, uxtb
; CHECK-NEXT: cset w8, hi
; CHECK-NEXT: csinv w0, w8, wzr, hs
; CHECK-NEXT: ret
; CHECK-SD-LABEL: ucmp.8.8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and w8, w0, #0xff
; CHECK-SD-NEXT: cmp w8, w1, uxtb
; CHECK-SD-NEXT: cset w8, hi
; CHECK-SD-NEXT: csinv w0, w8, wzr, hs
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ucmp.8.8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: and w8, w0, #0xff
; CHECK-GI-NEXT: and w9, w1, #0xff
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: cset w8, hi
; CHECK-GI-NEXT: csinv w0, w8, wzr, hs
; CHECK-GI-NEXT: ret
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
ret i8 %1
}

define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
; CHECK-LABEL: ucmp.8.16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xffff
; CHECK-NEXT: cmp w8, w1, uxth
; CHECK-NEXT: cset w8, hi
; CHECK-NEXT: csinv w0, w8, wzr, hs
; CHECK-NEXT: ret
; CHECK-SD-LABEL: ucmp.8.16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and w8, w0, #0xffff
; CHECK-SD-NEXT: cmp w8, w1, uxth
; CHECK-SD-NEXT: cset w8, hi
; CHECK-SD-NEXT: csinv w0, w8, wzr, hs
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ucmp.8.16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: and w8, w0, #0xffff
; CHECK-GI-NEXT: and w9, w1, #0xffff
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: cset w8, hi
; CHECK-GI-NEXT: csinv w0, w8, wzr, hs
; CHECK-GI-NEXT: ret
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
ret i8 %1
}
Expand Down Expand Up @@ -48,15 +67,35 @@ define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
}

define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
; CHECK-LABEL: ucmp.8.128:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x2, x0
; CHECK-NEXT: sbcs xzr, x3, x1
; CHECK-NEXT: cset w8, lo
; CHECK-NEXT: cmp x0, x2
; CHECK-NEXT: sbcs xzr, x1, x3
; CHECK-NEXT: csinv w0, w8, wzr, hs
; CHECK-NEXT: ret
; CHECK-SD-LABEL: ucmp.8.128:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmp x2, x0
; CHECK-SD-NEXT: sbcs xzr, x3, x1
; CHECK-SD-NEXT: cset w8, lo
; CHECK-SD-NEXT: cmp x0, x2
; CHECK-SD-NEXT: sbcs xzr, x1, x3
; CHECK-SD-NEXT: csinv w0, w8, wzr, hs
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ucmp.8.128:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: cset w8, hi
; CHECK-GI-NEXT: cmp x0, x2
; CHECK-GI-NEXT: cset w9, hi
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: csel w8, w9, w8, eq
; CHECK-GI-NEXT: tst w8, #0x1
; CHECK-GI-NEXT: cset w8, ne
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: cset w9, lo
; CHECK-GI-NEXT: cmp x0, x2
; CHECK-GI-NEXT: cset w10, lo
; CHECK-GI-NEXT: cmp x1, x3
; CHECK-GI-NEXT: csel w9, w10, w9, eq
; CHECK-GI-NEXT: tst w9, #0x1
; CHECK-GI-NEXT: csinv w0, w8, wzr, eq
; CHECK-GI-NEXT: ret
%1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
ret i8 %1
}
Expand Down Expand Up @@ -95,18 +134,41 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
}

define <1 x i64> @ucmp.1.64.65(<1 x i65> %x, <1 x i65> %y) {
; CHECK-LABEL: ucmp.1.64.65:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x1, #0x1
; CHECK-NEXT: and x9, x3, #0x1
; CHECK-NEXT: cmp x2, x0
; CHECK-NEXT: sbcs xzr, x9, x8
; CHECK-NEXT: cset x10, lo
; CHECK-NEXT: cmp x0, x2
; CHECK-NEXT: sbcs xzr, x8, x9
; CHECK-NEXT: csinv x8, x10, xzr, hs
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
; CHECK-SD-LABEL: ucmp.1.64.65:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and x8, x1, #0x1
; CHECK-SD-NEXT: and x9, x3, #0x1
; CHECK-SD-NEXT: cmp x2, x0
; CHECK-SD-NEXT: sbcs xzr, x9, x8
; CHECK-SD-NEXT: cset x10, lo
; CHECK-SD-NEXT: cmp x0, x2
; CHECK-SD-NEXT: sbcs xzr, x8, x9
; CHECK-SD-NEXT: csinv x8, x10, xzr, hs
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ucmp.1.64.65:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: and x8, x1, #0x1
; CHECK-GI-NEXT: and x9, x3, #0x1
; CHECK-GI-NEXT: cmp x8, x9
; CHECK-GI-NEXT: cset w10, hi
; CHECK-GI-NEXT: cmp x0, x2
; CHECK-GI-NEXT: cset w11, hi
; CHECK-GI-NEXT: cmp x8, x9
; CHECK-GI-NEXT: csel w10, w11, w10, eq
; CHECK-GI-NEXT: tst w10, #0x1
; CHECK-GI-NEXT: cset x10, ne
; CHECK-GI-NEXT: cmp x8, x9
; CHECK-GI-NEXT: cset w11, lo
; CHECK-GI-NEXT: cmp x0, x2
; CHECK-GI-NEXT: cset w12, lo
; CHECK-GI-NEXT: cmp x8, x9
; CHECK-GI-NEXT: csel w8, w12, w11, eq
; CHECK-GI-NEXT: tst w8, #0x1
; CHECK-GI-NEXT: csinv x8, x10, xzr, eq
; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: ret
%1 = call <1 x i64> @llvm.ucmp(<1 x i65> %x, <1 x i65> %y)
ret <1 x i64> %1
}
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
Original file line number Diff line number Diff line change
Expand Up @@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
;
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
Expand All @@ -681,7 +681,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
Expand Down Expand Up @@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
Expand All @@ -789,7 +789,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
Expand Down
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: dpp_test:
Expand Down Expand Up @@ -176,16 +176,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
;
; GFX10-LABEL: update_dppv2i32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppv2i32_test:
Expand Down Expand Up @@ -232,16 +232,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
;
; GFX10-LABEL: update_dppv2f32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppv2f32_test:
Expand Down
152 changes: 76 additions & 76 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -692,148 +692,148 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
;
; GFX9-LABEL: sdivrem_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s8, s6, 31
; GFX9-NEXT: s_add_i32 s6, s6, s8
; GFX9-NEXT: s_xor_b32 s6, s6, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_ashr_i32 s9, s7, 31
; GFX9-NEXT: s_add_i32 s7, s7, s9
; GFX9-NEXT: s_xor_b32 s7, s7, s9
; GFX9-NEXT: s_ashr_i32 s0, s14, 31
; GFX9-NEXT: s_add_i32 s1, s14, s0
; GFX9-NEXT: s_xor_b32 s1, s1, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
; GFX9-NEXT: s_ashr_i32 s2, s15, 31
; GFX9-NEXT: s_add_i32 s3, s15, s2
; GFX9-NEXT: s_xor_b32 s3, s3, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_sub_i32 s12, 0, s6
; GFX9-NEXT: s_ashr_i32 s10, s4, 31
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_sub_i32 s6, 0, s1
; GFX9-NEXT: s_ashr_i32 s4, s12, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s4, s4, s10
; GFX9-NEXT: s_xor_b32 s4, s4, s10
; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0
; GFX9-NEXT: s_sub_i32 s7, 0, s3
; GFX9-NEXT: s_ashr_i32 s5, s13, 31
; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: s_sub_i32 s12, 0, s7
; GFX9-NEXT: s_add_i32 s6, s12, s4
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: s_ashr_i32 s11, s5, 31
; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
; GFX9-NEXT: s_add_i32 s5, s5, s11
; GFX9-NEXT: s_xor_b32 s6, s6, s4
; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1
; GFX9-NEXT: s_add_i32 s7, s13, s5
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
; GFX9-NEXT: s_xor_b32 s5, s5, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
; GFX9-NEXT: s_xor_b32 s7, s7, s5
; GFX9-NEXT: s_xor_b32 s0, s4, s0
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1
; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: s_xor_b32 s4, s10, s8
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
; GFX9-NEXT: s_xor_b32 s4, s11, s9
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
; GFX9-NEXT: s_xor_b32 s0, s5, s2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2
; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1
; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s1, s10, 31
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
; GFX10-NEXT: s_add_i32 s0, s10, s1
; GFX10-NEXT: s_add_i32 s3, s11, s2
; GFX10-NEXT: s_xor_b32 s10, s0, s1
; GFX10-NEXT: s_ashr_i32 s1, s14, 31
; GFX10-NEXT: s_ashr_i32 s2, s15, 31
; GFX10-NEXT: s_add_i32 s0, s14, s1
; GFX10-NEXT: s_add_i32 s3, s15, s2
; GFX10-NEXT: s_xor_b32 s4, s0, s1
; GFX10-NEXT: s_xor_b32 s3, s3, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX10-NEXT: s_sub_i32 s0, 0, s10
; GFX10-NEXT: s_sub_i32 s11, 0, s3
; GFX10-NEXT: s_ashr_i32 s12, s9, 31
; GFX10-NEXT: s_sub_i32 s0, 0, s4
; GFX10-NEXT: s_sub_i32 s5, 0, s3
; GFX10-NEXT: s_ashr_i32 s6, s13, 31
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: s_add_i32 s7, s13, s6
; GFX10-NEXT: s_xor_b32 s7, s7, s6
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
; GFX10-NEXT: s_ashr_i32 s11, s8, 31
; GFX10-NEXT: s_add_i32 s0, s8, s11
; GFX10-NEXT: s_add_i32 s8, s9, s12
; GFX10-NEXT: s_xor_b32 s0, s0, s11
; GFX10-NEXT: s_xor_b32 s8, s8, s12
; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1
; GFX10-NEXT: s_ashr_i32 s5, s12, 31
; GFX10-NEXT: s_add_i32 s0, s12, s5
; GFX10-NEXT: s_xor_b32 s1, s5, s1
; GFX10-NEXT: s_xor_b32 s0, s0, s5
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: s_xor_b32 s1, s11, s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: s_xor_b32 s0, s12, s2
; GFX10-NEXT: s_xor_b32 s0, s6, s2
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = sdiv <2 x i32> %x, %y
store <2 x i32> %div, ptr addrspace(1) %out0
Expand Down
84 changes: 42 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
;
; GFX9-LABEL: udivrem_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX9-NEXT: s_sub_i32 s0, 0, s10
; GFX9-NEXT: s_sub_i32 s1, 0, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX9-NEXT: s_sub_i32 s0, 0, s14
; GFX9-NEXT: s_sub_i32 s1, 0, s15
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
Expand All @@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX10-NEXT: s_sub_i32 s0, 0, s10
; GFX10-NEXT: s_sub_i32 s1, 0, s11
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX10-NEXT: s_sub_i32 s0, 0, s14
; GFX10-NEXT: s_sub_i32 s1, 0, s15
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
Expand All @@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7]
; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9]
; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv <2 x i32> %x, %y
store <2 x i32> %div, ptr addrspace(1) %out0
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -245,21 +245,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
;
; GFX9-LABEL: s_test_add_v2i16_kernarg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_pk_add_u16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_pk_add_u16 v1, s6, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_v2i16_kernarg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_pk_add_u16 v1, s6, s7
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_v2i16_kernarg:
Expand Down
408 changes: 204 additions & 204 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/build_vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
;
; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s3, s3, 16
; GFX940-NEXT: s_lshl_b32 s2, s2, 16
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_lshl_b32 s0, s7, 16
; GFX940-NEXT: s_lshl_b32 s1, s6, 16
; GFX940-NEXT: v_mov_b32_e32 v0, s1
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
Expand Down
88 changes: 44 additions & 44 deletions llvm/test/CodeGen/AMDGPU/cluster_stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -49,37 +49,37 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
;
; GFX10-LABEL: cluster_load_cluster_store:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s4, s0, 8
; GFX10-NEXT: s_addc_u32 s5, s1, 0
; GFX10-NEXT: s_add_u32 s6, s0, 16
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_addc_u32 s7, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_add_u32 s0, s0, 24
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_mov_b32_e32 v4, s6
; GFX10-NEXT: v_mov_b32_e32 v5, s7
; GFX10-NEXT: s_add_u32 s0, s4, 8
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: s_add_u32 s2, s4, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: s_addc_u32 s3, s5, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s4, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: flat_load_dword v8, v[0:1]
; GFX10-NEXT: flat_load_dword v9, v[2:3]
; GFX10-NEXT: flat_load_dword v10, v[4:5]
; GFX10-NEXT: flat_load_dword v11, v[6:7]
; GFX10-NEXT: s_add_u32 s0, s2, 8
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_add_u32 s0, s6, 8
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s2, 16
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: s_add_u32 s2, s2, 24
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_add_u32 s0, s6, 16
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: s_add_u32 s2, s6, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: s_addc_u32 s3, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_mov_b32_e32 v7, s3
Expand Down Expand Up @@ -175,39 +175,39 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
;
; GFX10-LABEL: cluster_load_valu_cluster_store:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s4, s0, 8
; GFX10-NEXT: s_addc_u32 s5, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_add_u32 s6, s0, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_addc_u32 s7, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_add_u32 s0, s0, 24
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v4, s6
; GFX10-NEXT: v_mov_b32_e32 v5, s7
; GFX10-NEXT: s_add_u32 s0, s4, 8
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: s_add_u32 s2, s4, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_addc_u32 s3, s5, 0
; GFX10-NEXT: s_add_u32 s0, s4, 24
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: flat_load_dword v6, v[2:3]
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: flat_load_dword v8, v[0:1]
; GFX10-NEXT: flat_load_dword v9, v[4:5]
; GFX10-NEXT: flat_load_dword v10, v[2:3]
; GFX10-NEXT: s_add_u32 s0, s2, 8
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: s_add_u32 s4, s2, 16
; GFX10-NEXT: s_add_u32 s0, s6, 8
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: s_add_u32 s2, s6, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: s_addc_u32 s5, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_addc_u32 s3, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s2, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: s_add_u32 s0, s6, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6
; GFX10-NEXT: v_mov_b32_e32 v7, s1
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
;
; GFX9-LABEL: sub_zext_setcc_commute:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
; GFX9-NEXT: global_load_dword v3, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v0, s6, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
;
; GFX9-LABEL: sub_sext_setcc_commute:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
; GFX9-NEXT: global_load_dword v3, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v0, s6, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
180 changes: 90 additions & 90 deletions llvm/test/CodeGen/AMDGPU/ctlz.ll

Large diffs are not rendered by default.

158 changes: 79 additions & 79 deletions llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Large diffs are not rendered by default.

184 changes: 92 additions & 92 deletions llvm/test/CodeGen/AMDGPU/cttz.ll

Large diffs are not rendered by default.

126 changes: 63 additions & 63 deletions llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Large diffs are not rendered by default.

294 changes: 146 additions & 148 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
;
; GFX9-LABEL: uniform_vec_i16_LH:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_LH:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[4:5]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LH:
Expand Down Expand Up @@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
;
; GFX9-LABEL: uniform_vec_i16_HH:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_HH:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[4:5]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_HH:
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1842,21 +1842,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
;
; GFX9-LABEL: s_copysign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: s_lshr_b32 s1, s7, 16
; GFX9-NEXT: s_lshr_b32 s2, s6, 16
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_copysign_v2f16:
Expand Down
86 changes: 43 additions & 43 deletions llvm/test/CodeGen/AMDGPU/fdiv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
;
; GFX10-LABEL: s_fdiv_f32_ninf:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
Expand All @@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_ninf:
Expand Down Expand Up @@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
;
; GFX10-LABEL: s_fdiv_f32_ieee:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_ieee:
Expand Down Expand Up @@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
;
; GFX10-LABEL: s_fdiv_25ulp_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1
; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_25ulp_f32:
Expand Down Expand Up @@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
;
; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3
; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7
; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1
; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
Expand Down Expand Up @@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_fast_ieee_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_fast_ieee_f32:
Expand Down Expand Up @@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_f32_fast_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_fast_math:
Expand Down Expand Up @@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
;
; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
Expand Down Expand Up @@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_f32_arcp_daz:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
Expand All @@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_arcp_daz:
Expand Down Expand Up @@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/flat_atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
;
; GCN3-LABEL: atomic_cmpxchg_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
;
; GCN3-LABEL: atomic_cmpxchg_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down
56 changes: 28 additions & 28 deletions llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3883,21 +3883,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -4085,21 +4085,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -5026,21 +5026,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v2, s2, v3
; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -6820,21 +6820,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down
180 changes: 90 additions & 90 deletions llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -271,12 +271,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, s6
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, s10
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
Expand Down Expand Up @@ -657,14 +657,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
Expand Down Expand Up @@ -736,14 +736,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
;
; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
Expand Down
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -248,12 +248,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp
;
; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, s6
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, s10
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
Expand Down Expand Up @@ -600,14 +600,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
;
; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
Expand Down Expand Up @@ -665,14 +665,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
;
; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -452,13 +452,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s11
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v2, v[0:1]
; GFX10-NEXT: s_endpgm
Expand Down Expand Up @@ -506,13 +506,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: v_mov_b32_e32 v2, s6
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s7
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: v_mov_b32_e32 v2, s10
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s11
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_endpgm
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -452,13 +452,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
;
; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s11
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v2, v[0:1]
; GFX10-NEXT: s_endpgm
Expand Down Expand Up @@ -506,13 +506,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
;
; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: v_mov_b32_e32 v2, s6
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s7
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: v_mov_b32_e32 v2, s10
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s11
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_endpgm
Expand Down
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/AMDGPU/fshl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
Expand All @@ -157,11 +157,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX10-LABEL: fshl_i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_i32_imm:
Expand Down Expand Up @@ -732,15 +732,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
;
; GFX9-LABEL: orxor2or1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s4, s2, 7
; GFX9-NEXT: s_or_b32 s4, s3, s4
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_lshl_b32 s0, s6, 7
; GFX9-NEXT: s_or_b32 s0, s7, s0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: orxor2or1:
Expand All @@ -759,15 +759,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
;
; GFX10-LABEL: orxor2or1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl_b32 s4, s2, 7
; GFX10-NEXT: s_or_b32 s4, s3, s4
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_lshl_b32 s0, s6, 7
; GFX10-NEXT: s_or_b32 s0, s7, s0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cselect_b32 s0, s6, s7
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: orxor2or1:
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/fshr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX9-LABEL: fshr_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 7
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_i32_imm:
Expand All @@ -149,11 +149,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX10-LABEL: fshr_i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshr_i32_imm:
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/global_atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
;
; GFX9-LABEL: atomic_cmpxchg_i32_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
;
; GFX9-LABEL: atomic_cmpxchg_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
Expand Down
96 changes: 48 additions & 48 deletions llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
;
; GFX9-LABEL: atomic_max_i32_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s3, 31
; GFX9-NEXT: s_mov_b32 s4, s3
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s1, s7, 31
; GFX9-NEXT: s_mov_b32 s0, s7
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_i32_e32 v0, s2, v1
; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
;
; GFX9-LABEL: atomic_max_i32_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s3, 31
; GFX9-NEXT: s_mov_b32 s4, s3
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s1, s7, 31
; GFX9-NEXT: s_mov_b32 s0, s7
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_i32_e32 v0, s2, v1
; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB93_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
;
; GFX9-LABEL: atomic_umax_i32_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s3, 31
; GFX9-NEXT: s_mov_b32 s4, s3
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s1, s7, 31
; GFX9-NEXT: s_mov_b32 s0, s7
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_u32_e32 v0, s2, v1
; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB105_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
;
; GFX9-LABEL: atomic_min_i32_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s3, 31
; GFX9-NEXT: s_mov_b32 s4, s3
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s1, s7, 31
; GFX9-NEXT: s_mov_b32 s0, s7
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_min_i32_e32 v0, s2, v1
; GFX9-NEXT: v_min_i32_e32 v0, s6, v1
; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
Expand Down
161 changes: 81 additions & 80 deletions llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
Original file line number Diff line number Diff line change
Expand Up @@ -438,121 +438,121 @@ entry:
define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-LABEL: udiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s4, 0, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX9-NEXT: s_sub_i32 s0, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT: s_add_i32 s5, s5, s4
; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
; GFX9-NEXT: s_mul_i32 s5, s4, s3
; GFX9-NEXT: s_sub_i32 s2, s2, s5
; GFX9-NEXT: s_add_i32 s6, s4, 1
; GFX9-NEXT: s_sub_i32 s5, s2, s3
; GFX9-NEXT: s_cmp_ge_u32 s2, s3
; GFX9-NEXT: s_cselect_b32 s4, s6, s4
; GFX9-NEXT: s_cselect_b32 s2, s5, s2
; GFX9-NEXT: s_add_i32 s5, s4, 1
; GFX9-NEXT: s_cmp_ge_u32 s2, s3
; GFX9-NEXT: s_cselect_b32 s2, s5, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1
; GFX9-NEXT: s_mul_i32 s1, s0, s7
; GFX9-NEXT: s_sub_i32 s1, s6, s1
; GFX9-NEXT: s_add_i32 s2, s0, 1
; GFX9-NEXT: s_sub_i32 s3, s1, s7
; GFX9-NEXT: s_cmp_ge_u32 s1, s7
; GFX9-NEXT: s_cselect_b32 s0, s2, s0
; GFX9-NEXT: s_cselect_b32 s1, s3, s1
; GFX9-NEXT: s_add_i32 s2, s0, 1
; GFX9-NEXT: s_cmp_ge_u32 s1, s7
; GFX9-NEXT: s_cselect_b32 s0, s2, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX90A-LABEL: udiv_i32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX90A-NEXT: s_sub_i32 s4, 0, s3
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX90A-NEXT: s_sub_i32 s0, 0, s7
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX90A-NEXT: v_readfirstlane_b32 s5, v0
; GFX90A-NEXT: s_mul_i32 s4, s4, s5
; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX90A-NEXT: s_add_i32 s5, s5, s4
; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5
; GFX90A-NEXT: s_mul_i32 s5, s4, s3
; GFX90A-NEXT: s_sub_i32 s2, s2, s5
; GFX90A-NEXT: s_add_i32 s6, s4, 1
; GFX90A-NEXT: s_sub_i32 s5, s2, s3
; GFX90A-NEXT: s_cmp_ge_u32 s2, s3
; GFX90A-NEXT: s_cselect_b32 s4, s6, s4
; GFX90A-NEXT: s_cselect_b32 s2, s5, s2
; GFX90A-NEXT: s_add_i32 s5, s4, 1
; GFX90A-NEXT: s_cmp_ge_u32 s2, s3
; GFX90A-NEXT: s_cselect_b32 s2, s5, s4
; GFX90A-NEXT: v_mov_b32_e32 v0, s2
; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-NEXT: v_readfirstlane_b32 s1, v0
; GFX90A-NEXT: s_mul_i32 s0, s0, s1
; GFX90A-NEXT: s_mul_hi_u32 s0, s1, s0
; GFX90A-NEXT: s_add_i32 s1, s1, s0
; GFX90A-NEXT: s_mul_hi_u32 s0, s6, s1
; GFX90A-NEXT: s_mul_i32 s1, s0, s7
; GFX90A-NEXT: s_sub_i32 s1, s6, s1
; GFX90A-NEXT: s_add_i32 s2, s0, 1
; GFX90A-NEXT: s_sub_i32 s3, s1, s7
; GFX90A-NEXT: s_cmp_ge_u32 s1, s7
; GFX90A-NEXT: s_cselect_b32 s0, s2, s0
; GFX90A-NEXT: s_cselect_b32 s1, s3, s1
; GFX90A-NEXT: s_add_i32 s2, s0, 1
; GFX90A-NEXT: s_cmp_ge_u32 s1, s7
; GFX90A-NEXT: s_cselect_b32 s0, s2, s0
; GFX90A-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NEXT: global_store_dword v1, v0, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: udiv_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX10-NEXT: s_sub_i32 s5, 0, s3
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX10-NEXT: s_sub_i32 s1, 0, s7
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mul_i32 s5, s5, s4
; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5
; GFX10-NEXT: s_add_i32 s4, s4, s5
; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4
; GFX10-NEXT: s_mul_i32 s5, s4, s3
; GFX10-NEXT: s_sub_i32 s2, s2, s5
; GFX10-NEXT: s_add_i32 s5, s4, 1
; GFX10-NEXT: s_sub_i32 s6, s2, s3
; GFX10-NEXT: s_cmp_ge_u32 s2, s3
; GFX10-NEXT: s_cselect_b32 s4, s5, s4
; GFX10-NEXT: s_cselect_b32 s2, s6, s2
; GFX10-NEXT: s_add_i32 s5, s4, 1
; GFX10-NEXT: s_cmp_ge_u32 s2, s3
; GFX10-NEXT: s_cselect_b32 s2, s5, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_mul_i32 s1, s1, s0
; GFX10-NEXT: s_mul_hi_u32 s1, s0, s1
; GFX10-NEXT: s_add_i32 s0, s0, s1
; GFX10-NEXT: s_mul_hi_u32 s0, s6, s0
; GFX10-NEXT: s_mul_i32 s1, s0, s7
; GFX10-NEXT: s_add_i32 s2, s0, 1
; GFX10-NEXT: s_sub_i32 s1, s6, s1
; GFX10-NEXT: s_sub_i32 s3, s1, s7
; GFX10-NEXT: s_cmp_ge_u32 s1, s7
; GFX10-NEXT: s_cselect_b32 s0, s2, s0
; GFX10-NEXT: s_cselect_b32 s1, s3, s1
; GFX10-NEXT: s_add_i32 s2, s0, 1
; GFX10-NEXT: s_cmp_ge_u32 s1, s7
; GFX10-NEXT: s_cselect_b32 s0, s2, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX9-FLATSCR-LABEL: udiv_i32:
; GFX9-FLATSCR: ; %bb.0:
; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3
; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX9-FLATSCR-NEXT: s_sub_i32 s0, 0, s7
; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5
; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4
; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5
; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3
; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5
; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1
; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3
; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3
; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4
; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2
; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1
; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3
; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2
; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, s1
; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s1, s0
; GFX9-FLATSCR-NEXT: s_add_i32 s1, s1, s0
; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s6, s1
; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s0, s7
; GFX9-FLATSCR-NEXT: s_sub_i32 s1, s6, s1
; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1
; GFX9-FLATSCR-NEXT: s_sub_i32 s3, s1, s7
; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7
; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0
; GFX9-FLATSCR-NEXT: s_cselect_b32 s1, s3, s1
; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1
; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7
; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1660,3 +1660,4 @@ entry:
%bc = bitcast <2 x i32> %r.1 to <2 x float>
ret <2 x float> %bc
}

14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
;
; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s6, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
;
; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
; SDAG-GFX10: ; %bb.0:
; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3|
; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7|
; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
Expand All @@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
;
; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3|
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7|
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%temp = call float @llvm.fabs.f32(float %a)
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
Expand All @@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
;
; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
; SDAG-GFX10: ; %bb.0:
; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3|
; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7|
; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
Expand All @@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
;
; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3|
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7|
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%temp = call float @llvm.fabs.f32(float %a)
%src_input = call float @llvm.fabs.f32(float %src)
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
;
; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0|
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
Expand Down Expand Up @@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
;
; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)

; GCN-LABEL: {{^}}global_atomic_csub_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
main_body:
%ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
Expand All @@ -15,7 +15,7 @@ main_body:

; GCN-LABEL: {{^}}global_atomic_csub_no_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1]
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
main_body:
%ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
Expand All @@ -24,7 +24,7 @@ main_body:

; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 th:TH_ATOMIC_RETURN
define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
main_body:
%p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
Expand All @@ -34,7 +34,7 @@ main_body:

; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
main_body:
%p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
;
; GFX10-LABEL: v_icmp_i1_ne0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_gt_u32 s2, 1
; GFX10-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-NEXT: s_cmp_gt_u32 s3, 2
; GFX10-NEXT: s_cselect_b32 s3, -1, 0
; GFX10-NEXT: s_and_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_cmp_gt_u32 s6, 1
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_cmp_gt_u32 s7, 2
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: s_and_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%c0 = icmp ugt i32 %a, 1
%c1 = icmp ugt i32 %b, 2
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
;
; GFX9-LABEL: v_icmp_i1_ne0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_gt_u32 s2, 1
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_cmp_gt_u32 s3, 2
; GFX9-NEXT: s_cmp_gt_u32 s6, 1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_gt_u32 s7, 2
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%c0 = icmp ugt i32 %a, 1
%c1 = icmp ugt i32 %b, 2
Expand Down
Loading