Expand Up
@@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
define amdgpu_kernel void @fnearbyint_f16 (ptr addrspace (1 ) %out , half %in ) #1 {
; SI-LABEL: fnearbyint_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1 ], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x9
; SI-NEXT: s_load_dword s4, s[2:3 ], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All
@@ -28,23 +28,24 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
;
; CI-LABEL: fnearbyint_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_load_dword s0, s[2:3], 0xb
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: v_rndne_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: fnearbyint_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2 , s[0:1 ], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x24
; VI-NEXT: s_load_dword s4 , s[2:3 ], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f16_e32 v2, s2
; VI-NEXT: v_rndne_f16_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
Expand All
@@ -53,11 +54,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
; GFX11-LABEL: fnearbyint_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2 , s[0:1 ], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b32 s4 , s[2:3 ], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v1, s2
; GFX11-NEXT: v_rndne_f16_e32 v1, s4
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All
@@ -70,8 +71,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
define amdgpu_kernel void @fnearbyint_f32 (ptr addrspace (1 ) %out , float %in ) #1 {
; SICI-LABEL: fnearbyint_f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dword s4, s[0:1 ], 0xb
; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x9
; SICI-NEXT: s_load_dword s4, s[2:3 ], 0xb
; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x9
; SICI-NEXT: s_mov_b32 s3, 0xf000
; SICI-NEXT: s_mov_b32 s2, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All
@@ -81,10 +82,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
;
; VI-LABEL: fnearbyint_f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2 , s[0:1 ], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x24
; VI-NEXT: s_load_dword s4 , s[2:3 ], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v2, s2
; VI-NEXT: v_rndne_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
Expand All
@@ -93,11 +94,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
; GFX11-LABEL: fnearbyint_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2 , s[0:1 ], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b32 s4 , s[2:3 ], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s2
; GFX11-NEXT: v_rndne_f32_e32 v1, s4
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All
@@ -111,7 +112,7 @@ entry:
define amdgpu_kernel void @fnearbyint_v2f32 (ptr addrspace (1 ) %out , <2 x float > %in ) #1 {
; SICI-LABEL: fnearbyint_v2f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1 ], 0x9
; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3 ], 0x9
; SICI-NEXT: s_mov_b32 s7, 0xf000
; SICI-NEXT: s_mov_b32 s6, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All
@@ -124,7 +125,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fnearbyint_v2f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1 ], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rndne_f32_e32 v1, s3
Expand All
@@ -135,7 +136,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; GFX11-LABEL: fnearbyint_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s3
Expand All
@@ -153,8 +154,8 @@ entry:
define amdgpu_kernel void @fnearbyint_v4f32 (ptr addrspace (1 ) %out , <4 x float > %in ) #1 {
; SICI-LABEL: fnearbyint_v4f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1 ], 0xd
; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x9
; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3 ], 0xd
; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x9
; SICI-NEXT: s_mov_b32 s3, 0xf000
; SICI-NEXT: s_mov_b32 s2, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All
@@ -167,8 +168,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fnearbyint_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1 ], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3 ], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v5, s1
Expand All
@@ -182,8 +183,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; GFX11-LABEL: fnearbyint_v4f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1 ], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3 ], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v3, s7
Expand All
@@ -203,7 +204,7 @@ entry:
define amdgpu_kernel void @nearbyint_f64 (ptr addrspace (1 ) %out , double %in ) {
; SI-LABEL: nearbyint_f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1 ], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3 ], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_brev_b32 s8, -2
Expand All
@@ -227,7 +228,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; CI-LABEL: nearbyint_f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1 ], 0x9
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3 ], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
; CI-NEXT: s_mov_b32 s3, 0xf000
Expand All
@@ -237,7 +238,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: nearbyint_f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1 ], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s0
Expand All
@@ -247,7 +248,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; GFX11-LABEL: nearbyint_f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
Expand All
@@ -263,41 +264,41 @@ entry:
define amdgpu_kernel void @nearbyint_v2f64 (ptr addrspace (1 ) %out , <2 x double > %in ) {
; SI-LABEL: nearbyint_v2f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5 ], s[0:1 ], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3 ], s[0:1 ], 0xd
; SI-NEXT: s_mov_b32 s7 , 0xf000
; SI-NEXT: s_mov_b32 s6 , -1
; SI-NEXT: s_load_dwordx2 s[0:1 ], s[2:3 ], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7 ], s[2:3 ], 0xd
; SI-NEXT: s_mov_b32 s3 , 0xf000
; SI-NEXT: s_mov_b32 s2 , -1
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
; SI-NEXT: s_mov_b32 s9, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, s3
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v10, s0
; SI-NEXT: v_add_f64 v[2:3], s[2:3 ], v[0:1]
; SI-NEXT: v_mov_b32_e32 v8, s6
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s4
; SI-NEXT: v_add_f64 v[2:3], s[6:7 ], v[0:1]
; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3 ]|, v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7 ]|, v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; SI-NEXT: v_add_f64 v[6:7], s[0:1 ], v[0:1]
; SI-NEXT: v_add_f64 v[6:7], s[4:5 ], v[0:1]
; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1 ]|, v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5 ]|, v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7 ], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3 ], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: nearbyint_v2f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1 ], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x9
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3 ], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
Expand All
@@ -308,8 +309,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
;
; VI-LABEL: nearbyint_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1 ], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3 ], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
Expand All
@@ -321,8 +322,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; GFX11-LABEL: nearbyint_v2f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1 ], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3 ], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
Expand All
@@ -340,8 +341,8 @@ entry:
define amdgpu_kernel void @nearbyint_v4f64 (ptr addrspace (1 ) %out , <4 x double > %in ) {
; SI-LABEL: nearbyint_v4f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1 ], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1 ], 0x11
; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3 ], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3 ], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_brev_b32 s14, -2
Expand Down
Expand Up
@@ -390,8 +391,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
;
; CI-LABEL: nearbyint_v4f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1 ], 0x11
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x9
; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3 ], 0x11
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
Expand All
@@ -405,8 +406,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
;
; VI-LABEL: nearbyint_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1 ], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1 ], 0x24
; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3 ], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3 ], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9]
Expand All
@@ -425,8 +426,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11-LABEL: nearbyint_v4f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1 ], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1 ], 0x24
; GFX11-NEXT: s_load_b256 s[4:11], s[2:3 ], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3 ], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
Expand Down