diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index efc443c5cb1399..fcc3d6ec736975 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s ; IEEE bit enabled for compute kernel, so shouldn't use. define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { @@ -38,6 +39,20 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; VI-NEXT: v_mul_f32_e32 v2, 0.5, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -84,6 +99,20 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid @@ -130,6 +159,20 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; VI-NEXT: v_mul_f32_e32 v2, 0.5, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -176,6 +219,20 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid @@ -203,6 +260,15 @@ define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 { ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f32_signed_zeros: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 store float %div2, ptr addrspace(1) undef @@ -226,6 +292,15 @@ define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 { ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f64_signed_zeros: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd double %a, 1.0 %div2 = fmul double %add, 0.5 store double %div2, ptr addrspace(1) undef @@ -246,6 +321,13 @@ define amdgpu_ps void @v_omod_div2_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 store float %div2, ptr addrspace(1) undef @@ -266,6 +348,13 @@ define amdgpu_ps void @v_omod_div2_f64(double %a) #5 { ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 %div2 = fmul nsz double %add, 0.5 store double %div2, ptr addrspace(1) undef @@ -286,6 +375,13 @@ define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 2.0 store float %div2, ptr addrspace(1) undef @@ -306,6 +402,13 @@ define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 { ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul2_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 %div2 = fmul nsz double %add, 2.0 store double %div2, ptr addrspace(1) undef @@ -326,6 +429,13 @@ define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul4_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 4.0 store float %div2, ptr addrspace(1) undef @@ -346,6 +456,13 @@ define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 { ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul4_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 %div2 = fmul nsz double %add, 4.0 store double %div2, ptr addrspace(1) undef @@ -372,6 +489,18 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 { ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul4_multi_use_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, 4.0, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 4.0 store float %div2, ptr addrspace(1) undef @@ -393,6 +522,13 @@ define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul4_dbg_use_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10 %div2 = fmul float %add, 4.0 @@ -415,6 +551,13 @@ define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_clamp_omod_div2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 @@ -441,6 +584,15 @@ define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 { ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_clamp_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %max = call float @llvm.maxnum.f32(float %add, float 0.0) %clamp = call float @llvm.minnum.f32(float %max, float 1.0) @@ -465,6 +617,15 @@ define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 { ; VI-NEXT: v_mul_f32_e64 v0, |v0|, 0.5 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_abs_src_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, 0.5 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %abs.add = call float @llvm.fabs.f32(float %add) %div2 = fmul float %abs.add, 0.5 @@ -486,6 +647,13 @@ define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, v0 clamp ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_add_self_clamp_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, v0 clamp +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, %a %max = call float @llvm.maxnum.f32(float %add, float 0.0) %clamp = call float @llvm.minnum.f32(float %max, float 1.0) @@ -509,6 +677,15 @@ define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e32 v0, v0, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_add_clamp_self_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_max_f32_e64 v0, v0, v0 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %max = call float @llvm.maxnum.f32(float %a, float 0.0) %clamp = call float @llvm.minnum.f32(float %max, float 1.0) %add = fadd float %clamp, %clamp @@ -532,6 +709,15 @@ define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, |v0|, |v0| ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_add_abs_self_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %x = fadd float %a, 1.0 %abs.x = call float @llvm.fabs.f32(float %x) %add = fadd float %abs.x, %abs.x @@ -555,6 +741,15 @@ define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, |v0|, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_add_abs_x_x_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e64 v0, |v0|, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %x = fadd float %a, 1.0 %abs.x = call float @llvm.fabs.f32(float %x) %add = fadd float %abs.x, %x @@ -578,6 +773,15 @@ define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 { ; VI-NEXT: v_add_f32_e64 v0, v0, |v0| ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_add_x_abs_x_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e64 v0, v0, |v0| +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %x = fadd float %a, 1.0 %abs.x = call float @llvm.fabs.f32(float %x) %add = fadd float %x, %abs.x @@ -602,6 +806,15 @@ define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 { ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_omod_div2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2.0 = fmul float %add, 0.5 %div2.1 = fmul float %div2.0, 0.5 @@ -626,6 +839,15 @@ define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 { ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f32_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 store float %div2, ptr addrspace(1) undef @@ -649,6 +871,15 @@ define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 { ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f64_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd double %a, 1.0 %div2 = fmul double %add, 0.5 store double %div2, ptr addrspace(1) undef @@ -672,6 +903,15 @@ define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 { ; VI-NEXT: v_add_f32_e32 v0, v0, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul2_f32_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 %mul2 = fadd float %add, %add store float %mul2, ptr addrspace(1) undef @@ -695,6 +935,15 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 { ; VI-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul2_f64_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd double %a, 1.0 %mul2 = fadd double %add, %add store double %mul2, ptr addrspace(1) undef @@ -720,6 +969,15 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { ; VI-NEXT: v_mul_f16_e32 v0, 0.5, v0 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f16_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd half %a, 1.0 %div2 = fmul half %add, 0.5 store half %div2, ptr addrspace(1) undef @@ -745,6 +1003,15 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { ; VI-NEXT: v_add_f16_e32 v0, v0, v0 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mul2_f16_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd half %a, 1.0 %mul2 = fadd half %add, %add store half %mul2, ptr addrspace(1) undef @@ -768,6 +1035,13 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { ; VI-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_div2_f16_no_denormals: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd half %a, 1.0 %div2 = fmul half %add, 0.5 store half %div2, ptr addrspace(1) undef @@ -790,6 +1064,16 @@ define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 { ; VI-NEXT: v_mul_f32_e32 v0, v1, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_omod_mac_to_mad: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e64 v1, v1, v0 mul:2 +; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %mul = fmul float %a, %a %add = fadd float %mul, %b %mad = fmul float %add, 2.0