diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 0b2e3fcfd76df..d81822ac75777 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -311,6 +311,14 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, } if (auto *CB = dyn_cast_or_null<CallBase>(UseInst)) { + if (auto *CU = dyn_cast_or_null<CallBase>(DefInst)) { + MemoryEffects CBME = CB->getMemoryEffects(); + MemoryEffects CUME = CU->getMemoryEffects(); + if (CBME.onlyAccessesInaccessibleMem() || + CUME.onlyAccessesInaccessibleMem()) + if ((CBME & CUME & MemoryEffects::writeOnly()).onlyReadsMemory()) + return false; + } ModRefInfo I = AA.getModRefInfo(DefInst, CB); return isModOrRefSet(I); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 8a53c862371cf..8291058858bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -885,7 +885,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -894,16 +894,17 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] ; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: 
v_mov_b32_e32 v0, s6 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -911,7 +912,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_execz .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_nop 1 @@ -920,10 +920,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_and_b64 s[6:7], s[4:5], vcc +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -931,8 +931,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.7: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return ; SI-NEXT: s_or_b64 exec, exec, s[2:3] @@ -951,7 +951,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; 
GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -960,16 +960,17 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 -; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -977,7 +978,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_cbranch_execz .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: s_nop 1 @@ -986,10 +986,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX9-NEXT: 
s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -997,8 +997,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.7: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1031,10 +1031,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-32-NEXT: s_mov_b32 s2, s0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -1042,17 +1043,16 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s2, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 
quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 +; GFX10-32-NEXT: s_and_b32 s3, s2, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s3, s3, -1 +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 +; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1060,8 +1060,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.7: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_wqm_b32 s3, s0 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_wqm_b32 s4, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -1094,29 +1094,29 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 -; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_andn2_b64 exec, 
exec, s[4:5] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX10-64-NEXT: s_and_b64 s[6:7], s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1124,11 +1124,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.7: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 
c98feeb96232d..499d257cf38d4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -887,7 +887,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -901,31 +901,31 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_add_i32 s6, s6, 1 -; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_add_i32 s10, s10, 1 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, s10, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; SI-NEXT: s_nop 0 +; SI-NEXT: s_nop 1 ; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: 
s_and_saveexec_b64 s[8:9], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; SI-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; SI-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] +; SI-NEXT: s_xor_b64 s[8:9], exec, s[12:13] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -933,8 +933,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.7: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_wqm_b64 s[12:13], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return ; SI-NEXT: s_or_b64 exec, exec, s[2:3] @@ -953,7 +953,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -967,31 +967,31 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_add_i32 s6, s6, 1 -; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_add_i32 s10, s10, 1 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s10, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; 
%.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GFX9-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] +; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[12:13] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -999,8 +999,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.7: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_wqm_b64 s[12:13], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1032,29 +1032,29 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: s_mov_b32 s4, 0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; 
%.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: s_add_i32 s2, s2, 1 -; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-32-NEXT: s_add_i32 s4, s4, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s4, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s3, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 -; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s4, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 s5, s3, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s6, s5 +; GFX10-32-NEXT: s_xor_b32 s5, exec_lo, s6 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1062,8 +1062,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.7: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_wqm_b32 s4, s0 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_wqm_b32 s6, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s6 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -1082,7 
+1082,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-64-NEXT: s_mov_b32 s6, 0 +; GFX10-64-NEXT: s_mov_b32 s10, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1095,29 +1095,29 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: s_add_i32 s6, s6, 1 -; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_add_i32 s10, s10, 1 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s10, v1 ; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s10, 0, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_or_b64 s[4:5], 
s[4:5], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GFX10-64-NEXT: s_or_b64 s[8:9], s[6:7], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] +; GFX10-64-NEXT: s_xor_b64 s[8:9], exec, s[12:13] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1125,8 +1125,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.7: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_wqm_b64 s[12:13], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] diff --git a/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll b/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll new file mode 100644 index 0000000000000..8b2ac7d8fdaa6 --- /dev/null +++ b/llvm/test/Transforms/LICM/hoist-inaccesiblemem-call.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<memoryssa>,loop-mssa(licm)' < %s -S | FileCheck %s + +define void @inaccessible_hoist(ptr noalias %loc, ptr noalias %loc2){ +; CHECK-LABEL: define void @inaccessible_hoist( +; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[LOC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LOC2]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[LOC]], align 4 +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: call void @fn_read_inaccessible_mem() +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*:]] +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_BODY]] +; 
+entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: + %val = load i32, ptr %loc2 + store i32 %val, ptr %loc + call void @fn_write_inaccessible_mem() + call void @fn_read_inaccessible_mem() + br label %for.body +} + + +define void @neg_inaccessible_hoist(ptr noalias %loc, ptr noalias %loc2){ +; CHECK-LABEL: define void @neg_inaccessible_hoist( +; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[LOC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LOC2]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[LOC]], align 4 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: call void @fn_read_inaccessible_mem() +; CHECK-NEXT: call void @fn_readwrite_inaccessible_mem() +; CHECK-NEXT: br label %[[FOR_BODY]] +; +entry: + br label %for.body +for.body: + %val = load i32, ptr %loc2 + store i32 %val, ptr %loc + call void @fn_write_inaccessible_mem() + call void @fn_read_inaccessible_mem() + call void @fn_readwrite_inaccessible_mem() + br label %for.body +} + + +; Nothing should be hoisted from the loop because volatile +; sets inaccessible memory to read write +define void @neg_volatile(ptr %loc, ptr %loc2) { +; CHECK-LABEL: define void @neg_volatile( +; CHECK-SAME: ptr [[LOC:%.*]], ptr [[LOC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: store volatile i32 0, ptr [[LOC]], align 4 +; CHECK-NEXT: call void @fn_write_inaccessible_mem() +; CHECK-NEXT: call void @fn_read_inaccessible_mem() +; CHECK-NEXT: br label %[[LOOP]] +; +entry: + br label %loop + +loop: + %val = load i32, ptr %loc2 + store volatile i32 0, ptr %loc + call void @fn_write_inaccessible_mem() + call void @fn_read_inaccessible_mem() + br label %loop +} + +declare void @fn_write_inaccessible_mem()#0 + memory(inaccessiblemem: write) + +declare void @fn_read_inaccessible_mem()#0 + 
memory(inaccessiblemem: read) + +declare void @fn_readwrite_inaccessible_mem()#0 + memory(inaccessiblemem: readwrite) + +; Needs to set nounwind because of doesNotThrow +attributes #0 = { mustprogress nofree norecurse nosync nounwind}